In [2]:
!pip install chinesecalendar

Collecting chinesecalendar
  Obtaining dependency information for chinesecalendar from https://files.pythonhosted.org/packages/b0/9e/70893ed9c1c1e237d5fdc6d54211e6d1c86d82be429d663f059ba9658b56/chinesecalendar-1.9.0-py2.py3-none-any.whl.metadata
  Downloading chinesecalendar-1.9.0-py2.py3-none-any.whl.metadata (3.0 kB)
Downloading chinesecalendar-1.9.0-py2.py3-none-any.whl (11 kB)
Installing collected packages: chinesecalendar
Successfully installed chinesecalendar-1.9.0


In [3]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import MultipleLocator
import csv
from chinese_calendar import is_holiday
import datetime

In [4]:
# Static attributes of each link, keyed by the link ID exactly as it appears
# in the file (a string): links[ID] == [length, width].
links = {}
with open("../datasets/gy_link_info.txt") as csvfile:
    csv_reader = csv.reader(csvfile, delimiter=";")
    # The first row holds the column names; keep it aside and move on.
    header = next(csv_reader)
    for row in csv_reader:
        ID = row[0]
        # Columns 1 and 2 are parsed as integers: length first, then width.
        links[ID] = [int(row[1]), int(row[2])]

In [5]:
# Link topology: one [ID, in_links, out_links] entry per row of the file.
data = []
with open("../datasets/gy_link_top.txt") as csvfile:
    csv_reader = csv.reader(csvfile, delimiter=";")
    # The first row holds the column names; keep it aside and move on.
    header = next(csv_reader)
    for row in csv_reader:
        ID = row[0]
        # Neighbour IDs are "#"-separated.  An empty field becomes a sentinel.
        # NOTE(review): the sentinel is -1 for in-links but 0 for out-links;
        # confirm that this difference is intended.
        in_links = [int(link) if link != "" else -1 for link in row[1].split("#")]
        out_links = [int(link) if link != "" else 0 for link in row[2].split("#")]
        data.append([ID, in_links, out_links])
# MAP[ID] == [in_links, out_links]
MAP = {record[0]: record[1:] for record in data}

In [6]:
def load_data(filename):
    """Parse a semicolon-delimited travel-time file into flat records.

    The first line of the file is treated as a header and skipped.  Every
    other non-blank row is expected to hold, in order: a link ID, a
    ``YYYY-MM-DD`` date, a ``start,end`` time interval whose start looks like
    ``[YYYY-MM-DD HH:MM:SS``, and a travel time.

    Args:
        filename: Path of the file to read.

    Returns:
        A list of ``[ID, year, month, day, minute, travel_time]`` lists.
        ``ID`` stays a string, ``minute`` is ``hour * 60 + minute + 1`` taken
        from the interval start, and ``travel_time`` is a float.  A file with
        no data rows (including a completely empty file) yields ``[]``.

    Raises:
        ValueError: if a numeric field cannot be parsed or the interval does
            not split into exactly two parts on ",".
        IndexError: if a non-blank row has fewer than four fields.
    """
    data = []
    # newline="" lets the csv module do its own line-ending handling, as its
    # documentation recommends.
    with open(filename, newline="") as csvfile:
        csv_reader = csv.reader(csvfile, delimiter=";")
        # Skip the header through the reader (like the other loaders in this
        # notebook); the default keeps an empty file from raising StopIteration.
        next(csv_reader, None)
        for row in csv_reader:
            if not row:
                # csv.reader yields [] for blank lines, e.g. a trailing newline.
                continue
            ID = row[0]
            date = [int(x) for x in row[1].split("-")]
            # Only the start of the interval is used; the unpacking still
            # insists on exactly two comma-separated parts.
            start, _end = row[2].split(",")
            # start[12:] drops the 12-character "[YYYY-MM-DD " prefix and
            # leaves "HH:MM:SS".
            time_interval = [int(i) for i in start[12:].split(":")]
            travel_time = float(row[3])
            minute = time_interval[0] * 60 + time_interval[1] + 1
            data.append([ID, date[0], date[1], date[2], minute, travel_time])
    return data

In [7]:
# Parse the part-1 travel-time file into a flat list of
# [ID, year, month, day, minute, travel_time] records (see load_data).
row_data = load_data("../datasets/gy_link_travel_time_part1.txt")

In [8]:
def sort_samples_by_link(data):
    """Group records by their first element (the link ID).

    Despite the name, nothing is sorted here: records keep the order in which
    they appear in ``data``.

    Args:
        data: Iterable of records whose element 0 is the grouping key.

    Returns:
        dict mapping each key to the list of its records with the key removed
        (``record[1:]``), in input order.
    """
    grouped = {}
    for sample in data:
        # setdefault creates the bucket the first time a link ID is seen.
        grouped.setdefault(sample[0], []).append(sample[1:])
    return grouped

In [9]:
# Bucket the parsed records by link ID; the ID is dropped from each record.
SortedRowDataByLink = sort_samples_by_link(row_data)

In [10]:
def sort_samples_by_time(data):
    """Return each link's records ordered chronologically.

    Records are compared on ``(year, month, day, minute)``, i.e. elements
    0..3 of each record.  A tuple key is used instead of a weighted sum
    because a sum with fixed weights is not monotonic across month/year
    boundaries (with the previous weights 2016-12-05 sorted after
    2017-01-01).

    Args:
        data: Mapping of link ID -> list of records whose first four elements
            are year, month, day and minute.

    Returns:
        A new dict with the same keys; each value is a new sorted list.  The
        input lists are not modified, and records that compare equal keep
        their original relative order.
    """
    return {
        link_id: sorted(data[link_id], key=lambda x: (x[0], x[1], x[2], x[3]))
        for link_id in data
    }

In [11]:
# Order every link's records using the year/month/day/minute key computed in
# sort_samples_by_time.
SortedRowDataByTime = sort_samples_by_time(SortedRowDataByLink)

In [12]:
def is_weekend(day):
    """Return True when ``day`` falls on a Saturday or Sunday.

    Args:
        day: An object with a ``weekday()`` method following the
            ``datetime.date`` convention (Monday is 0, Sunday is 6).

    Returns:
        bool: True for Saturday (5) and Sunday (6), False otherwise.
    """
    # weekday() counts from Monday = 0, so the weekend is 5 and 6, not 4
    # (Friday) and 5.
    return day.weekday() >= 5

In [13]:
def feature_expand(data):
    """Turn each link's records into a 75-column feature matrix.

    Column layout of every row:
        0        year - 2016
        1-5      one-hot month; month ``m`` sets column ``m - 2``, so only
                 months 3..7 fit in this range
        6-12     one-hot weekday (``date.weekday()``, Monday -> column 6)
        13-43    one-hot day of month
        44-67    one-hot hour, computed as ``minute // 60``
        68       the minute value itself (element 3 of the record)
        69       1 if ``is_holiday`` reports the date as a holiday, else 0
        70       1 if ``is_weekend`` reports the date as a weekend, else 0
        71       link width  (``links[ID][1]``)
        72       link length (``links[ID][0]``)
        73       link length divided by travel time
        74       travel time (element 4 of the record)

    Link attributes are read from the module-level ``links`` dict.

    Args:
        data: Mapping of link ID -> sequence of
            ``[year, month, day, minute, travel_time]`` records.

    Returns:
        dict mapping each link ID to a float array of shape
        ``(number_of_records, 75)``.

    Raises:
        ValueError: if a record's month is outside 3..7, or its
            year/month/day do not form a valid date.
        KeyError: if a link ID is missing from ``links``.
        ZeroDivisionError: if a travel time is zero (for plain Python
            numbers, as produced by ``load_data``).
    """
    # Months outside this range would land on column 0 (year), on the weekday
    # columns, or wrap around to the last column via a negative index.
    first_month, last_month = 3, 7

    expanded_data = {}

    for link_id in data:
        records = data[link_id]
        features = np.zeros((len(records), 75))
        for i, record in enumerate(records):
            year, month, day_of_month = record[0], record[1], record[2]
            minute, travel_time = record[3], record[4]

            day = datetime.date(year, month, day_of_month)
            if not first_month <= month <= last_month:
                raise ValueError(
                    f"link {link_id}: month {month} is outside the supported "
                    f"range {first_month}..{last_month} of the month one-hot columns"
                )

            link_length = links[link_id][0]
            link_width = links[link_id][1]

            # year
            features[i, 0] = year - 2016

            # month
            features[i, month - 2] = 1

            # weekday
            features[i, 6 + day.weekday()] = 1

            # day of month
            features[i, 13 + day_of_month - 1] = 1

            # hour of day
            features[i, 44 + (minute // 60)] = 1

            # time_of_day
            features[i, 68] = minute

            # is_holiday
            features[i, 69] = int(is_holiday(day))

            # is_weekend
            features[i, 70] = int(is_weekend(day))

            # width
            features[i, 71] = link_width

            # length
            features[i, 72] = link_length

            # speed
            features[i, 73] = link_length / travel_time

            # travel_time
            features[i, 74] = travel_time

        expanded_data[link_id] = features

    return expanded_data

In [None]:
# Build one (number_of_records, 75) feature array per link from the
# time-ordered records.
ExpandedData = feature_expand(SortedRowDataByTime)

In [13]:
# Spot-check one link: the shape is (number of records, 75 feature columns).
ExpandedData['9377906285566510514'].shape

(63305, 75)

In [16]:
def save_expanded_data(data):
    """Write the feature rows of the first link in ``data`` to a CSV file.

    The output path is fixed: ``../datasets/expanded_data_header.csv``
    (relative to the current working directory); an existing file is
    overwritten.  Each CSV row is the link ID followed by that row's values.

    Only the first key of ``data`` (in iteration order) is written.  An empty
    ``data`` produces an empty file.

    Args:
        data: Mapping of link ID -> sequence of rows (each row an iterable of
            values), such as the output of ``feature_expand``.
    """
    with open("../datasets/expanded_data_header.csv", "w", newline="") as out_file:
        writer = csv.writer(out_file)
        for link_id, feature_rows in data.items():
            writer.writerows([link_id, *features] for features in feature_rows)
            # Stop after the first link, so the file holds one link's rows.
            # NOTE(review): confirm that a single-link sample is the intent.
            break

In [17]:
# Write to ../datasets/expanded_data_header.csv; only the first link's rows
# are saved because save_expanded_data breaks after one link.
save_expanded_data(ExpandedData)

## 08/02/2024

In [2]:
# NOTE(review): ExpandedData only exists after the cells above have been run
# in this kernel; the recorded NameError (which names SortedRowDataByTime)
# looks like it comes from a fresh-kernel run of a different cell - confirm.
# Evaluating the bare name renders the whole dict of arrays.
ExpandedData

NameError: name 'SortedRowDataByTime' is not defined