# Generating data frames with varying granularity 

Given the "relative" sparsity of the data, it's important to increase statistical power by collapsing time dimensions.

Collapsing time will be separated into "day of the week" and "time within the day".

Collapsing time dimensions does run the risk of losing time-related patterns.

Thus, the goal of this notebook is to generate a variety of combinations of collapsed time dimensions and to test the predictive performance of a variety of ML methods.


In [2]:
import pandas as pd 

### Backbone of the ML algorithm --> The ticket inspection data set

In [4]:
# Load the ticket-inspection records and rename the stop-code column to
# "Stop_ID", the key name used by the other tables in this notebook.
Inspections = pd.read_csv(
    "C:/users/henry chapman/Documents/Coding/Data_science/Project_final/Output/1_Compiling_data/Pipe2/Inspections.csv"
).rename(columns={"Stop_code": "Stop_ID"})
Inspections.head()

Unnamed: 0,Date,Start_Dtime,End_Dtime,Stop_ID,Line,Repeat_bus,Visit_number,Unvalidated,Pre_2019
0,2016-02-12,2016-02-12 13:16:51,2016-02-12 13:17:17,7820064-1,1,0.0,1,0,True
1,2016-02-12,2016-02-12 13:33:31,2016-02-12 13:34:12,7820064-1,1,1.0,1,1,True
2,2016-02-12,2016-02-12 13:50:55,2016-02-12 13:51:11,7820064-1,15,0.0,1,1,True
3,2016-02-12,2016-02-12 13:12:26,2016-02-12 13:14:13,7820064-1,20,0.0,1,14,True
4,2016-02-12,2016-02-12 13:19:15,2016-02-12 13:20:15,7820064-1,26,0.0,1,9,True


### Additional static metric_1 --> The bus stop visiting score from 1_Compiling_data/Pipe5

In [8]:
# Load the per-stop visiting scores and discard rows that are exact duplicates.
Stop_visit_score = pd.read_csv(
    "C:/users/henry chapman/Documents/Coding/Data_science/Project_final/Output/1_Compiling_data/Pipe5/Stop_visit_score.csv"
).drop_duplicates()
Stop_visit_score.head()

Unnamed: 0,Stop_ID,Pre_2019,Visiting_score,Rel_Visiting_score
0,144360,True,1,0.03125
1,144382,False,1,0.047619
2,144384,False,1,0.047619
3,144466,True,1,0.03125
4,7800070-1,False,1,0.047619


### Additional static metric_2 --> The bus stop density and NIMI (area name) mapping from 1_Compiling_data/Pipe7

In [9]:
# Load the per-stop density / NIMI lookup table.
Mapping_and_density = pd.read_csv(
    filepath_or_buffer="C:/users/henry chapman/Documents/Coding/Data_science/Project_final/Output/1_Compiling_data/Pipe7/Mapping_and_density.csv"
)
Mapping_and_density.head(n=5)

Unnamed: 0,Stop_ID,Pre_2019,density,NIMI
0,7820277-1,True,5,Ränilinna
1,7820088-1,True,12,Kesklinna
2,7820165-1,True,1,Annelinna
3,7820155-1,True,1,Annelinna
4,7820014-1,True,3,Annelinna


## Merging metric_1 and metric_2 with Inspections

In [12]:
# Attach the visiting scores to every inspection row; inspections without a
# matching (Stop_ID, Pre_2019) pair are kept (left join).
Merge_1 = pd.merge(
    Inspections,
    Stop_visit_score,
    how="left",
    on=["Stop_ID", "Pre_2019"],
)

In [13]:
# Attach density and NIMI to every inspection row; inspections without a
# matching (Stop_ID, Pre_2019) pair are kept (left join).
Merge_2 = pd.merge(
    Merge_1,
    Mapping_and_density,
    how="left",
    on=["Stop_ID", "Pre_2019"],
)

In [15]:
# Preview the first five rows of the merged frame.
Merge_2.head(n=5)

Unnamed: 0,Date,Start_Dtime,End_Dtime,Stop_ID,Line,Repeat_bus,Visit_number,Unvalidated,Pre_2019,Visiting_score,Rel_Visiting_score,density,NIMI
0,2016-02-12,2016-02-12 13:16:51,2016-02-12 13:17:17,7820064-1,1,0.0,1,0,True,14,0.4375,2.0,Annelinna
1,2016-02-12,2016-02-12 13:33:31,2016-02-12 13:34:12,7820064-1,1,1.0,1,1,True,14,0.4375,2.0,Annelinna
2,2016-02-12,2016-02-12 13:50:55,2016-02-12 13:51:11,7820064-1,15,0.0,1,1,True,14,0.4375,2.0,Annelinna
3,2016-02-12,2016-02-12 13:12:26,2016-02-12 13:14:13,7820064-1,20,0.0,1,14,True,14,0.4375,2.0,Annelinna
4,2016-02-12,2016-02-12 13:19:15,2016-02-12 13:20:15,7820064-1,26,0.0,1,9,True,14,0.4375,2.0,Annelinna


In [14]:
# Row count of the merged frame.
len(Merge_2.index)

60798

In [16]:
# A few bus stops have no density value because their location is missing from
# the GTFS database (due to the 2019 upheaval).
# Those gaps are filled with the mean density of the stops that do have one.
mean_density = Merge_2["density"].mean()
Merge_2["density"] = Merge_2["density"].fillna(value=mean_density)

### Formatting columns

In [17]:
# Show the column dtypes as loaded from CSV.
Merge_2.dtypes

Date                   object
Start_Dtime            object
End_Dtime              object
Stop_ID                object
Line                   object
Repeat_bus            float64
Visit_number            int64
Unvalidated             int64
Pre_2019                 bool
Visiting_score          int64
Rel_Visiting_score    float64
density               float64
NIMI                   object
dtype: object

In [18]:
# Parse the three timestamp columns, then narrow the numeric/boolean columns
# in a single astype call.
# NOTE(review): casting "density" to int16 drops the fractional part of any
# mean-imputed values -- confirm that truncation (rather than rounding) is intended.
Merge_2 = Merge_2.assign(
    Date=pd.to_datetime(Merge_2["Date"], format="%Y-%m-%d"),
    Start_Dtime=pd.to_datetime(Merge_2["Start_Dtime"]),
    End_Dtime=pd.to_datetime(Merge_2["End_Dtime"]),
).astype(
    {
        "Repeat_bus": "int16",
        "Visit_number": "int16",
        "Unvalidated": "int16",
        "density": "int16",
        "Pre_2019": "boolean",
        "Visiting_score": "int16",
    }
)

In [19]:
# Show the column dtypes again to check the conversions.
Merge_2.dtypes

Date                  datetime64[ns]
Start_Dtime           datetime64[ns]
End_Dtime             datetime64[ns]
Stop_ID                       object
Line                          object
Repeat_bus                     int16
Visit_number                   int16
Unvalidated                    int16
Pre_2019                     boolean
Visiting_score                 int16
Rel_Visiting_score           float64
density                        int16
NIMI                          object
dtype: object

### Merging with ticket validations

This metric is the one which is dependent on the level of granularity chosen.

Thus, the function iterates through different granularities for day of week and time, and generates and exports a data frame for each combination.

The metric which is actually extracted is: the mean number of ticket validations that occur at that bus stop within the "Binned_time" and "Day_of_week"
- i.e., if Day_of_week is set to "Collapseall" and "Binned_time" is set to 60 minutes, then, for each bus stop, it will calculate the mean number of people who get on from that bus stop 30 minutes before and after the nearest hour

In [20]:
# Load the raw ticket validations, parsing "DateTime" as timestamps on read.
Ticket_validations = pd.read_csv(
    "C:/users/henry chapman/Documents/Coding/Data_science/Project_final/Output/1_Compiling_data/Pipe4/Ticket_validations.csv",
    parse_dates=["DateTime"],
)
Ticket_validations.head(n=5)

  Ticket_validations = pd.read_csv("C:/users/henry chapman/Documents/Coding/Data_science/Project_final/Output/1_Compiling_data/Pipe4/Ticket_validations.csv", parse_dates = ["DateTime"])


Unnamed: 0,DateTime,Line_clean,Stop_ID,Pre_2019,Validation_Count
0,2016-01-01 10:01:00,1.0,7820277-1,True,1.0
1,2016-01-01 10:12:00,1.0,7820088-1,True,3.0
2,2016-01-01 10:01:00,1.0,7820165-1,True,2.0
3,2016-01-01 10:02:00,1.0,7820155-1,True,1.0
4,2016-01-01 10:04:00,1.0,7820014-1,True,3.0


In [21]:
# Work on a deep copy so that columns added to Ticket_vals do not touch the loaded frame.
Ticket_vals = Ticket_validations.copy(deep=True)

In [22]:

def _bin_day(timestamps, Date_gran):
    """Return the day-bin value(s) for *timestamps* under the given day granularity."""
    if Date_gran == "Collapseall":
        # Every row shares a single day bin.
        return 0
    weekday = timestamps.dt.weekday
    if Date_gran == "Weekendsplit":
        # 0 for Monday-Friday, 1 otherwise. Written as "not (< 5)" so that a
        # missing timestamp lands in bin 1, exactly as a plain `x < 5` test would.
        return (~weekday.lt(5)).astype("int64")
    return weekday


def DataFrame_Generator(
    Time_gran,
    Date_gran,
    name,
    output_dir="c:/users/henry chapman/Documents/Coding/Data_science/Project_final/Output/2_Granularity_tuning/Pipe1",
):
    """Bin inspections and ticket validations, aggregate per bin and export a CSV.

    The global frames ``Ticket_vals`` and ``Merge_2`` are modified in place:
    their ``Binned_time`` and ``Binned_day`` columns are (re)written on every call.

    Parameters
    ----------
    Time_gran : str
        Frequency string passed to ``Series.dt.round`` (e.g. ``"60min"``).
    Date_gran : str
        One of ``"Dayofweek"`` (weekday number), ``"Weekendsplit"``
        (0 = weekday, 1 = weekend) or ``"Collapseall"`` (single bin 0).
    name : str
        File name of the exported CSV.
    output_dir : str, optional
        Directory the CSV is written to. Defaults to the project's
        ``2_Granularity_tuning/Pipe1`` output folder.

    Raises
    ------
    ValueError
        If ``Date_gran`` is not one of the three supported values. The check
        runs before any column is written, so a typo can never silently reuse
        the ``Binned_day`` column left behind by a previous call.

    Notes
    -----
    NOTE(review): ``dt.round`` keeps the calendar date, so ``Binned_time``
    identifies a specific date-time slot rather than a time of day. Confirm
    this matches the intended averaging across days.

    ``n_rows`` counts merged rows with a non-null ``Unvalidated`` value, so an
    inspection matched by several validation rows is counted once per match.
    """
    valid_date_grans = ("Dayofweek", "Weekendsplit", "Collapseall")
    if Date_gran not in valid_date_grans:
        raise ValueError(
            f"Unknown Date_gran {Date_gran!r}; expected one of {valid_date_grans}"
        )

    # Bin into time granularity
    Ticket_vals["Binned_time"] = Ticket_vals["DateTime"].dt.round(Time_gran)
    Merge_2["Binned_time"] = Merge_2["Start_Dtime"].dt.round(Time_gran)

    # Bin into desired days
    Ticket_vals["Binned_day"] = _bin_day(Ticket_vals["DateTime"], Date_gran)
    Merge_2["Binned_day"] = _bin_day(Merge_2["Start_Dtime"], Date_gran)

    bin_keys = ["Stop_ID", "Binned_time", "Binned_day", "Pre_2019"]

    # Merge raw ticket validations with inspection dataset; inspections with
    # no validations in their bin are kept (left join).
    merged = Merge_2.merge(Ticket_vals, on=bin_keys, how="left")

    # Aggregate to compute mean validations and other metrics
    Grouped = (
        merged.groupby(bin_keys)
        .agg(
            Validation_Count_mean=("Validation_Count", "mean"),
            Unvalidated_mean=("Unvalidated", "mean"),
            NIMI=("NIMI", "first"),
            Visiting_score=("Visiting_score", "first"),
            Rel_Visiting_score=("Rel_Visiting_score", "first"),
            density=("density", "first"),
            n_rows=("Unvalidated", "count"),
        )
        .reset_index()
    )

    # Export as CSV
    Grouped.to_csv(f"{output_dir}/{name}", index=False)



        

In [23]:

# Candidate bin widths for time-of-day and the three day-grouping schemes.
Time_granularity = ["30min", "60min", "120min", "180min", "360min", "720min"]
Day_granularity = ["Dayofweek", "Weekendsplit", "Collapseall"]

# Export one CSV per (time, day) combination, time-major order, and echo each
# file name once it has been written.
combos = [(time, day) for time in Time_granularity for day in Day_granularity]
for time, day in combos:
    name = f"Combo_{time}_{day}.csv"
    DataFrame_Generator(time, day, name)
    print(name)


Combo_30min_Dayofweek.csv
Combo_30min_Weekendsplit.csv
Combo_30min_Collapseall.csv
Combo_60min_Dayofweek.csv
Combo_60min_Weekendsplit.csv
Combo_60min_Collapseall.csv
Combo_120min_Dayofweek.csv
Combo_120min_Weekendsplit.csv
Combo_120min_Collapseall.csv
Combo_180min_Dayofweek.csv
Combo_180min_Weekendsplit.csv
Combo_180min_Collapseall.csv
Combo_360min_Dayofweek.csv
Combo_360min_Weekendsplit.csv
Combo_360min_Collapseall.csv
Combo_720min_Dayofweek.csv
Combo_720min_Weekendsplit.csv
Combo_720min_Collapseall.csv
