In [1]:
# Import the required packages
# hide ipykernel warnings- Taken from sample solution
import warnings
warnings.filterwarnings('ignore')

# Import package pandas for data analysis
import pandas as pd

# Import individual sklearn modules used in model building and analysis
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn import metrics


pd.set_option('max_columns', 30)

In [2]:
# Reading from a csv file, into a data frame
clean_data = pd.read_csv('../database_code/route_46A_leavetimes.csv')
clean_data.head(5)

Unnamed: 0,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID,DELAY,TIMEATSTOP,LINEID,DIRECTION,PLANNED_TRIP_DURATION,ACTUAL_TRIP_DURATION,YEAR,MONTH,DAY,HOUR,TEMP,FEELS_LIKE,TEMP_MIN,TEMP_MAX,PRESSURE,HUMIDITY,WIND_SPEED,WIND_DEG,CLOUDS_ALL,WEATHER_ID,WEATHER_MAIN,DAYOFWEEK
0,5970412,1,807,84600,84600,84442,84442,2868385,-158,0,46A,1,612,582.0,2018,1,1,23,4.18,-2.82,3.94,6.07,1008,75,7.2,260,40,802,Clouds,0
1,5970412,2,808,84644,84644,84617,84617,2868385,-27,0,46A,1,612,582.0,2018,1,1,23,4.18,-2.82,3.94,6.07,1008,75,7.2,260,40,802,Clouds,0
2,5970412,3,809,84670,84670,84631,84631,2868385,-39,0,46A,1,612,582.0,2018,1,1,23,4.18,-2.82,3.94,6.07,1008,75,7.2,260,40,802,Clouds,0
3,5970412,6,812,84754,84754,84676,84676,2868385,-78,0,46A,1,612,582.0,2018,1,1,23,4.18,-2.82,3.94,6.07,1008,75,7.2,260,40,802,Clouds,0
4,5970412,7,813,84776,84776,84691,84691,2868385,-85,0,46A,1,612,582.0,2018,1,1,23,4.18,-2.82,3.94,6.07,1008,75,7.2,260,40,802,Clouds,0


In [3]:
def clean_and_split(main_df,seed):
    """Function to Clean and Split the data
    
        This first cleans the data by getting the dummies for the required fields 
        and then splits the data up on the bases of trip ID for continuity."""
    # Get Dummies for whole table on spcific columns
    main_df_dummies = pd.get_dummies(main_df, columns=["STOPPOINTID", "DIRECTION", "MONTH", "HOUR", "WEATHER_MAIN", "DAYOFWEEK", "WEATHER_ID"], drop_first=True)
    # Get list of all unique Trip IDs
    full_list = list(main_df_dummies["TRIPID"].unique())
    # Split Trip Ids into 70:30 for Train:Test
    tripid_train, tripid_test= train_test_split(full_list, test_size=0.3, random_state=seed)
    # Seperate out the data
    ids_present = main_df_dummies['TRIPID'].isin(tripid_train)
    train_data = main_df_dummies.loc[ids_present]
    ids_present = main_df_dummies['TRIPID'].isin(tripid_test)
    test_data = main_df_dummies.loc[ids_present]
    train_data.reset_index(drop=True, inplace=True)
    test_data.reset_index(drop=True, inplace=True)
    # Get Train and Test Targets
    train_trgt = train_data["ACTUALTIME_ARR"]
    test_trgt = test_data["ACTUALTIME_ARR"]
    # Get Train and Test Features
    train_fetr = train_data.drop(columns=["ACTUALTIME_ARR", "TRIPID","PROGRNUMBER", "PLANNEDTIME_ARR", "VEHICLEID", "PLANNEDTIME_DEP", "ACTUALTIME_DEP", "DELAY", "TIMEATSTOP", "LINEID", "PLANNED_TRIP_DURATION", "ACTUAL_TRIP_DURATION", "YEAR", "DAY"])
    test_fetr = test_data.drop(columns=["ACTUALTIME_ARR", "TRIPID","PROGRNUMBER", "PLANNEDTIME_ARR", "VEHICLEID", "PLANNEDTIME_DEP", "ACTUALTIME_DEP", "DELAY", "TIMEATSTOP", "LINEID", "PLANNED_TRIP_DURATION", "ACTUAL_TRIP_DURATION", "YEAR", "DAY"])
    # Get Train and test Planned time (used for metrics)
    train_plan = train_data["PLANNEDTIME_ARR"]
    test_plan = test_data["PLANNEDTIME_ARR"]
    # Clear unneeded data to solve memory issues
    del main_df, main_df_dummies, train_data, test_data
    # Return all variables to function call
    return train_fetr, train_trgt, test_fetr, test_trgt, train_plan, test_plan

In [4]:
def test_model_outcome(predicted, actual, planned):
    """ Sort and Obtain Metrics in Dictionary format
        
        Function used to obtain the metrics of the model in dictionary format  """
    if not isinstance(predicted, pd.DataFrame):
        predicted = pd.DataFrame(predicted, columns= ["PREDICTED_ARR"])
    if not isinstance(actual, pd.DataFrame):
        actual = pd.DataFrame(actual, columns= ["ACTUALTIME_ARR"])
    if not isinstance(planned, pd.DataFrame):
        planned = pd.DataFrame(planned, columns= ["PLANNEDTIME_ARR"])
    # Initialise the combined dataframe 
    combined = pd.concat([predicted,actual,planned], axis=1)
    # Calculate the actual delay
    actual_delay = combined["PLANNEDTIME_ARR"] - combined["ACTUALTIME_ARR"]
    # Calculate the predicted delay
    predicted_delay = combined["PLANNEDTIME_ARR"] - combined["PREDICTED_ARR"]
    # Calculate the difference in delay
    delay_diff = actual_delay - predicted_delay
    # Combine the delays into a single dataframe
    combined_delay = pd.concat([pd.DataFrame(actual_delay, columns=['Actual_Delay']), pd.DataFrame(predicted_delay, columns=['Predicted_Delay']), pd.DataFrame(delay_diff, columns=['Difference_In_Delay'])], axis=1)
    # Obtain the index of the max and min values of the actual, predicted and difference delays
    actual_max_index = combined_delay["Actual_Delay"].argmax()
    actual_min_index = combined_delay["Actual_Delay"].argmin()
    predicted_max_index = combined_delay["Predicted_Delay"].argmax()
    predicted_min_index = combined_delay["Predicted_Delay"].argmin()
    delay_diff_max_index = combined_delay["Difference_In_Delay"].argmax()
    delay_diff_min_index = combined_delay["Difference_In_Delay"].argmin()
    # Get the Mean Absolute Error
    MAE = metrics.mean_absolute_error(combined["ACTUALTIME_ARR"], combined["PLANNEDTIME_ARR"])
    # Get the R2 Score
    R2 = metrics.r2_score(combined["ACTUALTIME_ARR"], combined["PLANNEDTIME_ARR"])
    # Get the Root Mean Squared Error
    RMSE = metrics.mean_squared_error(combined["ACTUALTIME_ARR"], combined["PLANNEDTIME_ARR"], squared=False)
    # Get the Median Absolute Error
    MEDAE = metrics.median_absolute_error(combined["ACTUALTIME_ARR"], combined["PLANNEDTIME_ARR"])
    # Get the Mean Squared Error Log Value
    MSLE = metrics.mean_squared_log_error(combined["ACTUALTIME_ARR"], combined["PLANNEDTIME_ARR"])
    # Build Dictionary
    pass_val = {"combined" : combined,
                "combined_delay" : combined_delay,
                "actual_max_index" : actual_max_index,
                "actual_min_index" : actual_min_index,
                "predicted_max_index" : predicted_max_index,
                "predicted_min_index" : predicted_min_index,
                "delay_diff_max_index" : predicted_max_index,
                "delay_diff_min_index" : predicted_min_index,
                "MAE":MAE,
                "R2":R2,
                "MEDAE":MEDAE,
                "RMSE":RMSE,
                "MSLE":MSLE}
    # Return Dictionary
    return pass_val
    

In [5]:
def print_metrics(metrics_dict):
    """A function to print all of the metrics obtained"""
    print("\n==================== Random Forest Model Data ======================")
    print("\n\nArrival Time Metrics")
    print(metrics_dict["combined"].sample(n=10, random_state=0))
    print("\n\nDelay Metrics")
    print(metrics_dict["combined_delay"].sample(n=10, random_state=0))
    print("\n\nActual Arrival Time Extremes")
    print("====MAX====")
    print(metrics_dict["combined"].iloc[metrics_dict["actual_max_index"]])
    print("Difference To Planned")
    print(metrics_dict["combined_delay"]["Actual_Delay"].iloc[metrics_dict["actual_max_index"]])
    print("Difference To Predicted (Actual - Predicted)")
    print(metrics_dict["combined_delay"]["Difference_In_Delay"].iloc[metrics_dict["actual_max_index"]])
    print("====MIN====")
    print(metrics_dict["combined"].iloc[metrics_dict["actual_min_index"]])
    print("Difference To Planned")
    print(metrics_dict["combined_delay"]["Actual_Delay"].iloc[metrics_dict["actual_min_index"]])
    print("Difference To Predicted (Actual - Predicted)")
    print(metrics_dict["combined_delay"]["Difference_In_Delay"].iloc[metrics_dict["actual_min_index"]])
    print("\n\nPredicted Arrival Time Extremes")
    print("====MAX====")
    print(metrics_dict["combined"].iloc[metrics_dict["predicted_max_index"]])
    print("Difference To Planned")
    print(metrics_dict["combined_delay"]["Predicted_Delay"].iloc[metrics_dict["predicted_max_index"]])
    print("Difference To Actual (Actual - Predicted)")
    print(metrics_dict["combined_delay"]["Difference_In_Delay"].iloc[metrics_dict["predicted_max_index"]])
    print("====MIN====")
    print(metrics_dict["combined"].iloc[metrics_dict["predicted_min_index"]])
    print("Difference To Planned")
    print(metrics_dict["combined_delay"]["Predicted_Delay"].iloc[metrics_dict["predicted_min_index"]])
    print("Difference To Actual (Actual - Predicted)")
    print(metrics_dict["combined_delay"]["Difference_In_Delay"].iloc[metrics_dict["predicted_min_index"]])
    print("\n\nDelay Difference Extremes (Actual - Predicted)")
    print("====MAX====")
    print(metrics_dict["combined_delay"].iloc[metrics_dict["delay_diff_max_index"]])
    print("====MODEL DATA====")
    print(metrics_dict["combined"].iloc[metrics_dict["delay_diff_max_index"]])
    print("\n====MIN====")
    print(metrics_dict["combined_delay"].iloc[metrics_dict["delay_diff_min_index"]])
    print("====MODEL DATA====")
    print(metrics_dict["combined"].iloc[metrics_dict["delay_diff_min_index"]])
    print("\n\n====METRICS====")
    print("Mean Absolute Error\t", metrics_dict["MAE"])
    print("R2\t\t\t", metrics_dict["R2"])
    print("Median Absolute Error\t", metrics_dict["MEDAE"])
    print("Root Mean Squared Error\t", metrics_dict["RMSE"])
    print("Mean Squared Log Error\t", metrics_dict["MSLE"])
    print("\n\n====Overall Mean====")
    print(metrics_dict["combined_delay"].mean(axis=0))

In [6]:
def metrics_builder(metrics_dict):
    """ Returns a Datafram of the metrics
    
        Built from the dictionary"""
    df = pd.DataFrame(metrics_dict["combined_delay"].mean(axis=0), columns=["Metrics"])
    df.loc["Max_Actual_Delay"] = metrics_dict["combined_delay"]["Actual_Delay"].loc[metrics_dict["actual_max_index"]]
    df.loc["Min_Actual_Delay"] = metrics_dict["combined_delay"]["Actual_Delay"].loc[metrics_dict["actual_min_index"]]
    df.loc["Max_Predicted_Delay"] = metrics_dict["combined_delay"]["Predicted_Delay"].loc[metrics_dict["predicted_max_index"]]
    df.loc["Min_Predicted_Delay"] = metrics_dict["combined_delay"]["Predicted_Delay"].loc[metrics_dict["predicted_min_index"]]
    df.loc["Mean_Absolute_Error"] = metrics_dict["MAE"]
    df.loc["R2"] = metrics_dict["R2"]
    df.loc["Median_Absolute_Error"] = metrics_dict["MEDAE"]
    df.loc["Root_Mean_Squared_Error"] = metrics_dict["RMSE"]
    df.loc["Mean_Squared_Log_Error"] = metrics_dict["MSLE"]
    df = df.rename(index={"Actual_Delay": "Actual_Delay_Mean", "Predicted_Delay": "Predicted_Delay_Mean", "Difference_In_Delay": "Difference_In_Delay_Mean"})
    return df

In [7]:
# initialised empty daraframe and train and test data
depth_trials = pd.DataFrame()
# Get all of the training and testing features, targets and planned data for building and testing
train_fetr, train_trgt, test_fetr, test_trgt, train_plan, test_plan = clean_and_split(clean_data, 0)
# calculate cross val score max-depth incrimental
for i in range(1,21):
    # Keep track of what loop we are on 
    print("Loop {} of 20".format(i), end="\r")
    if i < 20:
        # Make the model with the current depth value, Train data used in making model
        randforest_model = RandomForestRegressor(n_estimators=100, max_features='auto', max_depth = i, oob_score=True, random_state=1).fit(train_fetr, train_trgt)
        # Obtain predicted values using test data as input
        randforest_model_predict = list(map(round, randforest_model.predict(test_fetr)))
        # Build the metrics dictionary with test data and predictions
        metrics_dict = test_model_outcome(randforest_model_predict, test_trgt, test_plan)
        # Build the dataframe
        randforrest_results = metrics_builder(metrics_dict)
        # Rename column name to show what trial this is
        randforrest_results.rename(columns={'Metrics':f'depth={i}'}, inplace=True)
    else:
        # Data much as above only depth is set to none to show maximum depth
        randforest_model = RandomForestRegressor(n_estimators=100, max_features='auto', max_depth = None, oob_score=True, random_state=1).fit(train_fetr, train_trgt)
        randforest_model_predict = list(map(round, randforest_model.predict(test_fetr)))
        metrics_dict = test_model_outcome(randforest_model_predict, test_trgt, test_plan)
        randforrest_results = metrics_builder(metrics_dict)
        randforrest_results.rename(columns={'Metrics':f'depth=None'}, inplace=True)
    # Append dataframe to trials dataframe
    depth_trials = pd.concat([depth_trials, randforrest_results], axis=1)
# Show results
depth_trials

Loop 20 of 20

Unnamed: 0,depth=1,depth=2,depth=3,depth=4,depth=5,depth=6,depth=7,depth=8,depth=9,depth=10,depth=11,depth=12,depth=13,depth=14,depth=15,depth=16,depth=17,depth=18,depth=19,depth=None
Actual_Delay_Mean,-50.729612,-50.729612,-50.729612,-50.729612,-50.729612,-50.729612,-50.729612,-50.729612,-50.729612,-50.729612,-50.729612,-50.729612,-50.729612,-50.729612,-50.729612,-50.729612,-50.729612,-50.729612,-50.729612,-50.729612
Predicted_Delay_Mean,-501.073803,-584.607309,-489.485812,-302.766364,-121.808409,-19.435949,-93.837387,-101.780131,-54.290326,-1.607124,-65.413204,-123.207899,-190.766762,-88.66638,-12.980814,-51.852757,-63.515463,-53.290016,-52.40289,-56.046525
Difference_In_Delay_Mean,450.344191,533.877697,438.7562,252.036751,71.078797,-31.293664,43.107774,51.050518,3.560713,-49.122489,14.683592,72.478287,140.03715,37.936768,-37.748798,1.123144,12.78585,2.560403,1.673277,5.316913
Max_Actual_Delay,1766.0,1766.0,1766.0,1766.0,1766.0,1766.0,1766.0,1766.0,1766.0,1766.0,1766.0,1766.0,1766.0,1766.0,1766.0,1766.0,1766.0,1766.0,1766.0,1766.0
Min_Actual_Delay,-3689.0,-3689.0,-3689.0,-3689.0,-3689.0,-3689.0,-3689.0,-3689.0,-3689.0,-3689.0,-3689.0,-3689.0,-3689.0,-3689.0,-3689.0,-3689.0,-3689.0,-3689.0,-3689.0,-3689.0
Max_Predicted_Delay,32194.0,30602.0,28894.0,27497.0,26130.0,24432.0,22664.0,20869.0,19064.0,17289.0,15402.0,13422.0,13002.0,10896.0,8778.0,6561.0,4505.0,3280.0,3290.0,3676.0
Min_Predicted_Delay,-35931.0,-37523.0,-39231.0,-40628.0,-41995.0,-43693.0,-45461.0,-47256.0,-49061.0,-50836.0,-52723.0,-53662.0,-10616.0,-9204.0,-7350.0,-6762.0,-5365.0,-5365.0,-5364.0,-5014.0
Mean_Absolute_Error,263.105924,263.105924,263.105924,263.105924,263.105924,263.105924,263.105924,263.105924,263.105924,263.105924,263.105924,263.105924,263.105924,263.105924,263.105924,263.105924,263.105924,263.105924,263.105924,263.105924
R2,0.999515,0.999515,0.999515,0.999515,0.999515,0.999515,0.999515,0.999515,0.999515,0.999515,0.999515,0.999515,0.999515,0.999515,0.999515,0.999515,0.999515,0.999515,0.999515,0.999515
Median_Absolute_Error,183.0,183.0,183.0,183.0,183.0,183.0,183.0,183.0,183.0,183.0,183.0,183.0,183.0,183.0,183.0,183.0,183.0,183.0,183.0,183.0


In [8]:
est_tials = pd.DataFrame()

n_est = [1, 2, 4, 8, 16, 32, 64, 100, 200, 250]

for i, est in enumerate(n_est):
    # Keep track of what loop we are on
    print("Loop {} of 10".format(i), end="\r")
    # Using optimised Depth build model with current estimator value(obtained in loop)
    randforest_model = RandomForestRegressor(n_estimators=est, max_features='auto', max_depth = 18, oob_score=True, random_state=1).fit(train_fetr, train_trgt)
    # Predict values using test features on model
    randforest_model_predict = list(map(round, randforest_model.predict(test_fetr)))
    # Build the metrics dictionary with test data and predictions
    metrics_dict = test_model_outcome(randforest_model_predict, test_trgt, test_plan)
    # Build the dataframe
    randforrest_results = metrics_builder(metrics_dict)
    # Rename column name to show what trial this is
    randforrest_results.rename(columns={'Metrics':f'estimators={est}'}, inplace=True)
    # Append dataframe to trials dataframe
    est_tials = pd.concat([est_tials, randforrest_results], axis=1)
# Show results
est_tials

Loop 9 of 10

Unnamed: 0,estimators=1,estimators=2,estimators=4,estimators=8,estimators=16,estimators=32,estimators=64,estimators=100,estimators=200,estimators=250
Actual_Delay_Mean,-50.729612,-50.729612,-50.729612,-50.729612,-50.729612,-50.729612,-50.729612,-50.729612,-50.729612,-50.729612
Predicted_Delay_Mean,-51.798475,-53.070235,-54.071298,-54.378817,-54.656943,-54.223995,-53.686487,-53.290016,-53.248316,-53.331645
Difference_In_Delay_Mean,1.068863,2.340623,3.341685,3.649205,3.92733,3.494382,2.956874,2.560403,2.518703,2.602033
Max_Actual_Delay,1766.0,1766.0,1766.0,1766.0,1766.0,1766.0,1766.0,1766.0,1766.0,1766.0
Min_Actual_Delay,-3689.0,-3689.0,-3689.0,-3689.0,-3689.0,-3689.0,-3689.0,-3689.0,-3689.0,-3689.0
Max_Predicted_Delay,3727.0,3554.0,3180.0,3183.0,3196.0,3234.0,3257.0,3280.0,3259.0,3256.0
Min_Predicted_Delay,-5448.0,-5455.0,-5423.0,-5409.0,-5392.0,-5391.0,-5384.0,-5365.0,-5374.0,-5376.0
Mean_Absolute_Error,263.105924,263.105924,263.105924,263.105924,263.105924,263.105924,263.105924,263.105924,263.105924,263.105924
R2,0.999515,0.999515,0.999515,0.999515,0.999515,0.999515,0.999515,0.999515,0.999515,0.999515
Median_Absolute_Error,183.0,183.0,183.0,183.0,183.0,183.0,183.0,183.0,183.0,183.0


In [9]:
rand_split_trials = pd.DataFrame()

# Similar to previous tests only this time we use optimal estimators and depth but alter the 
# Random train test split to show variability.
for i in range(1, 21):
    print("Loop {} of 20".format(i), end="\r")
    train_fetr, train_trgt, test_fetr, test_trgt, train_plan, test_plan = clean_and_split(clean_data, i)
    randforest_model = RandomForestRegressor(n_estimators=16, max_features='auto', max_depth = 18, oob_score=True, random_state=1).fit(train_fetr, train_trgt)
    randforest_model_predict = list(map(round, randforest_model.predict(test_fetr)))
    metrics_dict = test_model_outcome(randforest_model_predict, test_trgt, test_plan)
    randforrest_results = metrics_builder(metrics_dict)
    randforrest_results.rename(columns={'Metrics':f'Rand_Split={i}'}, inplace=True)
    rand_split_trials = pd.concat([rand_split_trials, randforrest_results], axis=1)

rand_split_trials

Loop 20 of 20

Unnamed: 0,Rand_Split=1,Rand_Split=2,Rand_Split=3,Rand_Split=4,Rand_Split=5,Rand_Split=6,Rand_Split=7,Rand_Split=8,Rand_Split=9,Rand_Split=10,Rand_Split=11,Rand_Split=12,Rand_Split=13,Rand_Split=14,Rand_Split=15,Rand_Split=16,Rand_Split=17,Rand_Split=18,Rand_Split=19,Rand_Split=20
Actual_Delay_Mean,-50.304758,-40.634251,-35.779128,-67.011905,-51.562343,-53.321618,-54.833109,-33.977788,-49.494612,-38.582098,-57.452273,-46.675027,-38.774435,-52.952516,-74.462787,-57.50783,-50.998429,-45.400804,-53.302604,-54.003673
Predicted_Delay_Mean,-28.261209,-33.189863,-54.496304,-62.726301,-64.79253,-67.298411,-88.832072,-30.636345,-110.512806,-40.508357,-55.83964,-57.812908,-17.956885,-46.859755,-71.004437,-61.722472,-62.278734,-36.545742,-47.101202,-46.093886
Difference_In_Delay_Mean,-22.043549,-7.444388,18.717176,-4.285604,13.230187,13.976793,33.998963,-3.341443,61.018194,1.926259,-1.612634,11.137881,-20.81755,-6.09276,-3.45835,4.214642,11.280305,-8.855062,-6.201402,-7.909786
Max_Actual_Delay,4338.0,4338.0,1791.0,4338.0,1766.0,4338.0,4338.0,1766.0,4338.0,2219.0,1418.0,4338.0,1595.0,2219.0,1766.0,1517.0,4338.0,1766.0,2219.0,1791.0
Min_Actual_Delay,-2319.0,-4526.0,-2084.0,-2834.0,-4526.0,-2319.0,-3098.0,-2389.0,-2319.0,-3098.0,-4526.0,-2637.0,-2834.0,-3098.0,-4526.0,-3098.0,-3098.0,-4526.0,-4526.0,-4526.0
Max_Predicted_Delay,3519.0,3090.0,3292.0,3304.0,3214.0,3332.0,3238.0,3600.0,26906.0,3204.0,3138.0,3391.0,3267.0,3260.0,3321.0,3020.0,3154.0,3195.0,3047.0,3077.0
Min_Predicted_Delay,-3911.0,-5817.0,-3434.0,-3894.0,-5903.0,-3582.0,-4720.0,-3639.0,-33589.0,-4355.0,-5861.0,-3695.0,-3851.0,-4632.0,-5877.0,-4713.0,-4676.0,-5768.0,-5837.0,-5790.0
Mean_Absolute_Error,265.859736,264.121647,258.191825,267.695471,279.44859,263.999843,278.686477,253.774158,269.806611,272.552103,261.877083,272.022668,266.205825,271.091515,275.420468,268.67892,271.247348,268.889174,270.560689,267.028465
R2,0.999512,0.999509,0.999553,0.999485,0.999476,0.999521,0.999449,0.999525,0.999488,0.999481,0.99953,0.999509,0.999497,0.99945,0.999427,0.999502,0.999482,0.999521,0.999477,0.9995
Median_Absolute_Error,180.0,186.0,178.0,186.0,187.0,181.0,196.0,175.0,186.0,189.0,180.0,189.0,183.0,186.0,187.0,187.0,186.0,186.0,187.0,187.0


In [10]:
# Import package pandas for data analysis
import pandas as pd
# Import OS to check if files exist
import os
# Import Pickle to save prediction model
import pickle
# Import individual sklearn modules used in model building and analysis
from sklearn.ensemble import RandomForestRegressor






pd.set_option('max_columns', 30)
def get_routes():
    route_df = pd.read_csv("../database_code/routes_tripids.csv")
    return route_df["Routes"]

def clean_and_split(route):
    if not os.path.isfile("../database_code/route_{}_leavetimes.csv".format(route)):
        with open('model_log.txt', 'a') as f:
            f.writelines("{} is not a valid route".format(route))
        return
    else:
        # imports file to DataFrame
        leave_df = pd.read_csv("../database_code/route_{}_leavetimes.csv".format(route))
    # Get Dummies for whole table on
    train_data = pd.get_dummies(leave_df, columns=["STOPPOINTID", "DIRECTION", "MONTH", "HOUR", "WEATHER_MAIN", "DAYOFWEEK", "WEATHER_ID"], drop_first=True)
    train_data.reset_index(drop=True, inplace=True)
    train_trgt = train_data["ACTUALTIME_ARR"]
    train_fetr = train_data.drop(columns=["ACTUALTIME_ARR", "TRIPID","PROGRNUMBER", "PLANNEDTIME_ARR", "VEHICLEID", "PLANNEDTIME_DEP", "ACTUALTIME_DEP", "DELAY", "TIMEATSTOP", "LINEID", "PLANNED_TRIP_DURATION", "ACTUAL_TRIP_DURATION", "YEAR", "DAY"])
    plan = train_data["PLANNEDTIME_ARR"]
    del leave_df, train_data
    return train_fetr, train_trgt, plan
def test_model(fetr, trgt, plan):
    with open('models/route_{}_RF_model.pkl'.format('49A'), 'rb') as handle:
        rand_forest_model = pickle.load(handle)
    randforest_model_predict = list(map(round, rand_forest_model.predict(fetr)))
    metrics_dict = test_model_outcome(randforest_model_predict, trgt, plan)
    randforrest_results = metrics_builder(metrics_dict)

def main():
    try:
        if os.path.isfile("model_tracker.csv"):
            track_df = pd.read_csv("model_tracker.csv")
        else:
            track_df = pd.DataFrame(columns=["Route", "Model"])
        if os.path.isfile("model_features.csv"):
            fetr_df = pd.read_csv("model_features.csv")
        else:
            fetr_df = pd.DataFrame(columns=["Route", "Features"])
        with open('model_log.txt', 'w') as f:
                f.write("Starting Model Building\n\n")

        routes = get_routes()
        routes = ['46A']
        for route in routes:
            if track_df[(track_df["Route"] == route)].empty:
                track_df.loc[track_df.shape[0]] = [route, 0]
            if not track_df[(track_df["Route"] == route) & (track_df["Model"] == 0)].empty:
                try:
                    train_fetr, train_trgt, plan = clean_and_split(route)
                    print(list(train_fetr.columns))
                    randforest_model = RandomForestRegressor(n_estimators=16, max_features='auto', max_depth=18,
                                                             oob_score=True, random_state=1).fit(train_fetr, train_trgt)
                    with open('models/route_{}_RF_model.pkl'.format(route), 'wb') as handle:
                        pickle.dump(randforest_model, handle, pickle.HIGHEST_PROTOCOL)
                    if fetr_df[(fetr_df["Route"] == route)].empty:
                        fetr_df.loc[fetr_df.shape[0]] = [route, list(train_fetr.columns)]
                    track_df.loc[track_df["Route"] == route, ["Model"]] = 1
                    test_model(train_fetr, train_trgt, plan)
                except:
                    continue


    except:
        pass
    finally:
        track_df.to_csv("model_tracker.csv", index=False)
        fetr_df.to_csv("model_features.csv", index=False)

if __name__ == '__main__':
    main()

['TEMP', 'FEELS_LIKE', 'TEMP_MIN', 'TEMP_MAX', 'PRESSURE', 'HUMIDITY', 'WIND_SPEED', 'WIND_DEG', 'CLOUDS_ALL', 'STOPPOINTID_81', 'STOPPOINTID_192', 'STOPPOINTID_264', 'STOPPOINTID_278', 'STOPPOINTID_320', 'STOPPOINTID_334', 'STOPPOINTID_401', 'STOPPOINTID_406', 'STOPPOINTID_435', 'STOPPOINTID_461', 'STOPPOINTID_747', 'STOPPOINTID_756', 'STOPPOINTID_757', 'STOPPOINTID_758', 'STOPPOINTID_759', 'STOPPOINTID_760', 'STOPPOINTID_761', 'STOPPOINTID_762', 'STOPPOINTID_763', 'STOPPOINTID_767', 'STOPPOINTID_768', 'STOPPOINTID_769', 'STOPPOINTID_770', 'STOPPOINTID_771', 'STOPPOINTID_772', 'STOPPOINTID_773', 'STOPPOINTID_774', 'STOPPOINTID_775', 'STOPPOINTID_776', 'STOPPOINTID_777', 'STOPPOINTID_786', 'STOPPOINTID_792', 'STOPPOINTID_795', 'STOPPOINTID_796', 'STOPPOINTID_797', 'STOPPOINTID_798', 'STOPPOINTID_799', 'STOPPOINTID_800', 'STOPPOINTID_801', 'STOPPOINTID_802', 'STOPPOINTID_803', 'STOPPOINTID_804', 'STOPPOINTID_805', 'STOPPOINTID_806', 'STOPPOINTID_807', 'STOPPOINTID_808', 'STOPPOINTID_809

In [11]:
randforrest_results

Unnamed: 0,Rand_Split=20
Actual_Delay_Mean,-54.003673
Predicted_Delay_Mean,-46.093886
Difference_In_Delay_Mean,-7.909786
Max_Actual_Delay,1791.0
Min_Actual_Delay,-4526.0
Max_Predicted_Delay,3077.0
Min_Predicted_Delay,-5790.0
Mean_Absolute_Error,267.028465
R2,0.9995
Median_Absolute_Error,187.0
