## Test Notebook (work in progress) - more detailed metrics will be added to the model function as I go

In [1]:
import modules.preprocessing_functions as pre

In [2]:
import pandas as pd
import numpy as np


from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


def split_data(X, y, test_size=0.2, random_state=99):
    """Split features and target into train and test partitions.

    Delegates to ``train_test_split``, forwarding ``test_size`` and
    ``random_state`` as keyword arguments.

    Parameters
    ----------
    X : features to split.
    y : target to split, passed alongside ``X``.
    test_size : forwarded to ``train_test_split`` (default 0.2).
    random_state : seed forwarded to ``train_test_split`` (default 99).

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` in that order.
    """
    split_options = {'test_size': test_size, 'random_state': random_state}
    train_features, test_features, train_target, test_target = train_test_split(X, y, **split_options)
    return (train_features, test_features, train_target, test_target)

def random_forest(X_train, X_test, y_train, y_test, random_state=99):
    """Fit a random-forest classifier and print its feature importances and accuracy.

    A ``RandomForestClassifier`` is fitted on the training data, predictions
    are made for ``X_test``, and two things are printed: the per-feature
    importances (sorted from most to least important) and the accuracy of
    the predictions against ``y_test``.

    Parameters
    ----------
    X_train : training features; must expose ``.columns`` (e.g. a DataFrame),
        which supplies the labels for the importance table.
    X_test : test features passed to ``predict``.
    y_train : training labels.
    y_test : true labels compared with the predictions.
    random_state : seed passed to ``RandomForestClassifier`` (default 99).

    Returns
    -------
    None
        Results are printed only.
    """
    # Named `model` so the local does not shadow this function's own name.
    model = RandomForestClassifier(random_state=random_state)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Pass the column names as a flat list. Wrapping them in another list
    # (index=[col_names]) makes pandas build a one-level MultiIndex whose
    # labels are 1-tuples rather than plain column names.
    col_names = X_train.columns.tolist()
    feature_scores = pd.Series(model.feature_importances_, index=col_names).sort_values(ascending=False)
    print(feature_scores)
    print('Model accuracy score: {0:0.4f}'.format(accuracy_score(y_test, y_pred)))



## Quick Test using just two features with 1% of Jan 2018 data

In [3]:
# Note: the CSV lives only on my local machine and is not pushed to GitHub -
# adjust the path below to wherever your copy is stored.

# Load the January 2018 file and keep a seeded 1% random sample.
# NOTE(review): replace=True samples with replacement, so the same row can be
# drawn more than once; duplicates may then fall on both sides of the
# train/test split. Confirm this is intended.
flights = pd.read_csv('../data/2018-01.csv', low_memory=False).sample(frac=0.01, replace=True, random_state=1)
# NOTE(review): this bare expression is not the last line of the cell, so it
# displays nothing.
flights
# Keep the two scheduled-time columns plus the target, dropping rows with any missing value.
test = flights[['crs_dep_time', 'crs_arr_time', 'arr_delay']].dropna()
X = test[['crs_dep_time', 'crs_arr_time']]
y = test.arr_delay

# Split with the defaults of split_data, then fit and report.
# NOTE(review): random_forest fits a classifier and reports accuracy, while
# arr_delay appears to be a numeric delay value; the recorded accuracy of
# 0.0292 suggests a regressor or a binned target may be more appropriate -
# confirm.
X_train, X_test, y_train, y_test = split_data(X, y)
random_forest(X_train, X_test, y_train, y_test)

crs_arr_time    0.561826
crs_dep_time    0.438174
dtype: float64
Model accuracy score: 0.0292


## Test of some of Doron's Functions

In [4]:
# Run the project's preprocessing helper on the sampled flights frame.
# NOTE(review): the effect of purged=True is defined in
# modules/preprocessing_functions and is not visible here - confirm there.
flight_test = pre.flight_test_features(flights, purged=True)
# Last expression of the cell, so the resulting frame is displayed.
flight_test

Unnamed: 0,fl_date,op_unique_carrier,tail_num,origin,origin_city_name,dest,dest_city_name,crs_dep_time,dep_time,dep_delay,...,distance,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,first_dep_time,total_add_gtime,longest_add_gtime,no_name
128037,2018-01-07,DL,N330DX,LGA,"New York, NY",MCO,"Orlando, FL",705,705,0.0,...,950,,,,,,,,,
491755,2018-01-26,YV,N86316,ICT,"Wichita, KS",IAH,"Houston, TX",625,623,-2.0,...,542,,,,,,,,,
470924,2018-01-25,WN,N915WN,LGA,"New York, NY",MDW,"Chicago, IL",600,555,-5.0,...,725,,,,,,,,,
491263,2018-01-26,OH,N258PS,CRW,"Charleston/Dunbar, WV",DCA,"Washington, DC",1847,1836,-11.0,...,249,,,,,,,,,
371403,2018-01-20,OO,N203SY,SFO,"San Francisco, CA",SLC,"Salt Lake City, UT",800,758,-2.0,...,599,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
428126,2018-01-23,AA,N942NN,MIA,"Miami, FL",SJU,"San Juan, PR",1405,1412,7.0,...,1045,,,,,,,,,
146816,2018-01-08,DL,N333NB,SLC,"Salt Lake City, UT",JAC,"Jackson, WY",2009,2052,43.0,...,205,0.0,0.0,0.0,0.0,25.0,,,,
385842,2018-01-21,DL,N676DL,RSW,"Fort Myers, FL",ATL,"Atlanta, GA",1500,1456,-4.0,...,515,,,,,,,,,
578181,2018-01-30,WN,N7726A,PHX,"Phoenix, AZ",SAN,"San Diego, CA",1725,1748,23.0,...,304,0.0,0.0,0.0,0.0,21.0,,,,
