## Naive Byes

In [9]:
# imports
import pandas as pd
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, classification_report, r2_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics

In [21]:
# Read the Dataframe here
#load data
jan_2019 = pd.read_csv('../Kyle_eda/data/01_2019_flights_weather.csv', low_memory=False)
# drop rows with nan in arr_delay
jan_2019.dropna(inplace=True,subset=['arr_delay'])
jan_2019 = jan_2019[jan_2019['arr_delay'] <240]


In [22]:
# encode mkt_unique_carrier
enc = OneHotEncoder()
enc.fit(jan_2019['mkt_unique_carrier'].values.reshape(-1,1))
enc_df = pd.DataFrame(enc.transform(jan_2019['mkt_unique_carrier'].values.reshape(-1,1)).toarray())

enc_df.reset_index(drop=True, inplace=True)
jan_2019.reset_index(drop=True, inplace=True)
jan_2019 = pd.concat([jan_2019, enc_df], axis=1,)

In [23]:
# get mapping dictionary from file to use for encoding origin and dest
dest_dict = pd.read_csv('../Kyle_eda/data/dest_dict.csv', index_col=0)
origin_dict = pd.read_csv('../Kyle_eda/data/origin_dict.csv', index_col=0)
dest_dict = dest_dict.T.to_dict('records')[0]
origin_dict = origin_dict.T.to_dict('records')[0]

In [24]:
# encode origin and dest
jan_2019['dest'] = jan_2019['dest'].map(dest_dict)
jan_2019['origin'] = jan_2019['origin'].map(origin_dict)


In [25]:
# mask for columns to keep
cols_to_keep = ['origin', 
                'dest', 
                'crs_elapsed_time',
                'distance',
                'crs_dep_time_hr',
                'crs_arr_time_hr',
                'precip_origin',
                'windspeedKmph_origin',
                'winddirDegree_origin',
                'visibility_origin',
                'DewPointC_origin',
                'pressure_origin',
                'cloudcover_origin',
                'WindGustKmph_origin',
                'humidity_origin',
                'tempC_origin',
                'precip_dest',
                'windspeedKmph_dest',
                'winddirDegree_dest',
                'visibility_dest',
                'DewPointC_dest',
                'pressure_dest',
                'cloudcover_dest',
                'WindGustKmph_dest',
                'humidity_dest',
                'tempC_dest',
               0,1,2,3,4,5,6,7,8,9]

X = jan_2019[cols_to_keep]
y = jan_2019['arr_delay']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [26]:
#Create the model
naive_model = GaussianNB()

#Train the model 
naive_model.fit(X_train,y_train)

GaussianNB()

In [27]:
# Pred
y_pred = naive_model.predict(X_test)

In [28]:
rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
print(f"RMSE: {rmse}")

RMSE: 144.1723010567653


In [29]:
y_pred

array([225., -60., 238., ..., -64., -59., -69.])

pandas.core.series.Series

In [11]:
# Coefficient of determination
print('Coefficient of determination: %.2f \n'
      % r2_score(y_test, y_pred))

Coefficient of determination: -71.29 



### ROC curve 

In [None]:
# Claculation
y_score1 = naive_model.predict_proba(X_test)[:,1]
y_score2 = naive_model.predict_proba(X_test)[:,1]

false_positive_rate1, true_positive_rate1, threshold1 = roc_curve(y_test, y_score1)
false_positive_rate2, true_positive_rate2, threshold2 = roc_curve(y_test, y_score2)

In [None]:
# ROC Score
print('roc_auc_score: ', roc_auc_score(y_test, y_score1))

In [None]:
# Visulaization
plt.subplots(1, figsize=(10,10))
plt.title('Random Forest ROC')
plt.plot(false_positive_rate1, true_positive_rate1)
plt.plot([0, 1], ls="--")
plt.plot([0, 0], [1, 0] , c=".7"), plt.plot([1, 1] , c=".7")
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
