In [2]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt 
from sklearn import preprocessing
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.ensemble import RandomForestClassifier

In [3]:
dfFlight = pd.read_csv('FlightDelayWithWeather.csv')
dfFlight.head()

Unnamed: 0,DEPARTURE_DELAY,SCHEDULED_TIME,DISTANCE,Mon,Tue,Wed,Thu,Fri,Sat,AA,...,VX,Air_Temperature,Dew_Point_Temperature,Relative_Humidity,Wind_Direction,Wind_Speed,Precipitation_Per_Hour,Pressure_Altimeter,Sea_Level_Pressure,Visibility
0,0,286.0,2296,0,0,0,1,0,0,0,...,0,53.06,32.0,44.48,280.0,7.0,0.0,30.09,1018.9,10.0
1,0,217.0,1589,0,0,0,1,0,0,0,...,0,53.06,32.0,44.48,280.0,7.0,0.0,30.09,1018.9,10.0
2,0,195.0,1464,0,0,0,1,0,0,1,...,0,53.06,32.0,44.48,280.0,7.0,0.0,30.09,1018.9,10.0
3,0,218.0,1635,0,0,0,1,0,0,0,...,0,53.06,32.0,44.48,280.0,7.0,0.0,30.09,1018.9,10.0
4,1,146.0,967,0,0,0,1,0,0,0,...,0,44.96,35.96,70.52,150.0,4.0,0.0,30.08,1018.7,10.0


# Normalization

In [3]:
def sparse_normalizer(data):
    max_abs_scaler = preprocessing.MaxAbsScaler()
    np_scaled = max_abs_scaler.fit_transform(data)
    df_normalized = pd.DataFrame(np_scaled)
    return df_normalized

In [4]:
def normalizer(data):
    min_max_scaler = preprocessing.MinMaxScaler()
    np_scaled = min_max_scaler.fit_transform(data)
    df_normalized = pd.DataFrame(np_scaled)
    return df_normalized

In [5]:
X = np.array(dfFlight.drop(['DEPARTURE_DELAY'], axis = 1).values)
Y = np.array(dfFlight['DEPARTURE_DELAY'])

X = normalizer(X)
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,17,18,19,20,21,22,23,24,25,26
0,0.75,0.84469,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.316456,0.25,0.337944,0.777778,0.21875,0.0,0.506667,0.505882,1.0
1,0.537037,0.575561,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.316456,0.25,0.337944,0.777778,0.21875,0.0,0.506667,0.505882,1.0
2,0.469136,0.527979,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.316456,0.25,0.337944,0.777778,0.21875,0.0,0.506667,0.505882,1.0
3,0.540123,0.593072,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.316456,0.25,0.337944,0.777778,0.21875,0.0,0.506667,0.505882,1.0
4,0.317901,0.338789,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.174051,0.340164,0.648462,0.416667,0.125,0.0,0.493333,0.498039,1.0


# Model Selection

In [6]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

In [7]:
model = RandomForestClassifier(n_jobs=-1, random_state=0) 

predicted = cross_val_predict(model, X_train, Y_train, cv=10)
print(metrics.accuracy_score(Y_train, predicted))

0.6384443718702373


In [8]:
def calc_score(X, Y, _criterion, _max_features):
    clf = RandomForestClassifier(n_jobs=-1, random_state=0, criterion = _criterion, max_features = _max_features)
    predicted = cross_val_predict(clf, X, Y, cv = 10)
    score = metrics.accuracy_score(Y, predicted)
    return score

In [9]:
calc_score(X_train, Y_train, 'gini', 'sqrt')

0.6384443718702373

In [10]:
calc_score(X_train, Y_train, 'entropy', 'auto')

0.6392971188039771

In [11]:
calc_score(X_train, Y_train, 'gini', 'log2')

0.6381268597140576

In [12]:
from sklearn import linear_model

In [13]:
clf = linear_model.SGDClassifier(max_iter=1000, tol=1e-3)
predicted = cross_val_predict(clf, X_train, Y_train, cv=10)
print(metrics.accuracy_score(Y_train, predicted))

0.6121725088903404


In [14]:
from sklearn.linear_model import RidgeClassifier

In [15]:
clf = RidgeClassifier()
predicted = cross_val_predict(clf, X_train, Y_train, cv=10)
print(metrics.accuracy_score(Y_train, predicted))

0.626678278539807


In [16]:
from sklearn.svm import LinearSVC

In [17]:
clf = LinearSVC()
predicted = cross_val_predict(clf, X_train, Y_train, cv=10)
print(metrics.accuracy_score(Y_train, predicted))

0.6265694172291167


In [20]:
model = RandomForestClassifier(n_jobs=-1, random_state=0) 
model.fit(X_train, Y_train)
predicted = model.predict(X_test)
print(metrics.accuracy_score(Y_test, predicted))

0.6354004136579702


# Dimentional Reduction

In [39]:
from sklearn.decomposition import PCA

In [42]:
def pca_reduction(data, n):
    pca = PCA(n_components=n, svd_solver='full')
    data_pca = pca.fit_transform(data)
    df_pca = pd.DataFrame(data_pca)
    return df_pca
    
X_pca = pca_reduction(X, 2)
X_pca.head()

Unnamed: 0,0,1
0,0.089135,-0.053831
1,0.11086,0.034027
2,0.122255,0.076569
3,-0.713153,-0.368042
4,-0.691951,-0.573489


In [43]:
X.var()

0     0.128094
1     0.122136
2     0.127750
3     0.125189
4     0.128740
5     0.103973
6     0.075831
7     0.033585
8     0.032302
9     0.061334
10    0.012860
11    0.004449
12    0.180330
13    0.213339
14    0.017479
15    0.096063
16    0.017829
17    0.023668
18    0.033336
19    0.109251
20    0.033226
21    0.001031
22    0.033263
23    0.032937
24    0.011921
dtype: float64

In [69]:
X.corr()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,1.0,-0.17176,-0.177304,-0.174768,-0.178288,-0.154064,-0.002178,-0.002173,-0.00124,0.001868,...,-0.000918,0.017342,0.005371,-0.012665,0.056285,0.053346,0.0219,-0.04078,-0.040466,-0.028901
1,-0.17176,1.0,-0.171429,-0.168978,-0.172381,-0.148959,0.000872,-0.001603,-0.000474,-0.003994,...,0.000819,-0.010754,-0.021508,-0.010273,0.070511,0.073126,0.031933,0.006084,0.006335,-0.038219
2,-0.177304,-0.171429,1.0,-0.174432,-0.177945,-0.153767,0.000662,-0.002356,-0.002564,0.002154,...,0.00054,-0.036525,-0.037621,-0.002968,-0.021911,-0.038852,-0.007473,0.070159,0.069451,0.054729
3,-0.174768,-0.168978,-0.174432,1.0,-0.1754,-0.151568,-0.003451,-0.001095,-0.001359,0.003004,...,0.001745,0.015021,0.00524,-0.008796,-0.059771,-0.049525,-0.009181,0.012348,0.012245,-0.03596
4,-0.178288,-0.172381,-0.177945,-0.1754,1.0,-0.154621,-0.005,-0.000565,0.000641,0.003135,...,0.002811,0.010664,0.024988,0.01175,-0.000617,0.001803,-0.02059,-0.022662,-0.022824,0.006332
5,-0.154064,-0.148959,-0.153767,-0.151568,-0.154621,1.0,0.00855,0.009656,0.002272,-0.013532,...,-0.006726,-0.014354,0.011157,0.031552,-0.032143,-0.027412,-0.005288,-0.007553,-0.007322,0.029854
6,-0.002178,0.000872,0.000662,-0.003451,-0.005,0.00855,1.0,-0.056995,-0.055817,-0.079566,...,-0.104264,-0.00897,0.001379,0.009954,0.015539,5.7e-05,0.000437,-0.012972,-0.012992,0.011909
7,-0.002173,-0.001603,-0.002356,-0.001095,-0.000565,0.009656,-0.056995,1.0,-0.035304,-0.050325,...,-0.065946,0.030389,0.005551,-0.029527,-0.00364,0.010759,-0.001093,0.001006,0.001041,-0.001194
8,-0.00124,-0.000474,-0.002564,-0.001359,0.000641,0.002272,-0.055817,-0.035304,1.0,-0.049285,...,-0.064583,0.00021,-0.003728,-0.003143,0.006139,0.006379,0.000223,-0.002618,-0.002606,0.004529
9,0.001868,-0.003994,0.002154,0.003004,0.003135,-0.013532,-0.079566,-0.050325,-0.049285,1.0,...,-0.092062,0.006877,0.003983,-0.005933,0.023848,0.016598,0.001546,-0.021689,-0.021713,0.016057
