# Module 2 Classification

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,LabelEncoder

In [None]:
# One-off data preparation, kept commented out for reference.
# It reads the merged data from 'data/mergeTotal', drops the columns listed
# below and writes the result to 'data/readyData', which the preprocessing
# cell then loads. Uncomment only to regenerate 'data/readyData'.
# dataset = pd.read_feather('data/mergeTotal')    
# dataset = dataset.drop(columns=['date','time','airport','roundTime','Quarter','Year','FlightDate','ArrTime','DepTime','DepDel15'])
# dataset = dataset.drop(columns=['ArrDelayMinutes','weatherCode','OriginAirportID','DestAirportID'])
# dataset.to_feather('data/readyData')

In [8]:
# Preprocessing: load the prepared feather file into a DataFrame.
dataset = pd.read_feather(path='data/readyData')

In [9]:
# Encode the airport codes in 'Origin' and 'Dest' as integers.
# Both columns share one code space, so the encoder is fitted on the union of
# the two columns. Fitting on 'Origin' alone makes transform() raise
# ValueError for any airport that only ever appears as a destination.
# When every destination also occurs as an origin the resulting codes are
# identical to fitting on 'Origin' only.
label_encode = LabelEncoder()
label_encode.fit(pd.concat([dataset['Origin'], dataset['Dest']], ignore_index=True))
dataset['Origin'] = label_encode.transform(dataset['Origin'])
dataset['Dest'] = label_encode.transform(dataset['Dest'])


In [10]:
# Separate the target from the features.
# The label is selected by name rather than by position (previously column 7),
# so the cell keeps working if the column order of the prepared file changes
# and always agrees with the column that is removed from the features.
# `dataset` itself is not reassigned, so re-running this cell gives the same
# X and Y instead of silently picking a different column.
TARGET_COLUMN = 'ArrDel15'
Y = dataset[TARGET_COLUMN].values
X = dataset.drop(columns=[TARGET_COLUMN]).values

In [11]:
# Train/test split: 25% of the rows are held out, with a fixed seed.
X_train, X_test, Y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=0,
)
# Drop the references to the unsplit data.
del dataset
del X
del Y

In [6]:
# Names assigned, in order, to the columns of the loaded feature arrays.
columns = ['Month', 'DayofMonth', 'Origin', 'Dest', 'CRSDepTime',
       'DepDelayMinutes', 'CRSArrTime', 'windspeedKmph', 'winddirDegree',
       'precipMM', 'visibility', 'pressure', 'cloudcover', 'DewPointF',
       'WindGustKmph', 'tempF', 'WindChillF', 'humidity']

# Cache locations of the train/test split. Saving and loading share these
# constants so the two can never drift apart: the commented-out save used to
# write 'ytrain_CLasi.npy' while the load reads 'ytrain_Clasi.npy', which are
# different files on a case-sensitive filesystem.
X_TRAIN_PATH = 'data/xtrain.npy'
X_TEST_PATH = 'data/xtest.npy'
Y_TRAIN_PATH = 'data/ytrain_Clasi.npy'
Y_TEST_PATH = 'data/ytest_Clasi.npy'

# Uncomment to (re)write the cache from the split currently in memory.
# np.save(X_TRAIN_PATH, X_train, allow_pickle=True)
# np.save(X_TEST_PATH, X_test, allow_pickle=True)
# np.save(Y_TRAIN_PATH, Y_train, allow_pickle=True)
# np.save(Y_TEST_PATH, y_test, allow_pickle=True)

# SECURITY: allow_pickle=True lets np.load unpickle arbitrary objects, which
# can execute code. Only load cache files that this project wrote itself.
X_train = np.load(X_TRAIN_PATH, allow_pickle=True)
X_test = np.load(X_TEST_PATH, allow_pickle=True)
Y_train = np.load(Y_TRAIN_PATH, allow_pickle=True)
y_test = np.load(Y_TEST_PATH, allow_pickle=True)

# Wrap the feature arrays in DataFrames; the targets stay plain arrays.
X_train = pd.DataFrame(X_train, columns=columns)
X_test = pd.DataFrame(X_test, columns=columns)
# Y_train = pd.DataFrame(Y_train,columns=['ArrDel15'])
# y_test = pd.DataFrame(y_test,columns=['ArrDel15'])

In [None]:
# Oversampling: rebalance the training set by random oversampling.
# NOTE(review): this notebook also contains random-undersampling and SMOTE
# cells that overwrite X_train/Y_train the same way. They look like
# alternatives (only the SMOTE cell carries an execution count), so run just
# one of them; running all three stacks the resampling steps.
from imblearn.over_sampling import RandomOverSampler
# Seeded so the resampled training set is reproducible between runs; the seed
# matches the one used by the SMOTE cell.
ros = RandomOverSampler(random_state=42)
X_train, Y_train = ros.fit_resample(X_train, Y_train)

In [None]:
# Undersampling: rebalance the training set by random undersampling.
# NOTE(review): this notebook also contains random-oversampling and SMOTE
# cells that overwrite X_train/Y_train the same way. They look like
# alternatives (only the SMOTE cell carries an execution count), so run just
# one of them; running all three stacks the resampling steps.
from imblearn.under_sampling import RandomUnderSampler
# Seeded so the retained rows are the same on every run; the seed matches the
# one used by the SMOTE cell.
rus = RandomUnderSampler(random_state=42)
X_train, Y_train = rus.fit_resample(X_train, Y_train)

In [6]:
# SMOTE: resample the training set with a fixed seed (test data is not touched here).
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=42)
X_train, Y_train = sm.fit_resample(X=X_train, y=Y_train)

In [7]:
# Feature scaling: the scaler's statistics are learned from the training data
# only and then applied unchanged to both the training and the test features.
sc_x = StandardScaler().fit(X_train)
X_train = sc_x.transform(X_train)
X_test = sc_x.transform(X_test)

In [8]:
# Logistic regression: build the model with a fixed seed and fit it on the
# training data. The fit call stays last so the cell still displays the model.
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state=0)
classifier.fit(X=X_train, y=Y_train)

LogisticRegression(random_state=0)

In [9]:
# Predict labels for the test features with the fitted `classifier`.
y_pred = classifier.predict(X=X_test)

In [10]:
# Classifier results: score the current y_pred against y_test
# (the logistic-regression predictions when the cells run in order).
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

# Confusion matrix is kept in `cm` for inspection; it is not printed here.
cm = confusion_matrix(y_test, y_pred)

# Each entry pairs the output template with the metric that fills it.
for template, scorer in (
    ('Precision: %.3f', precision_score),
    ('Recall: %.3f', recall_score),
    ('F1 Score: %.3f', f1_score),
    ('Accuracy: %.3f', accuracy_score),
):
    print(template % scorer(y_test, y_pred))

Precision: 0.734
Recall: 0.778
F1 Score: 0.755
Accuracy: 0.895


In [11]:
# Decision tree classification: fit on the training data, then predict the
# test labels into y_pred.
from sklearn import tree
# random_state is fixed because the tree permutes candidate features at each
# split; without a seed two runs can produce different trees and scores.
# The value 0 matches the seed used by the other models in this notebook.
clf = tree.DecisionTreeClassifier(random_state=0)
clf = clf.fit(X_train, Y_train)
y_pred = clf.predict(X_test)

In [12]:
# Report the four scores for the current y_pred under the decision-tree heading.
print("Decision Tree results")
dt_precision = precision_score(y_test, y_pred)
dt_recall = recall_score(y_test, y_pred)
dt_f1 = f1_score(y_test, y_pred)
dt_accuracy = accuracy_score(y_test, y_pred)
print('Precision: %.3f' % dt_precision)
print('Recall: %.3f' % dt_recall)
print('F1 Score: %.3f' % dt_f1)
print('Accuracy: %.3f' % dt_accuracy)

Decision Tree results
Precision: 0.666
Recall: 0.703
F1 Score: 0.684
Accuracy: 0.865


In [None]:
# Bagged linear SVMs wrapped in a one-vs-rest classifier.
# Each of the n_estimators SVCs is fitted on a random subsample whose size is
# a 1/n_estimators fraction of the training rows (max_samples is a float).
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import BaggingClassifier
n_estimators = 10
svm_clf = OneVsRestClassifier(
    BaggingClassifier(
        SVC(kernel='linear', probability=False, class_weight='balanced'),
        max_samples=1.0 / n_estimators,
        n_estimators=n_estimators,
        # Seed the random subsampling so the ensemble, and therefore the
        # reported scores, are reproducible; 0 matches the other models.
        random_state=0,
    )
)
svm_clf.fit(X_train,Y_train)
y_pred = svm_clf.predict(X_test)

In [None]:
# Report the four scores for the current y_pred under the SVM heading.
print("SVM results")
for template, scorer in (
    ('Precision: %.3f', precision_score),
    ('Recall: %.3f', recall_score),
    ('F1 Score: %.3f', f1_score),
    ('Accuracy: %.3f', accuracy_score),
):
    print(template % scorer(y_test, y_pred))

In [13]:
# Extra-trees classifier: 100 trees with a fixed seed, fitted on the training
# data. The fit call stays last so the cell still displays the model.
from sklearn.ensemble import ExtraTreesClassifier
ET_clf = ExtraTreesClassifier(
    n_estimators=100,
    random_state=0,
)
ET_clf.fit(X=X_train, y=Y_train)

ExtraTreesClassifier(random_state=0)

In [14]:
# Predict labels for the test features with the fitted extra-trees model.
y_pred = ET_clf.predict(X=X_test)

In [15]:
# Report the four scores for the current y_pred under the extra-trees heading.
# The metric functions are imported here so the cell also works on its own.
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
)

print("Extra tree results")
for template, scorer in (
    ('Precision: %.3f', precision_score),
    ('Recall: %.3f', recall_score),
    ('F1 Score: %.3f', f1_score),
    ('Accuracy: %.3f', accuracy_score),
):
    print(template % scorer(y_test, y_pred))

Extra tree results
Precision: 0.822
Recall: 0.727
F1 Score: 0.772
Accuracy: 0.910


In [12]:
# Gradient boosting: 100 estimators, learning_rate=1.0, fixed seed.
# fit() returns the fitted estimator, so the test predictions are taken
# directly from the fitted model.
from sklearn.ensemble import GradientBoostingClassifier
GB_clf = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=1.0,
    random_state=0,
)
y_pred = GB_clf.fit(X_train, Y_train).predict(X_test)

In [17]:
# Report the four scores for the current y_pred under the gradient-boost heading.
# The metric functions are imported here so the cell also works on its own.
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
)

print("Gradient Boost results")
gb_precision = precision_score(y_test, y_pred)
gb_recall = recall_score(y_test, y_pred)
gb_f1 = f1_score(y_test, y_pred)
gb_accuracy = accuracy_score(y_test, y_pred)
print('Precision: %.3f' % gb_precision)
print('Recall: %.3f' % gb_recall)
print('F1 Score: %.3f' % gb_f1)
print('Accuracy: %.3f' % gb_accuracy)

Gradient Boost results
Precision: 0.885
Recall: 0.690
F1 Score: 0.775
Accuracy: 0.917
