# Module 2 Classification

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,LabelEncoder

In [13]:
# Commented-out data-preparation step: it would read 'data/mergeTotal', drop
# the listed columns and write the result to 'data/readyData', which is the
# file the preprocessing cell loads. Presumably run once and then disabled;
# re-enable only to regenerate 'data/readyData'.
# dataset = pd.read_feather('data/mergeTotal')    
# dataset = dataset.drop(columns=['date','time','airport','roundTime','Quarter','Year','FlightDate','ArrTime','DepTime','DepDel15'])
# dataset = dataset.drop(columns=['ArrDelayMinutes','weatherCode','OriginAirportID','DestAirportID'])
# dataset.to_feather('data/readyData')

In [2]:
# Preprocessing: load the prepared feather file into a DataFrame
dataset = pd.read_feather(path='data/readyData')

In [3]:
# Encode the 'Origin' and 'Dest' columns as integers with one shared encoder.
# The encoder is fitted on the union of both columns: fitting on 'Origin'
# alone makes transform() raise ValueError for any value that appears only
# in 'Dest'. When every 'Dest' value also occurs in 'Origin', the resulting
# codes are identical to fitting on 'Origin' only.
label_encode = LabelEncoder()
label_encode.fit(pd.concat([dataset['Origin'], dataset['Dest']], ignore_index=True))
dataset['Origin'] = label_encode.transform(dataset['Origin'])
dataset['Dest'] = label_encode.transform(dataset['Dest'])


In [4]:
# Separate the target from the features.
# The target is selected by name rather than by position (previously column
# index 7), so it always matches the 'ArrDel15' column that is dropped from
# the feature matrix, even if the column order of the input file changes.
Y = dataset['ArrDel15'].values
dataset = dataset.drop(columns=['ArrDel15'])
# Every remaining column is used as a feature.
X = dataset.values

In [5]:
# Hold out 25% of the rows as the test set, using a fixed seed
(X_train, X_test,
 Y_train, y_test) = train_test_split(
    X, Y,
    test_size=0.25,
    random_state=0,
)
# The full frame and arrays are no longer needed once the split exists
del X, Y, dataset

In [9]:
# Oversampling: rebalance the training split only (the test split is untouched).
# random_state is fixed so the resampled training set, and therefore every
# metric reported further down, is reproducible across runs.
# NOTE(review): the notebook also contains an undersampling cell that
# overwrites the same X_train / Y_train; the two look like alternatives.
# Running both in sequence presumably makes the second one redundant --
# confirm which strategy is intended and run only that cell.
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=0)
X_train,Y_train = ros.fit_resample(X_train, Y_train)

In [6]:
# Undersampling: rebalance the training split only (the test split is untouched).
# random_state is fixed so the rows that are kept, and therefore every metric
# reported further down, are reproducible across runs.
# NOTE(review): the notebook also contains an oversampling cell that
# overwrites the same X_train / Y_train; the two look like alternatives --
# confirm which strategy is intended and run only that cell.
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state=0)
X_train,Y_train = rus.fit_resample(X_train, Y_train)

In [7]:
# Feature scaling: the scaler is fitted on the training split only, and the
# same fitted scaler is then applied to both the training and the test split
sc_x = StandardScaler()
sc_x.fit(X_train)
X_train = sc_x.transform(X_train)
X_test = sc_x.transform(X_test)

In [10]:
# Fit a logistic regression model (seeded) on the training split
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state=0).fit(X_train, Y_train)
# Leave the fitted estimator as the cell's last expression so it is displayed
classifier

LogisticRegression(random_state=0)

In [11]:
# Predict labels for the test split with the fitted logistic regression
y_pred = classifier.predict(X=X_test)

In [12]:
# Logistic regression results on the test split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
# The confusion matrix is stored in `cm`; this cell does not print it
cm = confusion_matrix(y_test, y_pred)
# Each metric compares the true test labels with the current predictions
for template, scorer in (
    ('Precision: %.3f', precision_score),
    ('Recall: %.3f', recall_score),
    ('F1 Score: %.3f', f1_score),
    ('Accuracy: %.3f', accuracy_score),
):
    print(template % scorer(y_test, y_pred))

Precision: 0.738
Recall: 0.775
F1 Score: 0.757
Accuracy: 0.896


In [13]:
# Decision tree classification: fit on the training split, then overwrite
# y_pred with this model's predictions for the test split.
# random_state is fixed (matching the other seeded estimators in this
# notebook) so the fitted tree and its reported metrics are reproducible.
from sklearn import tree
clf = tree.DecisionTreeClassifier(random_state=0)
clf = clf.fit(X_train,Y_train)
y_pred = clf.predict(X_test)

In [14]:
# Report the decision tree's scores on the test split
print("Decision Tree results")
for template, scorer in (
    ('Precision: %.3f', precision_score),
    ('Recall: %.3f', recall_score),
    ('F1 Score: %.3f', f1_score),
    ('Accuracy: %.3f', accuracy_score),
):
    print(template % scorer(y_test, y_pred))

Decision Tree results
Precision: 0.496
Recall: 0.799
F1 Score: 0.612
Accuracy: 0.789


In [15]:
# Bagged linear SVM: n_estimators linear-kernel SVCs, each trained on a
# 1/n_estimators fraction of the training split, wrapped in a one-vs-rest
# classifier. y_pred is overwritten with this model's test predictions.
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import BaggingClassifier
n_estimators = 10
svm_clf = OneVsRestClassifier(
    BaggingClassifier(
        SVC(kernel='linear', probability=False, class_weight='balanced'),
        max_samples=1.0 / n_estimators,
        n_estimators=n_estimators,
        # Seed the random subsampling so the ensemble is reproducible,
        # consistent with the other seeded estimators in this notebook.
        random_state=0,
    )
)
svm_clf.fit(X_train,Y_train)
y_pred = svm_clf.predict(X_test)

In [None]:
# Report the bagged SVM's scores on the test split
print("SVM results")
for template, scorer in (
    ('Precision: %.3f', precision_score),
    ('Recall: %.3f', recall_score),
    ('F1 Score: %.3f', f1_score),
    ('Accuracy: %.3f', accuracy_score),
):
    print(template % scorer(y_test, y_pred))

In [8]:
# Fit an extra-trees ensemble (100 trees, seeded) on the training split
from sklearn.ensemble import ExtraTreesClassifier
ET_clf = ExtraTreesClassifier(random_state=0, n_estimators=100).fit(X_train, Y_train)
# Leave the fitted estimator as the cell's last expression so it is displayed
ET_clf

ExtraTreesClassifier(random_state=0)

In [9]:
# Overwrite y_pred with the extra-trees predictions for the test split
y_pred = ET_clf.predict(X=X_test)

In [10]:
# Report the extra-trees scores on the test split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
print("Extra tree results")
for template, scorer in (
    ('Precision: %.3f', precision_score),
    ('Recall: %.3f', recall_score),
    ('F1 Score: %.3f', f1_score),
    ('Accuracy: %.3f', accuracy_score),
):
    print(template % scorer(y_test, y_pred))

Extra tree results
Precision: 0.656
Recall: 0.821
F1 Score: 0.729
Accuracy: 0.873


In [11]:
# Gradient boosting: fit a seeded 100-stage model with learning rate 1.0 on
# the training split, then overwrite y_pred with its test predictions
from sklearn.ensemble import GradientBoostingClassifier
GB_clf = GradientBoostingClassifier(
    random_state=0,
    learning_rate=1.0,
    n_estimators=100,
).fit(X_train, Y_train)
y_pred = GB_clf.predict(X=X_test)

In [12]:
# Report the gradient boosting scores on the test split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
print("Gradient Boost results")
for template, scorer in (
    ('Precision: %.3f', precision_score),
    ('Recall: %.3f', recall_score),
    ('F1 Score: %.3f', f1_score),
    ('Accuracy: %.3f', accuracy_score),
):
    print(template % scorer(y_test, y_pred))

Gradient Boost results
Precision: 0.719
Recall: 0.792
F1 Score: 0.754
Accuracy: 0.892
