In [88]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
# Load the train and test splits in that order; the first CSV column is
# used as the row index of each frame.
df_train, df_test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("Data/train.csv", "Data/test.csv")
)
# Preview the first rows of the training frame.
df_train.head()

Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,70172,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,3,...,5,4,3,4,4,5,5,25,18.0,neutral or dissatisfied
1,5047,Male,disloyal Customer,25,Business travel,Business,235,3,2,3,...,1,1,5,3,1,4,1,1,6.0,neutral or dissatisfied
2,110028,Female,Loyal Customer,26,Business travel,Business,1142,2,2,2,...,5,4,3,4,4,4,5,0,0.0,satisfied
3,24026,Female,Loyal Customer,25,Business travel,Business,562,2,5,5,...,2,2,5,3,1,4,2,11,9.0,neutral or dissatisfied
4,119299,Male,Loyal Customer,61,Business travel,Business,214,3,3,3,...,3,3,4,4,3,3,3,0,0.0,satisfied


In [89]:
# cleaning

# Remove the 'id' column from both frames.
# The frames are rebound instead of mutated with inplace=True, and
# errors='ignore' lets this cell be re-run without a KeyError once 'id'
# has already been dropped.
df_train = df_train.drop(columns=['id'], errors='ignore')
df_test = df_test.drop(columns=['id'], errors='ignore')

In [90]:
# checking if there are any nulls: print the name of every training
# column that contains at least one missing value
for null_column in df_train.columns[df_train.isnull().any()]:
    print(null_column)

# there are nulls in 'Arrival Delay in Minutes' column. fill with the avg
# (the mean is computed on the training set only and reused for the test set)
Arrival_Delay_in_Minutes_avg = df_train['Arrival Delay in Minutes'].mean()
# Assign the filled column back: fillna(inplace=True) on a column selection
# is a chained in-place update that newer pandas warns about and that does
# not modify the parent frame under Copy-on-Write.
df_train['Arrival Delay in Minutes'] = (
    df_train['Arrival Delay in Minutes'].fillna(Arrival_Delay_in_Minutes_avg)
)

# final check for nulls -- a bare expression in the middle of a cell is
# discarded, so make the check explicit
assert not df_train.isnull().values.any(), "df_train still contains nulls"

# same treatment for the test set, filled with the training mean
print(df_test['Arrival Delay in Minutes'].isnull().values.any())
df_test['Arrival Delay in Minutes'] = (
    df_test['Arrival Delay in Minutes'].fillna(Arrival_Delay_in_Minutes_avg)
)
df_test.isnull().values.any()


Arrival Delay in Minutes
True


False

In [91]:
# encoding categorical features and target feature

cat_features = ['Gender', 'Customer Type', 'Type of Travel', 'Class']

# Fit the ordinal encoder on the training categoricals.
# The encoded array is wrapped in a DataFrame that carries df_train's own
# index: column assignment from a DataFrame aligns on index labels, so a
# default 0..n-1 index would only line up if the CSV index happens to be
# exactly 0..n-1 (otherwise rows silently become NaN).
oe = OrdinalEncoder()
df_cat = pd.DataFrame(
    oe.fit_transform(df_train[cat_features]),
    columns=cat_features,
    index=df_train.index,
)
df_train[cat_features] = df_cat[cat_features]

# Encode the target labels as integers; the fitted encoder is reused below.
le = LabelEncoder()
df_train['satisfaction'] = le.fit_transform(df_train['satisfaction'])

# Apply the encoders fitted on the training data to the test set
# (transform only -- no re-fitting on test data).
df_cat = pd.DataFrame(
    oe.transform(df_test[cat_features]),
    columns=cat_features,
    index=df_test.index,
)
df_test[cat_features] = df_cat[cat_features]
df_test['satisfaction'] = le.transform(df_test['satisfaction'])
df_test

Unnamed: 0,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,0.0,0.0,52,0.0,1.0,160,5,4,3,4,...,5,5,5,5,2,5,5,50,44.0,1
1,0.0,0.0,36,0.0,0.0,2863,1,1,3,1,...,4,4,4,4,3,4,5,0,0.0,1
2,1.0,1.0,20,0.0,1.0,192,2,0,2,4,...,2,4,1,3,2,2,2,0,0.0,0
3,1.0,0.0,44,0.0,0.0,3377,0,0,0,2,...,1,1,1,1,3,1,4,0,6.0,1
4,0.0,0.0,49,0.0,1.0,1182,2,3,4,3,...,2,2,2,2,4,2,4,0,20.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25971,1.0,1.0,34,0.0,0.0,526,3,3,3,1,...,4,3,2,4,4,5,4,0,0.0,0
25972,1.0,0.0,23,0.0,0.0,646,4,4,4,4,...,4,4,5,5,5,5,4,0,0.0,1
25973,0.0,0.0,17,1.0,1.0,828,2,5,1,5,...,2,4,3,4,5,4,2,0,0.0,0
25974,1.0,0.0,14,0.0,0.0,1127,3,3,3,3,...,4,3,2,5,4,5,4,0,0.0,1


Modeling

In [92]:
from sklearn.tree import DecisionTreeClassifier
from SoftDT import SoftDT
from sklearn.metrics import accuracy_score

# Split each frame into its feature matrix and the encoded target column.
X_train, y_train = df_train.drop(columns=['satisfaction']), df_train['satisfaction']
X_test, y_test = df_test.drop(columns=['satisfaction']), df_test['satisfaction']

# Baseline: scikit-learn decision tree with a fixed seed.
clf = DecisionTreeClassifier(random_state=0)
clf.fit(X_train, y_train)
clf_pred = clf.predict(X_test)

# Soft decision tree from the project-local SoftDT module.
clf_soft = SoftDT()
clf_soft.fit(X_train,y_train)
clf_soft_pred = clf_soft.predict(X_test)
# reshape() returns a new array rather than modifying in place, so the
# result has to be kept. Flatten everything after the sample axis and take
# the arg-max per row; this is the same index the per-row np.argmax loop
# produced, but vectorised.
# NOTE(review): assumes predict() returns an array-like whose first axis is
# the sample axis (the previous code indexed shape[2], suggesting a 3-D
# output) -- confirm against SoftDT.
clf_soft_pred = np.asarray(clf_soft_pred)
clf_soft_pred = clf_soft_pred.reshape(clf_soft_pred.shape[0], -1).argmax(axis=1)

print("clf:", accuracy_score(y_test, clf_pred))
print("soft clf:", accuracy_score(y_test, clf_soft_pred))

clf: 0.9479134585771481
soft clf: 0.81967970434247
