In [None]:
# import all required packages
import time
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
plt.style.use('ggplot')

import seaborn as sns
import statsmodels.api as sm

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

import sklearn.metrics
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

In [None]:
# used for scaling non-ordinal data within the range of 1 - 5 (the defaults)
def normalize(x, low=1, high=5):
    """Min-max scale ``x`` into the interval ``[low, high]``.

    Parameters
    ----------
    x : pandas.Series or pandas.DataFrame
        Numeric data. For a DataFrame, ``min``/``max`` reduce per column, so
        every column is scaled independently.
    low, high : number, optional
        Bounds of the target interval. The defaults (1 and 5) reproduce the
        range this helper was originally hard-coded to.

    Returns
    -------
    Same container type as ``x``
        ``(x - min) / (max - min) * (high - low) + low``. A column whose
        values are all equal (max == min) maps to ``low`` instead of NaN.
    """
    lowest = x.min()
    spread = x.max() - lowest
    # A constant column has zero spread and would evaluate 0/0 -> NaN.
    # Its numerator is already all zeros, so dividing by 1 instead simply
    # places every value at `low`.
    if isinstance(spread, pd.Series):
        spread = spread.where(spread != 0, 1)
    elif spread == 0:
        spread = 1
    return (x - lowest) / spread * (high - low) + low

In [None]:
# Read both input tables from the working directory, in the same order as before:
# 'DataFrame' -> df, 'clean_data' -> X.
df, X = (pd.read_csv(path) for path in ('DataFrame', 'clean_data'))

In [None]:
# Discard the 'Unnamed: 0' column, then recode Class: 0 -> 1, 1 -> 2, anything else -> 3.
X = X.drop(columns=['Unnamed: 0'])
X['Class'] = X['Class'].map(lambda code: {0: 1, 1: 2}.get(code, 3))

In [None]:
# Despite its name, `features` holds the target column: later cells use it as the
# dependent variable of the logit fit and as the source of the one-hot labels.
features = pd.Series(X['satisfaction'])

In [None]:
# Predictors: everything except the target, with three categorical columns one-hot
# encoded and the Age / delay / distance columns rescaled by normalize().
scaled_cols = ['Age', 'Departure Delay in Minutes', 'Flight Distance']
x_var = X.drop(columns='satisfaction')
x_wd = pd.get_dummies(x_var, columns=['Customer Type', 'Type of Travel', 'Gender'])
x_wd[scaled_cols] = normalize(x_wd[scaled_cols])

In [None]:
# node2vec train data
# Space-separated file with no header row; column 0 becomes the index, rows are
# ordered by that index, and the result is converted to a plain NumPy array.
t = (
    pd.read_csv('embeddings', sep=" ", index_col=0, header=None)
    .sort_index()
    .to_numpy()
)
t.shape

In [None]:
# Wrap the training embedding array in a frame with named columns x1 / x2.
Z = pd.DataFrame(data=t, columns=['x1', 'x2'])

In [None]:
# node2vec train
# Training design matrix: a copy of the encoded predictors with the two embedding
# columns appended as z1 / z2 (matched to rows by index label), plus one-hot labels.
XXXX = x_wd.assign(z1=Z['x1'], z2=Z['x2'])
XXXX_label = pd.get_dummies(features)
XXXX

In [None]:
# node2vec test
# Same layout as the training embeddings: space-separated, no header, column 0 is
# the index; rows are ordered by it before conversion to a NumPy array.
t_test = (
    pd.read_csv('embeddings_test', sep=" ", index_col=0, header=None)
    .sort_index()
    .to_numpy()
)
t_test.shape

In [None]:
# Wrap the test embedding array in a frame with named columns x1 / x2.
Z_test = pd.DataFrame(data=t_test, columns=['x1', 'x2'])

In [None]:
# Load the separate test table and encode it like the training predictors.
n2v_test_df = pd.read_csv('TEST_DF').drop(columns=['Unnamed: 0'])
n2v_test_label = n2v_test_df.satisfaction
n2v_test_feats = n2v_test_df.drop(columns=['satisfaction'])
n2v_test_feats_dummy = pd.get_dummies(data=n2v_test_feats,columns=['Customer Type','Type of Travel','Gender'])
# NOTE(review): the training frame recodes Class (0 -> 1, 1 -> 2, else 3) before
# use; nothing equivalent happens here — confirm TEST_DF already stores that coding.

# Rescale with the TRAINING min/max (from the unscaled x_var), not the test set's
# own: the models were fitted on features scaled by the training statistics, so
# using different statistics here would put train and test on different scales.
# Test values outside the training range will fall outside [1, 5]; that is expected.
n2v_scale_cols = ['Age','Departure Delay in Minutes','Flight Distance']
n2v_train_min = x_var[n2v_scale_cols].min()
n2v_train_span = x_var[n2v_scale_cols].max() - n2v_train_min
n2v_train_span = n2v_train_span.where(n2v_train_span != 0, 1)  # avoid 0/0 on a constant column
n2v_test_feats_dummy[n2v_scale_cols] = (
    (n2v_test_feats_dummy[n2v_scale_cols] - n2v_train_min) / n2v_train_span * (5 - 1) + 1
)

In [None]:
# One-hot test labels, forced onto the training label columns so a class that is
# absent from the test set still gets its (all-zero) column.
TTTTTT_label = pd.get_dummies(n2v_test_label).reindex(columns=XXXX_label.columns, fill_value=0)
# Cast before conversion: get_dummies can return bool columns, which TensorFlow
# would not turn into a float tensor.
TTTTTT_label_tf = tf.convert_to_tensor(TTTTTT_label.astype('float32'))

# Test design matrix: encoded predictors plus the two test embedding columns.
TTTTTT = n2v_test_feats_dummy.copy(deep=True)
TTTTTT['z1'] = Z_test.x1
TTTTTT['z2'] = Z_test.x2
# get_dummies only creates columns for categories present in the data it sees, so
# the test frame can differ from the training frame in column set or order.
# Reindexing onto the training columns keeps every fitted model's inputs aligned;
# it is a no-op when the columns already match.
TTTTTT = TTTTTT.reindex(columns=XXXX.columns, fill_value=0)
# A frame mixing bool and float columns converts to an object array, which
# tf.convert_to_tensor rejects; an explicit float32 cast avoids that.
TTTTTT_tf = tf.convert_to_tensor(TTTTTT.astype('float32'))
TTTTTT

In [None]:
# Baseline: statsmodels logit of the target on the embedding array with a constant
# column added for the intercept.
P = sm.add_constant(t)
log_reg = sm.Logit(endog=features, exog=P).fit()
print(log_reg.summary())

In [None]:
# Predicted probabilities for the same design matrix the logit model was fitted on.
predictions = log_reg.predict(P)

# Threshold at 0.5 in one vectorised step (< 0.5 -> 0, otherwise 1), keeping the
# (n, 1) float column shape the loop-based version produced.
yhat_log = np.where(np.asarray(predictions) < 0.5, 0.0, 1.0).reshape(-1, 1)

# These scores are in-sample (computed on the fitting data), so the labels say
# "training set" rather than "validation set".
print('Classification Accuracy training set {}'.format(accuracy_score(features,yhat_log)))
print('-------------------------------------------------------------')
print('Confusion Matrix training set:')
print(confusion_matrix(features,yhat_log))

In [None]:
# 80/20 shuffled split of the training design matrix and its one-hot labels.
# The fixed seed (0, matching the estimators further down) makes the split — and
# every score derived from it — identical across Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(
    XXXX, XXXX_label, test_size=0.2, shuffle=True, random_state=0
)

In [None]:
# Cast to float32 before conversion: get_dummies can return bool columns, and a
# frame mixing bool with float columns becomes an object array that
# tf.convert_to_tensor rejects.
x_train_tf = tf.convert_to_tensor(x_train.astype('float32'))
x_test_tf = tf.convert_to_tensor(x_test.astype('float32'))
y_train_tf = tf.convert_to_tensor(y_train.astype('float32'))
y_test_tf = tf.convert_to_tensor(y_test.astype('float32'))

# One hidden ReLU layer (15 units) feeding a 2-unit softmax over the one-hot labels.
model = Sequential()
model.add(Dense(15, activation='relu'))
model.add(Dense(2, activation='softmax'))

# A softmax output with one-hot targets pairs with categorical cross-entropy;
# 'accuracy' then resolves to categorical (argmax) accuracy.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model.fit(x_train_tf, y_train_tf,validation_data=(x_test_tf,y_test_tf), epochs=200, batch_size=10)

In [None]:
# Training curves, loss on the left and accuracy on the right.
# Both panels are created in a single subplots() call. Calling
# plt.subplots(figsize=...) first and then plt.subplot(121/122) creates an extra
# full-size Axes that the two panels are drawn over; recent Matplotlib releases
# appear to leave that Axes in place rather than removing it.
fig, (ax_loss, ax_acc) = plt.subplots(1, 2, figsize=(15, 5))

ax_loss.plot(history.history['loss'], label='Training Loss')
ax_loss.plot(history.history['val_loss'], label='Validation Loss')
ax_loss.set_xlabel('Number of Epochs')
ax_loss.set_ylabel('Cross-Entropy Loss')
ax_loss.set_title('Loss per Epoch')
ax_loss.legend()

ax_acc.plot(history.history['accuracy'], label='Training accuracy')
ax_acc.plot(history.history['val_accuracy'], label='Validation accuracy')
ax_acc.set_xlabel('Number of Epochs')
ax_acc.set_ylabel('Accuracy')
ax_acc.set_title('Accuracy per Epoch')
ax_acc.legend()

plt.show()

In [None]:
# Score the fitted network on the separate node2vec test tensors; the returned
# values are printed as-is under the "test loss, test acc" label.
results = model.evaluate(x=TTTTTT_tf, y=TTTTTT_label_tf)
print("test loss, test acc:", results)

In [None]:
# Recode the target to {-1, +1}: +1 where satisfaction == 1, -1 otherwise.
# Built in one vectorised step as a 1-D integer array. The loop-based version
# produced a float (n, 1) column vector, and its `svm_y.astype(int)` call
# discarded the result, so the intended integer cast never happened.
svm_y = np.where(features.to_numpy() == 1, 1, -1)

# Seeded 80/20 split so the scores below are reproducible across re-runs.
x_train, x_test, y_train, y_test = train_test_split(
    XXXX, svm_y, test_size=0.2, shuffle=True, random_state=0
)

# RBF-kernel SVM; print accuracy on the fitting split, then on the held-out split.
clf = svm.SVC(kernel='rbf')
clf.fit(x_train, y_train)
print(clf.score(x_train, y_train))
print(clf.score(x_test, y_test))

In [None]:
# Test labels in the same {-1, +1} coding as the training target, as a 1-D
# integer array (vectorised replacement for the element-by-element loop).
svm_y_test = np.where(n2v_test_label.to_numpy() == 1, 1, -1)

# Accuracy and micro-averaged F1 of the current `clf` on the node2vec test set.
y_pred = clf.predict(TTTTTT)
print(clf.score(TTTTTT, svm_y_test))
# scikit-learn metrics take (y_true, y_pred) in that order.
print(sklearn.metrics.f1_score(svm_y_test, y_pred, average='micro'))

In [None]:
# Random forest on the same design matrix and {-1, +1} target.
clf = RandomForestClassifier(max_depth=None, random_state=0)

# Seeded split: the estimator was already seeded, but an unseeded split still
# made the reported scores change from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    XXXX, svm_y, test_size=0.2, shuffle=True, random_state=0
)
# Estimators expect a 1-D target; flattening is a no-op if it already is one.
y_train, y_test = np.ravel(y_train), np.ravel(y_test)

clf.fit(x_train, y_train)
print(clf.score(x_train, y_train))
print(clf.score(x_test, y_test))

In [None]:
# Accuracy and binary F1 of the current `clf` on the node2vec test set.
y_pred = clf.predict(TTTTTT)
print(clf.score(TTTTTT, svm_y_test))
# Metrics take (y_true, y_pred); the true labels are flattened in case they are
# stored as a column vector.
print(sklearn.metrics.f1_score(np.ravel(svm_y_test), y_pred))

In [None]:
# Logistic regression fitted on whatever train split the preceding cell left in
# x_train / y_train; the target is flattened because estimators expect 1-D y.
clf = LogisticRegression(random_state=0,max_iter=500).fit(x_train, np.ravel(y_train))
y_pred = clf.predict(TTTTTT)
# Accuracy on the train split, the held-out split, and the node2vec test set.
print(clf.score(x_train, y_train))
print(clf.score(x_test, y_test))
print(clf.score(TTTTTT, svm_y_test))
# Metrics take (y_true, y_pred) in that order.
print(sklearn.metrics.f1_score(np.ravel(svm_y_test), y_pred))

In [None]:
# Scatter of the two training embedding columns, coloured by df's satisfaction column.
fig, ax = plt.subplots(figsize=(15, 15))
sns.scatterplot(x=Z['x1'], y=Z['x2'], hue=df['satisfaction'], alpha=0.8, ax=ax)
plt.show()