## Deep Neural Network

In [25]:
# Import Files
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, recall_score
from sklearn import metrics
from sklearn.model_selection import GridSearchCV


In [11]:
import pandas as pd

In [17]:
# Import pre-processed data
dftrain = pd.read_csv("train_final.csv")
dfvalid = pd.read_csv("validation_final.csv")
dftest = pd.read_csv("test_final.csv")

In [27]:
# Split into training and testing x,y
ytrain = dftrain['Overall_Experience']
Xtrain = dftrain.drop(columns = "Overall_Experience")
yvalid = dfvalid['Overall_Experience']
Xvalid = dfvalid.drop(columns = "Overall_Experience")
Xtrain.shape, ytrain.shape, Xvalid.shape, yvalid.shape

((75503, 23), (75503,), (18876, 23), (18876,))

In [29]:
# Since DNN function has validation_split option, concat train and validation sets
trainfinal = pd.concat([dftrain, dfvalid], axis = 0)
ytrain_final = trainfinal['Overall_Experience']
Xtrain_final = trainfinal.drop(columns = "Overall_Experience")

## Support Vector Machine

In [31]:
from sklearn.svm import SVC

In [35]:
# Initialize SVM classifier
clf = SVC(kernel='linear')

# Fit data
clf = clf.fit(Xtrain_final, ytrain_final)

In [36]:
# Model prediction
predictions = clf.predict(dftest)

In [None]:
#Hp Tuning
from sklearn.model_selection import GridSearchCV

param_grid = {'C': [0.1, 1, 10, 100, 1000], 
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ["linear", "poly", "sigmoid", "rbf"]}

grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3)

grid.fit(Xtrain_final, ytrain_final)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV 1/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.836 total time= 2.7min
[CV 2/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.834 total time= 2.7min
[CV 3/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.831 total time= 2.7min
[CV 4/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.836 total time= 2.6min
[CV 5/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.840 total time= 2.7min
[CV 1/5] END .......C=0.1, gamma=1, kernel=poly;, score=0.933 total time= 6.1min
[CV 2/5] END .......C=0.1, gamma=1, kernel=poly;, score=0.932 total time= 6.1min
[CV 3/5] END .......C=0.1, gamma=1, kernel=poly;, score=0.934 total time= 6.2min
[CV 4/5] END .......C=0.1, gamma=1, kernel=poly;, score=0.935 total time= 6.3min
[CV 5/5] END .......C=0.1, gamma=1, kernel=poly;, score=0.933 total time= 6.5min
[CV 1/5] END ....C=0.1, gamma=1, kernel=sigmoid;, score=0.547 total time=10.8min
[CV 2/5] END ....C=0.1, gamma=1, kernel=sigmoi

In [None]:
print(grid.best_params_)

In [None]:
print(grid.best_estimator_)

In [None]:
grid_predictions = grid.predict(dftest)

In [None]:
# Final Implementation
ID_data = pd.read_csv("Traveldata_test.csv")
submission_df = pd.DataFrame()
submission_df['ID'] = ID_data['ID']
submission_df['Overall_Experience'] = grid_predictions
submission_df['Overall_Experience'] = submission_df['Overall_Experience'].round(0).astype(int)
submission_df.to_csv("submission_2.csv", index=False)

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor,BaggingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer,mean_squared_error, r2_score, mean_absolute_error

In [None]:
# Hyperparameter tuning on multiple RF parameters
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

max_features = ['auto', 'sqrt']

max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

min_samples_split = [2, 5, 10]

min_samples_leaf = [1, 2, 4]

bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [None]:
rf = RandomForestRegressor()

rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(Xtrain_final, ytrain_final)

In [None]:
rf_random.best_params_

In [None]:
prediction_tuned = rf_random.predict()

In [None]:
# Check feature importance
importances = rf.feature_importances_

indices = np.argsort(importances)

plt.figure(figsize = (10, 10))

plt.title('Feature Importances')

plt.barh(range(len(indices)), importances[indices], color = 'violet', align = 'center')

plt.yticks(range(len(indices)), [features[i] for i in indices])

plt.xlabel('Relative Importance')

plt.show()

In [None]:
# Final Implementation
ID_data = pd.read_csv("Traveldata_test.csv")
submission_df = pd.DataFrame()
submission_df['ID'] = ID_data['ID']
submission_df['Overall_Experience'] = prediction_tuned
submission_df['Overall_Experience'] = submission_df['Overall_Experience'].round(0).astype(int)
submission_df.to_csv("submission_3.csv", index=False)

## Ensemble Learning

In [None]:
submission_RF = pd.read_csv("submission_data_RF.csv")
submission_SVM = pd.read_csv("submission_data_SVM.csv")

In [None]:
submission_RF.rename(columns={'Overall_Experience': 'RF_Model'}, inplace=True)
submission_SVM.rename(columns={'Overall_Experience': 'SVM_Model'}, inplace=True)

In [None]:
submission_RF.drop(columns = "ID", inplace = True)
submission_SVM.drop(columns = "ID", inplace = True)

In [None]:
comparison_table = pd.concat([submission_3, submission_2,], axis = 1)

In [None]:
comparison_table['Final_Pred'] = comparison_table.iloc[:, -5:].sum(axis=1)

In [None]:
comparison_table['Overall_Experience'] = comparison_table['Final_Pred'].apply(lambda x: 1 if x >= 2 else 0) 

In [None]:
comparison_table.drop(columns = ["RF_Model","SVM_Model"], inplace = True)