In [None]:
# Required imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings 
warnings.filterwarnings( "ignore")

# Preprocessing Required imports
from scipy.stats import zscore
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# Model Imports
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier


# Accuracy and Performance Metric Imports
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score,roc_curve,classification_report,confusion_matrix,plot_confusion_matrix

# Read and Preliminary Checks

In [None]:
# Load the raw dataset from `data.csv` (path is relative to the working directory).
depression_dataset = pd.read_csv('data.csv')

In [None]:
# Column names, dtypes and non-null counts of the loaded frame.
depression_dataset.info()

In [None]:
# Inspect the last 18 rows.
# NOTE(review): presumably these are the rows removed by position further down — confirm.
depression_dataset.tail(18)

In [None]:
# Integer row positions 268..285 (18 rows) that are dropped from the dataset.
positions = list(range(268, 286))

In [None]:
# Remove the rows at the integer positions listed in `positions`, in place.
# The drop is positional and mutates the frame, so this cell is not safe to
# re-run: a second run addresses positions in the already-shortened frame.
depression_dataset.drop(depression_dataset.index[positions], inplace=True)

# Null Value Check

In [None]:
# Count of missing values per column before any treatment.
depression_dataset.isnull().sum()

# Null Value Treatment

In [None]:
# Mean-impute every column whose dtype is not `object`; object columns are
# left untouched by this cell.
non_object_columns = [
    name
    for name in depression_dataset.columns
    if depression_dataset[name].dtype != 'object'
]
for name in non_object_columns:
    column_mean = depression_dataset[name].mean()
    depression_dataset[name] = depression_dataset[name].fillna(column_mean)

In [None]:
# Re-count missing values after imputation (object columns were not imputed).
depression_dataset.isnull().sum()

# Data Encoding

In [None]:
# For each object-typed column, print its name, the number of distinct values
# and the value frequencies in ascending order.
object_columns = [
    name
    for name in depression_dataset.columns
    if depression_dataset[name].dtype == 'object'
]
for name in object_columns:
    values = depression_dataset[name]
    print(name.upper(), ': ', values.nunique())
    print(values.value_counts().sort_values())
    print('\n')

In [None]:
# List the column labels that the encoding cells below refer to by name.
depression_dataset.columns

In [None]:
# Replace category labels with integer codes, one column at a time.
# Labels that are not listed are left unchanged. The replacement goes through
# np.where one label at a time, in the order given here.
# NOTE(review): 'Sev' (3) is coded below 'ModSev' (4) for 'DepSev' — if the
# codes are meant to be ordinal, confirm the intended order.
nominal_codes = {
    'inter_dom': {'Dom': 0, 'Inter': 1},
    'Region': {'SA': 0, 'EA': 1, 'SEA': 2, 'JAP': 3, 'Others': 4},
    'Gender': {'Female': 0, 'Male': 1},
    'Academic': {'Under': 0, 'Grad': 1},
    'Stay_Cate': {'Short': 0, 'Medium': 1, 'Long': 2},
    'DepType': {'No': 0, 'Major': 1, 'Other': 2},
    'DepSev': {'Min': 0, 'Mild': 1, 'Mod': 2, 'Sev': 3, 'ModSev': 4},
}

for column_name, label_codes in nominal_codes.items():
    for label, code in label_codes.items():
        current_values = depression_dataset[column_name]
        depression_dataset[column_name] = np.where(current_values == label, code, current_values)

In [None]:
# Encode the two language-proficiency columns with the same Low/Average/High
# coding; any other value is left as it is.
language_level_codes = (('Low', 0), ('Average', 1), ('High', 2))

for column_name in ('Japanese_cate', 'English_cate'):
    for label, code in language_level_codes:
        current_values = depression_dataset[column_name]
        depression_dataset[column_name] = np.where(current_values == label, code, current_values)

In [None]:
# Encode every Yes/No column with the same coding. Note the direction:
# 'Yes' becomes 0 and 'No' becomes 1. Any other value is left as it is.
yes_no_columns = [
    'Intimate', 'Religion', 'Suicide', 'Dep',
    'Partner_bi', 'Friends_bi', 'Parents_bi', 'Relative_bi',
    'Professional_bi', 'Phone_bi', 'Doctor_bi', 'religion_bi',
    'Alone_bi', 'Others_bi', 'Internet_bi',
]

for column_name in yes_no_columns:
    for answer, code in (('Yes', 0), ('No', 1)):
        current_values = depression_dataset[column_name]
        depression_dataset[column_name] = np.where(current_values == answer, code, current_values)

In [None]:
# Post-encoding check: summarise every column that is still object-typed so
# that any label missed by the encoding cells shows up in the value counts.
remaining_object_columns = [
    name
    for name in depression_dataset.columns
    if depression_dataset[name].dtype == 'object'
]
for name in remaining_object_columns:
    values = depression_dataset[name]
    print(name.upper(), ': ', values.nunique())
    print(values.value_counts().sort_values())
    print('\n')

# Dropping Unnecessary Columns

In [None]:
#cat_new = ['']
#train_data.drop(columns=cat_new,axis=1,inplace=True)

In [None]:
# Summary statistics of the frame after encoding.
depression_dataset.describe()

In [None]:
# Remember the column labels (target 'DepSev' included) so they can be
# re-attached after scaling, which returns a bare array.
col_list = depression_dataset.columns

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the predictors only. The target ('DepSev') keeps its integer
# class codes: z-scoring it would turn the labels into continuous floats,
# which scikit-learn classifiers reject ("Unknown label type: 'continuous'").
target_column = 'DepSev'
feature_columns = [name for name in col_list if name != target_column]

# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics influence the training features — consider fitting
# on the training rows only.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(depression_dataset[feature_columns])

# Same layout as before: default RangeIndex and the original column order,
# with the unscaled target placed back at its original position.
depression_dataset_scaled = pd.DataFrame(scaled_features, columns=feature_columns)
depression_dataset_scaled[target_column] = (
    depression_dataset[target_column].to_numpy().astype(int)
)
depression_dataset_scaled = depression_dataset_scaled[list(col_list)]

# Building training and test sets

In [None]:
# Separate predictors from the target without mutating the scaled frame.
# (`pop` removed 'DepSev' in place, so re-running the cell raised KeyError.)
# NOTE(review): 'Dep' and 'DepType' stay in X; presumably they are derived
# from the same assessment as 'DepSev' — confirm they are valid predictors.
X = depression_dataset_scaled.drop('DepSev', axis=1)
y = depression_dataset_scaled['DepSev']

In [None]:
# Hold out 30% of the rows for testing; random_state=1 keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3 , random_state=1)

# Modeling

## Logistic Regression

In [None]:
# Hyper-parameter grid for logistic regression. `n_jobs` only controls
# parallelism and never changes the fitted model, so it is not searched:
# including it multiplied the number of fits by four for identical results.
param_grid = {
    'solver': ['newton-cg', 'lbfgs', 'saga'],
    'max_iter': [40, 50, 60, 70, 80],
}
LogReg = LogisticRegression()
# NOTE(review): the following cell rebinds `log_reg_model` to a plain
# LogisticRegression, so this search object is never fitted.
log_reg_model = GridSearchCV(estimator=LogReg, param_grid=param_grid, cv=3)

In [None]:
# Rebind `log_reg_model` to a default LogisticRegression; this replaces the
# GridSearchCV object created in the previous cell.
log_reg_model = LogisticRegression()

In [None]:
# Fit on the training split only.
log_reg_model.fit(X_train, y_train)

In [None]:
#log_reg_model.best_params_

In [None]:
# Predicted class labels for the training and held-out splits.
ytrain_logreg_predict = log_reg_model.predict(X_train)
ytest_logreg_predict = log_reg_model.predict(X_test)

In [None]:
# Class-membership probabilities for both splits (one column per class).
ytrain_logreg_prob_vals = log_reg_model.predict_proba(X_train)
ytest_logreg_prob_vals = log_reg_model.predict_proba(X_test)
# Keep only the probability column at index 1.
# NOTE(review): 'DepSev' was encoded with five codes, so index 1 is a single
# class rather than "the positive class" — confirm this is the intended score.
prob_logreg_train = ytrain_logreg_prob_vals[:, 1]
prob_logreg_test = ytest_logreg_prob_vals[:, 1]

In [None]:
# Score the held-out split. The previously referenced `test_data_scaled` is
# not created anywhere in this notebook, so the cell raised NameError on a
# fresh kernel.
logreg_predictions = log_reg_model.predict(X_test)

In [None]:
# Save the held-out predictions, keyed by the row labels of the test split.
# `test_data` is not defined in this notebook, so the split's own index is
# used as the identifier.
# NOTE(review): the 'Overall_Experience' header is kept for compatibility but
# the predicted quantity is 'DepSev' — confirm the expected column name.
output = pd.DataFrame({'ID': X_test.index, 'Overall_Experience': ytest_logreg_predict})
output.to_csv('02. submission_logReg.csv', index=False)
print("Your submission was successfully saved!")

In [None]:
# Pair every predictor with its fitted weights. `coef_` holds one row per
# class (a single row for a binary target), so its transpose yields one weight
# vector per feature. The earlier version wrapped both sides in one-element
# lists (printing everything in a single line) and read the undefined
# `train_data`.
cols = X_train.columns
coeff = log_reg_model.coef_.T
for name, weights in zip(cols, coeff):
    print('The coefficient of', name, 'is', weights)

## Linear Discriminant Analysis

In [None]:
# Fit on the training split only, like the other models in this notebook.
# Fitting on the full X/y let the model see the held-out rows, which made the
# later predictions on X_test an in-sample evaluation.
LiDiA = LinearDiscriminantAnalysis()
LDA_model = LiDiA.fit(X_train, y_train)
LDA_model

In [None]:
# Predicted labels for all rows (`pred_class`) and for each split separately.
pred_class = LDA_model.predict(X)
ytrain_lidia_predict = LDA_model.predict(X_train)
ytest_lidia_predict = LDA_model.predict(X_test)

In [None]:
# Class-membership probabilities for both splits (one column per class).
ytrain_lidia_prob_vals = LDA_model.predict_proba(X_train)
ytest_lidia_prob_vals = LDA_model.predict_proba(X_test)
# Keep only the probability column at index 1.
# NOTE(review): with a multi-class target this is a single class's
# probability — confirm this is the intended score.
prob_lidia_train = ytrain_lidia_prob_vals[:, 1]
prob_lidia_test = ytest_lidia_prob_vals[:, 1]

In [None]:
# Save the held-out predictions, keyed by the row labels of the test split.
# `test_data` is not defined in this notebook, and `pred_class` covers every
# row of X, so the test-split predictions and index are used instead.
# NOTE(review): the 'Overall_Experience' header is kept for compatibility but
# the predicted quantity is 'DepSev' — confirm the expected column name.
output = pd.DataFrame({'ID': X_test.index, 'Overall_Experience': ytest_lidia_predict})
output.to_csv('03. submission_LiDiA.csv', index=False)
print("Your submission was successfully saved!")

## KNN Model 

In [None]:
# param_grid = {
#     'algorithm':['ball_tree', 'kd_tree'],
#     'weights':['uniform', 'distance'],
#     'n_neighbors':[5,10,15],
#     'leaf_size':[30,45,60]
    
# }

# KNNModel=KNeighborsClassifier()

# KNN_model = GridSearchCV(estimator = KNNModel, param_grid = param_grid, cv = 3)

In [None]:
# KNN_model.best_params_

In [None]:
# k-nearest-neighbours classifier with library defaults (the grid search above is commented out).
KNN_model = KNeighborsClassifier()

In [None]:
# Fit on the training split only.
KNN_model.fit(X_train,y_train)

In [None]:
# Predicted class labels for the training and held-out splits.
y_train_KNN_predict = KNN_model.predict(X_train)
y_test_KNN_predict = KNN_model.predict(X_test)

In [None]:
# Class-membership probabilities for both splits (one column per class).
ytrain_KNN_prob_vals = KNN_model.predict_proba(X_train)
ytest_KNN_prob_vals = KNN_model.predict_proba(X_test)
# Keep only the probability column at index 1.
# NOTE(review): with a multi-class target this is a single class's
# probability — confirm this is the intended score.
prob_KNN_train = ytrain_KNN_prob_vals[:, 1]
prob_KNN_test = ytest_KNN_prob_vals[:, 1]

In [None]:
# Score the held-out split. The previously referenced `test_data_scaled` is
# not created anywhere in this notebook, so the cell raised NameError on a
# fresh kernel.
KNN_pred = KNN_model.predict(X_test)

In [None]:
# Save the held-out predictions, keyed by the row labels of the test split.
# `test_data` is not defined in this notebook, so the split's own index is
# used as the identifier.
# NOTE(review): the 'Overall_Experience' header is kept for compatibility but
# the predicted quantity is 'DepSev' — confirm the expected column name.
output = pd.DataFrame({'ID': X_test.index, 'Overall_Experience': y_test_KNN_predict})
output.to_csv('04. submission_KNN.csv', index=False)
print("Your submission was successfully saved!")

In [None]:
# cols = train_data_scaled.columns
# coeff = list(KNN_model.coef_)
# for name,i in zip(cols, iter(coeff)):
#     print('The coefficient of',name,'is',i.round(3)) 

## Naïve Bayes Model

In [None]:
# Fit on the training split only, like the other models in this notebook.
# Fitting on the full X/y let the model see the held-out rows, which made the
# later predictions on X_test an in-sample evaluation.
NB_model = GaussianNB()
NB_model.fit(X_train, y_train)

In [None]:
# Score the held-out split. The previously referenced `test_data_scaled` is
# not created anywhere in this notebook, so the cell raised NameError on a
# fresh kernel.
NB_pred = NB_model.predict(X_test)

In [None]:
# Predicted class labels for the training and held-out splits.
y_train_NB_predict = NB_model.predict(X_train)
y_test_NB_predict = NB_model.predict(X_test)

In [None]:
# Class-membership probabilities for both splits (one column per class).
ytrain_NB_prob_vals = NB_model.predict_proba(X_train)
ytest_NB_prob_vals = NB_model.predict_proba(X_test)
# Keep only the probability column at index 1.
# NOTE(review): with a multi-class target this is a single class's
# probability — confirm this is the intended score.
prob_NB_train = ytrain_NB_prob_vals[:, 1]
prob_NB_test = ytest_NB_prob_vals[:, 1]

In [None]:
# Save the held-out predictions, keyed by the row labels of the test split.
# `test_data` is not defined in this notebook, so the split's own index is
# used as the identifier.
# NOTE(review): the 'Overall_Experience' header is kept for compatibility but
# the predicted quantity is 'DepSev' — confirm the expected column name.
output = pd.DataFrame({'ID': X_test.index, 'Overall_Experience': y_test_NB_predict})
output.to_csv('05. submission_NB.csv', index=False)
print("Your submission was successfully saved!")

## Bagging using RandomForest

In [None]:
# Initial, wider random-forest search grid, kept for reference.
# Both `param_grid` and `rfCl` are reassigned by the next cell before either
# is used, so this cell has no effect on the fitted model.

param_grid = {
   'max_depth': [7,8,9, 10, 11,12,13],
   'max_features': [5, 6, 7, 8, 9],
  'min_samples_leaf': [10, 15, 20, 25, 30],
  'min_samples_split': [30, 45, 60, 75, 90],
   'n_estimators': [71, 81, 91, 101],

}

rfCl = RandomForestClassifier()

In [None]:
# Narrowed random-forest search grid: 4 * 4 * 4 * 5 * 3 = 960 candidates,
# each evaluated with 3-fold cross-validation.
param_grid = {
    'max_depth': [7, 8, 9, 10],
    'max_features': [4, 5, 6, 7],
    'min_samples_leaf': [5, 10, 15, 20],
    'min_samples_split': [10, 20, 30, 40, 50],
    'n_estimators': [91, 101, 111],
}

# Seed the forest so the search (and the selected parameters) are repeatable;
# random_state=1 matches the seed used for the split and the bagging model.
rfCl = RandomForestClassifier(random_state=1)

RFCl_model = GridSearchCV(estimator=rfCl, param_grid=param_grid, cv=3)

In [None]:
# RFCl_model = RandomForestClassifier()

In [None]:
# Run the cross-validated grid search on the training split.
RFCl_model.fit(X_train, y_train)

In [None]:
# Keep the refitted best forest for the bagging step, then show the selected
# parameters. `best_params_` must be the last expression in the cell to be
# displayed; on the first line its value was silently discarded.
best_grid_rfc = RFCl_model.best_estimator_
RFCl_model.best_params_

In [None]:
# Bag the tuned random forest. The wrapped estimator is passed positionally:
# newer scikit-learn releases renamed the `base_estimator` keyword to
# `estimator`, while the first positional argument is the wrapped estimator
# under both names.
Bagging_model = BaggingClassifier(best_grid_rfc, random_state=1)
Bagging_model.fit(X_train, y_train)

In [None]:
# Predicted class labels for the training and held-out splits.
y_train_Bagging_predict = Bagging_model.predict(X_train)
y_test_Bagging_predict = Bagging_model.predict(X_test)

In [None]:
# Class-membership probabilities for both splits (one column per class).
ytrain_Bagging_prob_vals = Bagging_model.predict_proba(X_train)
ytest_Bagging_prob_vals = Bagging_model.predict_proba(X_test)
# Keep only the probability column at index 1.
# NOTE(review): with a multi-class target this is a single class's
# probability — confirm this is the intended score.
prob_Bagging_train = ytrain_Bagging_prob_vals[:, 1]
prob_Bagging_test = ytest_Bagging_prob_vals[:, 1]

In [None]:
# Average the per-feature importances over the fitted members of the bagging
# ensemble (each member is a clone of the tuned random forest).
per_member_importances = [
    member.feature_importances_ for member in Bagging_model.estimators_
]
feature_importances = np.mean(per_member_importances, axis=0)

In [None]:
# Display the averaged importances (one value per predictor).
feature_importances

In [None]:
# Column labels, shown for reading the importances above.
# NOTE(review): `col_list` was captured before the target was split off, so it
# still contains 'DepSev' and has one more entry than `feature_importances`;
# `X_train.columns` lines up with the importances.
col_list

In [None]:
# Score the held-out split. The previously referenced `test_data_scaled` is
# not created anywhere in this notebook, so the cell raised NameError on a
# fresh kernel.
Bagging_pred = Bagging_model.predict(X_test)

In [None]:
# Save the held-out predictions, keyed by the row labels of the test split.
# `test_data` is not defined in this notebook, so the split's own index is
# used as the identifier.
# NOTE(review): the 'Overall_Experience' header is kept for compatibility but
# the predicted quantity is 'DepSev' — confirm the expected column name.
output = pd.DataFrame({'ID': X_test.index, 'Overall_Experience': y_test_Bagging_predict})
output.to_csv('06. submission_Bagging.csv', index=False)
print("Your submission was successfully saved!")

## Ada Boosting

In [None]:
# AdaBoost search grid: number of boosting rounds and boosting algorithm.
param_grid = {
    'n_estimators': [50, 60, 70, 80, 90, 100],
    'algorithm': ['SAMME', 'SAMME.R'],
}

# Seed the booster so the search is repeatable; random_state=1 matches the
# seed used for the train/test split and the bagging model.
AdaB = AdaBoostClassifier(random_state=1)

AdaB_model = GridSearchCV(estimator=AdaB, param_grid=param_grid, cv=3)

In [None]:
# Run the cross-validated grid search on the training split.
# Earlier fixed-parameter alternative, kept for reference:
#AdaB_model = AdaBoostClassifier(n_estimators=100,random_state=1)
AdaB_model.fit(X_train,y_train)

In [None]:
# Parameter combination selected by the grid search.
AdaB_model.best_params_

In [None]:
# Predicted class labels for the training and held-out splits.
y_train_AdaB_predict = AdaB_model.predict(X_train)
y_test_AdaB_predict = AdaB_model.predict(X_test)

In [None]:
# Class-membership probabilities for both splits (one column per class).
ytrain_AdaB_prob_vals = AdaB_model.predict_proba(X_train)
ytest_AdaB_prob_vals = AdaB_model.predict_proba(X_test)
# Keep only the probability column at index 1.
# NOTE(review): with a multi-class target this is a single class's
# probability — confirm this is the intended score.
prob_AdaB_train = ytrain_AdaB_prob_vals[:, 1]
prob_AdaB_test = ytest_AdaB_prob_vals[:, 1]

In [None]:
# Score the held-out split. The previously referenced `test_data_scaled` is
# not created anywhere in this notebook, so the cell raised NameError on a
# fresh kernel.
Ada_predict = AdaB_model.predict(X_test)

In [None]:
# Save the held-out predictions, keyed by the row labels of the test split.
# `test_data` is not defined in this notebook, so the split's own index is
# used as the identifier.
# NOTE(review): the 'Overall_Experience' header is kept for compatibility but
# the predicted quantity is 'DepSev' — confirm the expected column name.
output = pd.DataFrame({'ID': X_test.index, 'Overall_Experience': y_test_AdaB_predict})
output.to_csv('07. submission_AdaB.csv', index=False)
print("Your submission was successfully saved!")

END