In [None]:
import kagglehub
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, roc_curve, roc_auc_score, silhouette_score
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import statsmodels.api as sm
import numpy as np
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.cluster import KMeans, DBSCAN


# Seed NumPy's global generator so the randomly blanked cells (and therefore
# every accuracy reported below) are reproducible on Restart & Run All.
np.random.seed(42)

dt = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/student_depression_dataset.csv')

display(dt.head())
print(dt.shape)

# '?' is used in the raw file as a placeholder for unknown values.
dt = dt.replace('?', np.nan)
# Drop rows without a 'Financial Stress' value and renumber the remaining rows
# 0..n-1 so that index labels match row positions; without the reset the
# dropped labels leave gaps in the index.
dt = dt.dropna(subset=['Financial Stress']).reset_index(drop=True)
dt['Financial Stress'] = dt['Financial Stress'].astype(float)
# Number of cells blanked per column in each experiment (70% of the rows).
numberOfValuesToRemove = int(np.round(dt.shape[0] * 0.7))

# Per-technique accuracy accumulators for the continuous-column experiments.
averageAccuraciesForMean = []
averageAccuraciesForMedian = []
averageAccuraciesForDeletion = []
averageAccuraciesForGauss = []

# Per-technique accuracy accumulators for the categorical-column experiments.
averageAccuraciesForBFill = []
averageAccuraciesForFFill = []
averageAccuraciesForMostCommon = []

Unnamed: 0,id,Gender,Age,City,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,2,Male,33.0,Visakhapatnam,Student,5.0,0.0,8.97,2.0,0.0,'5-6 hours',Healthy,B.Pharm,Yes,3.0,1.0,No,1
1,8,Female,24.0,Bangalore,Student,2.0,0.0,5.9,5.0,0.0,'5-6 hours',Moderate,BSc,No,3.0,2.0,Yes,0
2,26,Male,31.0,Srinagar,Student,3.0,0.0,7.03,5.0,0.0,'Less than 5 hours',Healthy,BA,No,9.0,1.0,Yes,0
3,30,Female,28.0,Varanasi,Student,3.0,0.0,5.59,2.0,0.0,'7-8 hours',Moderate,BCA,Yes,4.0,5.0,Yes,1
4,32,Female,25.0,Jaipur,Student,4.0,0.0,8.13,3.0,0.0,'5-6 hours',Moderate,M.Tech,Yes,1.0,1.0,No,0


(27901, 18)


In [None]:
def runLogisticRegressionModel(database):
  """Train a PCA + logistic-regression classifier and return its test accuracy.

  Args:
    database: DataFrame containing the target column 'Depression', the
      identifier column 'id' and the categorical columns listed below.
      The frame is not modified.

  Returns:
    Accuracy (float) on a fixed 20% hold-out split (random_state=42).
  """
  categoricalColumns = ['City', 'Profession', 'Gender', 'Dietary Habits', 'Degree', 'Have you ever had suicidal thoughts ?', 'Family History of Mental Illness', 'Sleep Duration']
  # One-hot encode the categoricals; any value still missing becomes 0.
  encoded = pd.get_dummies(database, columns=categoricalColumns).fillna(0)

  y = encoded['Depression']
  # 'Unnamed: 0' is the row counter written out with the CSV. It carries no
  # information, and because PCA is not scale-invariant its huge variance
  # would dominate the first component, so it is removed together with 'id'.
  X = encoded.drop(columns=['id', 'Depression']).drop(columns=['Unnamed: 0'], errors='ignore')

  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

  # Fit the projection on the training rows only so the held-out rows do not
  # influence the learned components.
  pca = PCA(n_components=5)
  X_train = pca.fit_transform(X_train)
  X_test = pca.transform(X_test)

  model = LogisticRegression()
  model.fit(X_train, y_train)

  predictions = model.predict(X_test)
  return accuracy_score(y_test, predictions)

In [None]:
def missingValuesDrop(database, column):
  """Blank random cells of ``column``, delete those rows, and score the model.

  Works on a copy of ``database``. The accuracy is appended to
  ``averageAccuraciesForDeletion`` and printed.

  Args:
    database: source DataFrame (left untouched).
    column: name of the column in which values are removed.
  """
  databaseWithMissingValues = database.copy()

  # Sample existing index labels. Passing positions from range(n) to .loc
  # would append new all-NaN rows for labels that are absent from the index
  # and could never reach labels >= n.
  rowsToBlank = np.random.permutation(databaseWithMissingValues.index.to_numpy())[:numberOfValuesToRemove]
  databaseWithMissingValues.loc[rowsToBlank, column] = np.nan

  databaseWithMissingValues = databaseWithMissingValues.dropna(subset=[column])

  accuracy = runLogisticRegressionModel(databaseWithMissingValues)
  averageAccuraciesForDeletion.append(accuracy)

  print('Accuracy when column ' + column + ' is missing values and technique is deletion: ' + str(accuracy))

In [None]:
def missingValuesMean(database, column):
  """Blank random cells of ``column``, impute them with the mean, and score the model.

  Works on a copy of ``database``. The accuracy is appended to
  ``averageAccuraciesForMean`` and printed.

  Args:
    database: source DataFrame (left untouched).
    column: name of the numeric column in which values are removed.
  """
  databaseWithMissingValues = database.copy()

  # Sample existing index labels. Passing positions from range(n) to .loc
  # would append new all-NaN rows for labels that are absent from the index
  # and could never reach labels >= n.
  rowsToBlank = np.random.permutation(databaseWithMissingValues.index.to_numpy())[:numberOfValuesToRemove]
  databaseWithMissingValues.loc[rowsToBlank, column] = np.nan

  # The mean is computed from the remaining (observed) values only.
  observedMean = databaseWithMissingValues[column].mean()
  databaseWithMissingValues[column] = databaseWithMissingValues[column].fillna(observedMean)

  accuracy = runLogisticRegressionModel(databaseWithMissingValues)
  averageAccuraciesForMean.append(accuracy)

  print('Accuracy when column ' + column + ' is missing values and technique is mean: ' + str(accuracy))

In [None]:
def missingValuesMedian(database, column):
  """Blank random cells of ``column``, impute them with the median, and score the model.

  Works on a copy of ``database``. The accuracy is appended to
  ``averageAccuraciesForMedian`` and printed.

  Args:
    database: source DataFrame (left untouched).
    column: name of the numeric column in which values are removed.
  """
  databaseWithMissingValues = database.copy()

  # Sample existing index labels. Passing positions from range(n) to .loc
  # would append new all-NaN rows for labels that are absent from the index
  # and could never reach labels >= n.
  rowsToBlank = np.random.permutation(databaseWithMissingValues.index.to_numpy())[:numberOfValuesToRemove]
  databaseWithMissingValues.loc[rowsToBlank, column] = np.nan

  # The median is computed from the remaining (observed) values only.
  observedMedian = databaseWithMissingValues[column].median()
  databaseWithMissingValues[column] = databaseWithMissingValues[column].fillna(observedMedian)

  accuracy = runLogisticRegressionModel(databaseWithMissingValues)
  averageAccuraciesForMedian.append(accuracy)

  print('Accuracy when column ' + column + ' is missing values and technique is median: ' + str(accuracy))

In [None]:
def missingValuesGauss(database, column):
  """Blank random cells of ``column``, impute them with Gaussian draws, and score the model.

  Missing cells are replaced by independent samples from a normal
  distribution whose mean and standard deviation are those of the observed
  values. Works on a copy of ``database``. The accuracy is appended to
  ``averageAccuraciesForGauss`` and printed.

  Args:
    database: source DataFrame (left untouched).
    column: name of the numeric column in which values are removed.
  """
  databaseWithMissingValues = database.copy()

  # Sample existing index labels. Passing positions from range(n) to .loc
  # would append new all-NaN rows for labels that are absent from the index
  # and could never reach labels >= n.
  rowsToBlank = np.random.permutation(databaseWithMissingValues.index.to_numpy())[:numberOfValuesToRemove]
  databaseWithMissingValues.loc[rowsToBlank, column] = np.nan

  # Estimate the distribution once, from observed values only, so imputed
  # draws never feed back into the mean/std used for later draws. A boolean
  # mask also avoids mixing positional reads with label-based writes.
  values = databaseWithMissingValues[column]
  missingMask = values.isna()
  if missingMask.any():
    draws = np.random.normal(values.mean(), values.std(), size=int(missingMask.sum()))
    databaseWithMissingValues.loc[missingMask, column] = draws

  accuracy = runLogisticRegressionModel(databaseWithMissingValues)
  averageAccuraciesForGauss.append(accuracy)

  print('Accuracy when column ' + column + ' is missing values and technique is gaussian: ' + str(accuracy))

In [None]:
def missingValuesAllColumnsMean(database, columns):
  """Blank random cells in every listed column, mean-impute them, and print the accuracy.

  Each column gets its own independent random selection of rows.
  Works on a copy of ``database``.

  Args:
    database: source DataFrame (left untouched).
    columns: names of the numeric columns in which values are removed.
  """
  databaseWithMissingValues = database.copy()
  rowLabels = databaseWithMissingValues.index.to_numpy()

  for column in columns:
    # Sample existing index labels; positions from range(n) passed to .loc
    # would append new all-NaN rows for labels missing from the index.
    rowsToBlank = np.random.permutation(rowLabels)[:numberOfValuesToRemove]
    databaseWithMissingValues.loc[rowsToBlank, column] = np.nan

  for column in columns:
    observedMean = databaseWithMissingValues[column].mean()
    databaseWithMissingValues[column] = databaseWithMissingValues[column].fillna(observedMean)

  accuracy = runLogisticRegressionModel(databaseWithMissingValues)

  print('Accuracy when all continuous columns are missing values and technique is mean: ' + str(accuracy))

In [None]:
def missingValuesAllColumnsMedian(database, columns):
  """Blank random cells in every listed column, median-impute them, and print the accuracy.

  Each column gets its own independent random selection of rows.
  Works on a copy of ``database``.

  Args:
    database: source DataFrame (left untouched).
    columns: names of the numeric columns in which values are removed.
  """
  databaseWithMissingValues = database.copy()
  rowLabels = databaseWithMissingValues.index.to_numpy()

  for column in columns:
    # Sample existing index labels; positions from range(n) passed to .loc
    # would append new all-NaN rows for labels missing from the index.
    rowsToBlank = np.random.permutation(rowLabels)[:numberOfValuesToRemove]
    databaseWithMissingValues.loc[rowsToBlank, column] = np.nan

  for column in columns:
    observedMedian = databaseWithMissingValues[column].median()
    databaseWithMissingValues[column] = databaseWithMissingValues[column].fillna(observedMedian)

  accuracy = runLogisticRegressionModel(databaseWithMissingValues)

  print('Accuracy when all continuous columns are missing values and technique is median: ' + str(accuracy))

In [None]:
def missingValuesAllColumnsGauss(database, columns):
  """Blank random cells in every listed column, impute with Gaussian draws, and print the accuracy.

  For each column the missing cells are replaced by independent samples from
  a normal distribution parameterised by the mean and standard deviation of
  that column's observed values. Works on a copy of ``database``.

  Args:
    database: source DataFrame (left untouched).
    columns: names of the numeric columns in which values are removed.
  """
  databaseWithMissingValues = database.copy()
  rowLabels = databaseWithMissingValues.index.to_numpy()

  for column in columns:
    # Sample existing index labels; positions from range(n) passed to .loc
    # would append new all-NaN rows for labels missing from the index.
    rowsToBlank = np.random.permutation(rowLabels)[:numberOfValuesToRemove]
    databaseWithMissingValues.loc[rowsToBlank, column] = np.nan

  for column in columns:
    # Mean/std are taken once from the observed values, so imputed draws do
    # not shift the distribution used for the remaining cells.
    values = databaseWithMissingValues[column]
    missingMask = values.isna()
    if missingMask.any():
      draws = np.random.normal(values.mean(), values.std(), size=int(missingMask.sum()))
      databaseWithMissingValues.loc[missingMask, column] = draws

  accuracy = runLogisticRegressionModel(databaseWithMissingValues)

  print('Accuracy when all continuous columns are missing values and technique is gaussian: ' + str(accuracy))

In [None]:
def missingValuesAllColumnsOptimal(database, columns):
  """Blank random cells in ``columns`` and impute each continuous feature with its chosen statistic.

  The imputation plan is fixed: median for 'Age', 'CGPA', 'Work/Study Hours'
  and 'Financial Stress'; mean for 'Academic Pressure', 'Work Pressure',
  'Study Satisfaction' and 'Job Satisfaction'. Works on a copy of
  ``database`` and prints the resulting accuracy.

  Args:
    database: source DataFrame (left untouched).
    columns: names of the columns in which values are removed.
  """
  databaseWithMissingValues = database.copy()
  rowLabels = databaseWithMissingValues.index.to_numpy()

  for column in columns:
    # Sample existing index labels; positions from range(n) passed to .loc
    # would append new all-NaN rows for labels missing from the index.
    rowsToBlank = np.random.permutation(rowLabels)[:numberOfValuesToRemove]
    databaseWithMissingValues.loc[rowsToBlank, column] = np.nan

  statisticByColumn = {
    'Age': 'median',
    'Academic Pressure': 'mean',
    'Work Pressure': 'mean',
    'CGPA': 'median',
    'Study Satisfaction': 'mean',
    'Job Satisfaction': 'mean',
    'Work/Study Hours': 'median',
    'Financial Stress': 'median',
  }
  for name, statistic in statisticByColumn.items():
    values = databaseWithMissingValues[name]
    fillValue = values.median() if statistic == 'median' else values.mean()
    databaseWithMissingValues[name] = values.fillna(fillValue)

  accuracy = runLogisticRegressionModel(databaseWithMissingValues)

  print('Accuracy when all continuous columns are missing values and technique is optimal: ' + str(accuracy))

In [None]:
# Start from empty accumulators so that re-running this cell does not mix
# accuracies from an earlier execution into the averages printed below.
for accumulator in (averageAccuraciesForDeletion, averageAccuraciesForMean, averageAccuraciesForMedian, averageAccuraciesForGauss):
  accumulator.clear()

print('Accuracy with no missing values: ' + str(runLogisticRegressionModel(dt)) + '\n')

# Run the four single-column techniques on every continuous feature in turn.
continuousColumns = ['Age', 'Academic Pressure', 'Work Pressure', 'CGPA', 'Study Satisfaction', 'Job Satisfaction', 'Work/Study Hours', 'Financial Stress']
for column in continuousColumns:
  missingValuesDrop(dt, column)
  missingValuesMean(dt, column)
  missingValuesMedian(dt, column)
  missingValuesGauss(dt, column)
  print('')

print('Average accuracy for deletion: ' + str(np.mean(averageAccuraciesForDeletion)))
print('Average accuracy for mean: ' + str(np.mean(averageAccuraciesForMean)))
print('Average accuracy for median: ' + str(np.mean(averageAccuraciesForMedian)))
print('Average accuracy for gauss: ' + str(np.mean(averageAccuraciesForGauss)))

Accuracy with no missing values: 0.7869175627240144

Accuracy when column Age is missing values and technique is deletion: 0.7928358208955224
Accuracy when column Age is missing values and technique is mean: 0.7763841605447053
Accuracy when column Age is missing values and technique is median: 0.7793906810035842
Accuracy when column Age is missing values and technique is gaussian: 0.773517290808099

Accuracy when column Academic Pressure is missing values and technique is deletion: 0.8035820895522388
Accuracy when column Academic Pressure is missing values and technique is mean: 0.7243727598566309
Accuracy when column Academic Pressure is missing values and technique is median: 0.7172043010752688
Accuracy when column Academic Pressure is missing values and technique is gaussian: 0.7188675864540405

Accuracy when column Work Pressure is missing values and technique is deletion: 0.7910447761194029
Accuracy when column Work Pressure is missing values and technique is mean: 0.7867765633399

In [None]:
# Blank every continuous column at once and compare the three imputers.
for imputeAllColumns in (missingValuesAllColumnsMean, missingValuesAllColumnsMedian, missingValuesAllColumnsGauss):
  imputeAllColumns(
    dt,
    [
      'Age', 'Academic Pressure', 'Work Pressure', 'CGPA',
      'Study Satisfaction', 'Job Satisfaction', 'Work/Study Hours', 'Financial Stress',
    ],
  )

Accuracy when all continuous columns are missing values and technique is mean: 0.7582870453323777
Accuracy when all continuous columns are missing values and technique is median: 0.7598996595592188
Accuracy when all continuous columns are missing values and technique is gaussian: 0.5875291166457625


In [None]:
# Blank every continuous column and impute each with its per-column statistic.
missingValuesAllColumnsOptimal(
  dt,
  [
    'Age', 'Academic Pressure', 'Work Pressure', 'CGPA',
    'Study Satisfaction', 'Job Satisfaction', 'Work/Study Hours', 'Financial Stress',
  ],
)

Accuracy when all continuous columns are missing values and technique is optimal: 0.7618706325031357


In [None]:
def missingValuesBFill(database, column):
  """Blank random cells of ``column``, backward-fill them, and score the model.

  Each missing cell takes the next observed value further down the column.
  Works on a copy of ``database``. The accuracy is appended to
  ``averageAccuraciesForBFill`` and printed.

  Args:
    database: source DataFrame (left untouched).
    column: name of the column in which values are removed.
  """
  databaseWithMissingValues = database.copy()

  # Sample existing index labels. Passing positions from range(n) to .loc
  # would append new all-NaN rows for labels that are absent from the index
  # and could never reach labels >= n.
  rowsToBlank = np.random.permutation(databaseWithMissingValues.index.to_numpy())[:numberOfValuesToRemove]
  databaseWithMissingValues.loc[rowsToBlank, column] = np.nan

  # fillna({column: 'bfill'}) would insert the literal string 'bfill' as a
  # value; Series.bfill() performs the actual backward propagation. The
  # trailing ffill() covers gaps at the end that have no later value.
  databaseWithMissingValues[column] = databaseWithMissingValues[column].bfill().ffill()

  accuracy = runLogisticRegressionModel(databaseWithMissingValues)
  averageAccuraciesForBFill.append(accuracy)

  print('Accuracy when column ' + column + ' is missing values and technique is backfill: ' + str(accuracy))

In [None]:
def missingValuesFFill(database, column):
  """Blank random cells of ``column``, forward-fill them, and score the model.

  Each missing cell takes the last observed value above it in the column.
  Works on a copy of ``database``. The accuracy is appended to
  ``averageAccuraciesForFFill`` and printed.

  Args:
    database: source DataFrame (left untouched).
    column: name of the column in which values are removed.
  """
  databaseWithMissingValues = database.copy()

  # Sample existing index labels. Passing positions from range(n) to .loc
  # would append new all-NaN rows for labels that are absent from the index
  # and could never reach labels >= n.
  rowsToBlank = np.random.permutation(databaseWithMissingValues.index.to_numpy())[:numberOfValuesToRemove]
  databaseWithMissingValues.loc[rowsToBlank, column] = np.nan

  # fillna({column: 'ffill'}) would insert the literal string 'ffill' as a
  # value; Series.ffill() performs the actual forward propagation. The
  # trailing bfill() covers gaps at the start that have no earlier value.
  databaseWithMissingValues[column] = databaseWithMissingValues[column].ffill().bfill()

  accuracy = runLogisticRegressionModel(databaseWithMissingValues)
  averageAccuraciesForFFill.append(accuracy)

  print('Accuracy when column ' + column + ' is missing values and technique is frontfill: ' + str(accuracy))

In [None]:
def missingValuesMostCommon(database, column):
  """Blank random cells of ``column``, fill them with the most frequent value, and score the model.

  Works on a copy of ``database``. The accuracy is appended to
  ``averageAccuraciesForMostCommon`` and printed.

  Args:
    database: source DataFrame (left untouched).
    column: name of the column in which values are removed.
  """
  databaseWithMissingValues = database.copy()

  # Sample existing index labels. Passing positions from range(n) to .loc
  # would append new all-NaN rows for labels that are absent from the index
  # and could never reach labels >= n.
  rowsToBlank = np.random.permutation(databaseWithMissingValues.index.to_numpy())[:numberOfValuesToRemove]
  databaseWithMissingValues.loc[rowsToBlank, column] = np.nan

  # value_counts() ignores NaN and sorts by frequency, so the first index
  # entry is the most common observed value.
  mostCommonValue = databaseWithMissingValues[column].value_counts().index[0]
  databaseWithMissingValues[column] = databaseWithMissingValues[column].fillna(mostCommonValue)

  accuracy = runLogisticRegressionModel(databaseWithMissingValues)
  averageAccuraciesForMostCommon.append(accuracy)

  print('Accuracy when column ' + column + ' is missing values and technique is most common: ' + str(accuracy))

In [None]:
def missingValuesForAllColumnsBFill(database, columns):
  """Blank random cells in every listed column, backward-fill them, and print the accuracy.

  Each column gets its own independent random selection of rows.
  Works on a copy of ``database``.

  Args:
    database: source DataFrame (left untouched).
    columns: names of the columns in which values are removed.
  """
  databaseWithMissingValues = database.copy()
  rowLabels = databaseWithMissingValues.index.to_numpy()

  for column in columns:
    # Sample existing index labels; positions from range(n) passed to .loc
    # would append new all-NaN rows for labels missing from the index.
    rowsToBlank = np.random.permutation(rowLabels)[:numberOfValuesToRemove]
    databaseWithMissingValues.loc[rowsToBlank, column] = np.nan

  for column in columns:
    # Real backward fill (a dict passed to fillna would insert the literal
    # string 'bfill'); ffill() afterwards closes gaps at the very end.
    databaseWithMissingValues[column] = databaseWithMissingValues[column].bfill().ffill()

  accuracy = runLogisticRegressionModel(databaseWithMissingValues)

  print('Accuracy when all categorical columns are missing values and technique is backfill: ' + str(accuracy))

In [None]:
def missingValuesForAllColumnsFFill(database, columns):
  """Blank random cells in every listed column, forward-fill them, and print the accuracy.

  Each column gets its own independent random selection of rows.
  Works on a copy of ``database``.

  Args:
    database: source DataFrame (left untouched).
    columns: names of the columns in which values are removed.
  """
  databaseWithMissingValues = database.copy()
  rowLabels = databaseWithMissingValues.index.to_numpy()

  for column in columns:
    # Sample existing index labels; positions from range(n) passed to .loc
    # would append new all-NaN rows for labels missing from the index.
    rowsToBlank = np.random.permutation(rowLabels)[:numberOfValuesToRemove]
    databaseWithMissingValues.loc[rowsToBlank, column] = np.nan

  for column in columns:
    # Real forward fill (a dict passed to fillna would insert the literal
    # string 'ffill'); bfill() afterwards closes gaps at the very start.
    databaseWithMissingValues[column] = databaseWithMissingValues[column].ffill().bfill()

  accuracy = runLogisticRegressionModel(databaseWithMissingValues)

  print('Accuracy when all categorical columns are missing values and technique is frontfill: ' + str(accuracy))

In [None]:
def missingValuesForAllColumnsMostCommon(database, columns):
  """Blank random cells in every listed column, fill with each column's mode, and print the accuracy.

  Each column gets its own independent random selection of rows.
  Works on a copy of ``database``.

  Args:
    database: source DataFrame (left untouched).
    columns: names of the columns in which values are removed.
  """
  databaseWithMissingValues = database.copy()
  rowLabels = databaseWithMissingValues.index.to_numpy()

  for column in columns:
    # Sample existing index labels; positions from range(n) passed to .loc
    # would append new all-NaN rows for labels missing from the index.
    rowsToBlank = np.random.permutation(rowLabels)[:numberOfValuesToRemove]
    databaseWithMissingValues.loc[rowsToBlank, column] = np.nan

  for column in columns:
    # First entry of value_counts() is the most frequent observed value.
    mostCommonValue = databaseWithMissingValues[column].value_counts().index[0]
    databaseWithMissingValues[column] = databaseWithMissingValues[column].fillna(mostCommonValue)

  accuracy = runLogisticRegressionModel(databaseWithMissingValues)

  print('Accuracy when all categorical columns are missing values and technique is most common: ' + str(accuracy))

In [None]:
def missingValuesForAllCategoricalFeaturesOptimal(database, columns):
  """Blank random cells in ``columns`` and impute each categorical feature with its chosen technique.

  The imputation plan is fixed: forward fill for 'City', 'Gender', 'Degree'
  and 'Sleep Duration'; backward fill for 'Profession',
  'Have you ever had suicidal thoughts ?' and
  'Family History of Mental Illness'; most frequent value for
  'Dietary Habits'. Works on a copy of ``database`` and prints the accuracy.

  Args:
    database: source DataFrame (left untouched).
    columns: names of the columns in which values are removed.
  """
  databaseWithMissingValues = database.copy()
  rowLabels = databaseWithMissingValues.index.to_numpy()

  for column in columns:
    # Sample existing index labels; positions from range(n) passed to .loc
    # would append new all-NaN rows for labels missing from the index.
    rowsToBlank = np.random.permutation(rowLabels)[:numberOfValuesToRemove]
    databaseWithMissingValues.loc[rowsToBlank, column] = np.nan

  # Use Series.ffill()/bfill() for real propagation: fillna({col: 'ffill'})
  # would write the literal string into the column. The opposite-direction
  # fill afterwards closes gaps at the edge that have no neighbour to copy.
  for name in ('City', 'Gender', 'Degree', 'Sleep Duration'):
    databaseWithMissingValues[name] = databaseWithMissingValues[name].ffill().bfill()

  for name in ('Profession', 'Have you ever had suicidal thoughts ?', 'Family History of Mental Illness'):
    databaseWithMissingValues[name] = databaseWithMissingValues[name].bfill().ffill()

  mostCommonDiet = databaseWithMissingValues['Dietary Habits'].value_counts().index[0]
  databaseWithMissingValues['Dietary Habits'] = databaseWithMissingValues['Dietary Habits'].fillna(mostCommonDiet)

  accuracy = runLogisticRegressionModel(databaseWithMissingValues)

  print('Accuracy when all categorical columns are missing values and technique is optimal: ' + str(accuracy))

In [None]:
# Start from empty accumulators so that re-running this cell does not mix
# accuracies from an earlier execution into the averages printed below.
for accumulator in (averageAccuraciesForBFill, averageAccuraciesForFFill, averageAccuraciesForMostCommon):
  accumulator.clear()

print('Accuracy with no missing values: ' + str(runLogisticRegressionModel(dt)) + '\n')

# Run the three single-column techniques on every categorical feature in turn.
categoricalColumns = ['City', 'Profession', 'Gender', 'Dietary Habits', 'Degree', 'Have you ever had suicidal thoughts ?', 'Family History of Mental Illness', 'Sleep Duration']
for column in categoricalColumns:
  missingValuesBFill(dt, column)
  missingValuesFFill(dt, column)
  missingValuesMostCommon(dt, column)
  print('')

print('Average accuracy for backfill: ' + str(np.mean(averageAccuraciesForBFill)))
print('Average accuracy for frontfill: ' + str(np.mean(averageAccuraciesForFFill)))
print('Average accuracy for most common: ' + str(np.mean(averageAccuraciesForMostCommon)))

Accuracy with no missing values: 0.7869175627240144

Accuracy when column City is missing values and technique is backfill: 0.7842293906810036
Accuracy when column City is missing values and technique is frontfill: 0.7896057347670251
Accuracy when column City is missing values and technique is most common: 0.7896057347670251

Accuracy when column Profession is missing values and technique is backfill: 0.7897849462365591
Accuracy when column Profession is missing values and technique is frontfill: 0.7845878136200717
Accuracy when column Profession is missing values and technique is most common: 0.7894265232974911

Accuracy when column Gender is missing values and technique is backfill: 0.7864182046228274
Accuracy when column Gender is missing values and technique is frontfill: 0.7872759856630824
Accuracy when column Gender is missing values and technique is most common: 0.7864182046228274

Accuracy when column Dietary Habits is missing values and technique is backfill: 0.783512544802867

In [None]:
# Blank every categorical column at once and compare the three imputers.
for imputeAllColumns in (missingValuesForAllColumnsBFill, missingValuesForAllColumnsFFill, missingValuesForAllColumnsMostCommon):
  imputeAllColumns(
    dt,
    [
      'City', 'Profession', 'Gender', 'Dietary Habits', 'Degree',
      'Have you ever had suicidal thoughts ?', 'Family History of Mental Illness', 'Sleep Duration',
    ],
  )

Accuracy when all categorical columns are missing values and technique is backfill: 0.7740548288837126
Accuracy when all categorical columns are missing values and technique is frontfill: 0.7726213940154094
Accuracy when all categorical columns are missing values and technique is most common: 0.773338111449561


In [None]:
# Blank every categorical column and impute each with its per-column technique.
missingValuesForAllCategoricalFeaturesOptimal(
  dt,
  [
    'City', 'Profession', 'Gender', 'Dietary Habits', 'Degree',
    'Have you ever had suicidal thoughts ?', 'Family History of Mental Illness', 'Sleep Duration',
  ],
)

Accuracy when all categorical columns are missing values and technique is optimal: 0.7742340082422505


In [None]:
def missingValuesForAllFeatures(database, columns):
  """Blank random cells in ``columns`` and impute every feature with its per-column technique.

  Fixed imputation plan:
    * median: 'Age', 'CGPA', 'Work/Study Hours', 'Financial Stress'
    * mean: 'Academic Pressure', 'Work Pressure', 'Study Satisfaction',
      'Job Satisfaction'
    * forward fill: 'City', 'Gender', 'Degree', 'Sleep Duration'
    * backward fill: 'Profession', 'Have you ever had suicidal thoughts ?',
      'Family History of Mental Illness'
    * most frequent value: 'Dietary Habits'

  Works on a copy of ``database`` and prints the resulting accuracy.

  Args:
    database: source DataFrame (left untouched).
    columns: names of the columns in which values are removed.
  """
  databaseWithMissingValues = database.copy()
  rowLabels = databaseWithMissingValues.index.to_numpy()

  for column in columns:
    # Sample existing index labels; positions from range(n) passed to .loc
    # would append new all-NaN rows for labels missing from the index.
    rowsToBlank = np.random.permutation(rowLabels)[:numberOfValuesToRemove]
    databaseWithMissingValues.loc[rowsToBlank, column] = np.nan

  for name in ('Age', 'CGPA', 'Work/Study Hours', 'Financial Stress'):
    databaseWithMissingValues[name] = databaseWithMissingValues[name].fillna(databaseWithMissingValues[name].median())

  for name in ('Academic Pressure', 'Work Pressure', 'Study Satisfaction', 'Job Satisfaction'):
    databaseWithMissingValues[name] = databaseWithMissingValues[name].fillna(databaseWithMissingValues[name].mean())

  # Use Series.ffill()/bfill() for real propagation: fillna({col: 'ffill'})
  # would write the literal string into the column. The opposite-direction
  # fill afterwards closes gaps at the edge that have no neighbour to copy.
  for name in ('City', 'Gender', 'Degree', 'Sleep Duration'):
    databaseWithMissingValues[name] = databaseWithMissingValues[name].ffill().bfill()

  for name in ('Profession', 'Have you ever had suicidal thoughts ?', 'Family History of Mental Illness'):
    databaseWithMissingValues[name] = databaseWithMissingValues[name].bfill().ffill()

  mostCommonDiet = databaseWithMissingValues['Dietary Habits'].value_counts().index[0]
  databaseWithMissingValues['Dietary Habits'] = databaseWithMissingValues['Dietary Habits'].fillna(mostCommonDiet)

  accuracy = runLogisticRegressionModel(databaseWithMissingValues)

  print('Accuracy when all columns are missing values and technique is based on which methods worked the best: ' + str(accuracy))

In [None]:
# Blank every feature (continuous first, then categorical) and apply the
# per-column imputation plan.
missingValuesForAllFeatures(
  dt,
  [
    'Age', 'Academic Pressure', 'Work Pressure', 'CGPA',
    'Study Satisfaction', 'Job Satisfaction', 'Work/Study Hours', 'Financial Stress',
    'City', 'Profession', 'Gender', 'Dietary Habits', 'Degree',
    'Have you ever had suicidal thoughts ?', 'Family History of Mental Illness', 'Sleep Duration',
  ],
)

Accuracy when all columns are missing values and technique is based on which methods worked the best: 0.6660096756853611


In [None]:
def missingValuesForAllFeaturesBasicVersion(database, continuousColumns, categoricalColumns):
  """Blank random cells in all listed columns and impute with one basic technique per type.

  Continuous columns are mean-imputed; categorical columns are backward
  filled. Works on a copy of ``database`` and prints the resulting accuracy.

  Args:
    database: source DataFrame (left untouched).
    continuousColumns: names of the numeric columns to blank and mean-impute.
    categoricalColumns: names of the columns to blank and backward-fill.
  """
  databaseWithMissingValues = database.copy()
  rowLabels = databaseWithMissingValues.index.to_numpy()

  # Continuous columns are blanked first, then categorical ones, each with
  # its own random selection of rows.
  for column in list(continuousColumns) + list(categoricalColumns):
    # Sample existing index labels; positions from range(n) passed to .loc
    # would append new all-NaN rows for labels missing from the index.
    rowsToBlank = np.random.permutation(rowLabels)[:numberOfValuesToRemove]
    databaseWithMissingValues.loc[rowsToBlank, column] = np.nan

  for column in continuousColumns:
    observedMean = databaseWithMissingValues[column].mean()
    databaseWithMissingValues[column] = databaseWithMissingValues[column].fillna(observedMean)

  for column in categoricalColumns:
    # Real backward fill (a dict passed to fillna would insert the literal
    # string 'bfill'); ffill() afterwards closes gaps at the very end.
    databaseWithMissingValues[column] = databaseWithMissingValues[column].bfill().ffill()

  accuracy = runLogisticRegressionModel(databaseWithMissingValues)

  print('Accuracy when all columns are missing values and technique is based on basic methods: ' + str(accuracy))

In [None]:
# Blank every feature and apply the basic plan: mean for continuous columns,
# backfill for categorical columns.
missingValuesForAllFeaturesBasicVersion(
  dt,
  [
    'Age', 'Academic Pressure', 'Work Pressure', 'CGPA',
    'Study Satisfaction', 'Job Satisfaction', 'Work/Study Hours', 'Financial Stress',
  ],
  [
    'City', 'Profession', 'Gender', 'Dietary Habits', 'Degree',
    'Have you ever had suicidal thoughts ?', 'Family History of Mental Illness', 'Sleep Duration',
  ],
)

Accuracy when all columns are missing values and technique is based on basic methods: 0.6617093710804516
