NOTE:

First Test:
The first 4 columns are variables. I simply chose one to predict and I threw the others in the features => This led to perfect models and maybe even overfitting.

Second Test:
When I removed ["Normal", "Depression", "Bipolar Type 1", "Bipolar Type 2"] from the features and I was predicting "Normal", the results changed drastically

In [10]:
### Data Preprocessing

import pandas as pd
import numpy as np

dataset = pd.read_csv("Dataset-Mental-Disorders.csv")
dataset = dataset.drop(dataset.columns[0], axis = 1)

# change columns 1, 2, 3, 4 => Seldom -> [0], Sometimes -> [1], Usually -> [2], Most-Often -> [3]
for i in range(len(dataset)):
  sadness = dataset["Sadness"]
  euphoric = dataset["Euphoric"]
  exhausted = dataset["Exhausted"]
  sleep_disorder = dataset["Sleep dissorder"]

  if sadness[i] == "Seldom": sadness[i] = 0
  elif sadness[i] == "Sometimes": sadness[i] = 1
  elif sadness[i] == "Usually": sadness[i] = 2
  elif sadness[i] == "Most-Often": sadness[i] = 3

  if euphoric[i] == "Seldom": euphoric[i] = 0
  elif euphoric[i] == "Sometimes": euphoric[i] = 1
  elif euphoric[i] == "Usually": euphoric[i] = 2
  elif euphoric[i] == "Most-Often": euphoric[i] = 3

  if exhausted[i] == "Seldom": exhausted[i] = 0
  elif exhausted[i] == "Sometimes": exhausted[i] = 1
  elif exhausted[i] == "Usually": exhausted[i] = 2
  elif exhausted[i] == "Most-Often": exhausted[i] = 3

  if sleep_disorder[i] == "Seldom": sleep_disorder[i] = 0
  elif sleep_disorder[i] == "Sometimes": sleep_disorder[i] = 1
  elif sleep_disorder[i] == "Usually": sleep_disorder[i] = 2
  elif sleep_disorder[i] == "Most-Often": sleep_disorder[i] = 3


# Change columns 5 -> 14 => NO[0], YES[1]
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
for i in range(4, 14):
  column = dataset.columns[i]
  dataset[column] = label_encoder.fit_transform(dataset[column])
  

# Change columns 15, 16, 17 => from string to decimal number
for i in range(14, 17):
  title = dataset.columns[i]
  column = dataset[title]
  column_values = column.values.reshape(len(column), 1)
  for j in range(len(column_values)):
    answer = int(column_values[j][0][0])
    dataset.loc[j, title] = answer/10


# Change last column into 4 new columns
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder


column_transformer = ColumnTransformer( transformers = [('encode', OneHotEncoder(), [-1])], remainder = 'passthrough')

column_names = [
       'Bipolar Type 1', 'Bipolar Type 2' ,'Depression', 'Normal',  'Sadness', 'Euphoric', 'Exhausted', 'Sleep dissorder', 'Mood Swing',
       'Suicidal thoughts', 'Anorxia', 'Authority Respect', 'Try-Explanation', 'Aggressive Response', 'Ignore & Move-On', 'Nervous Break-down',
       'Admit Mistakes', 'Overthinking', 'Sexual Activity', 'Concentration', 'Optimisim'
       ]

values = column_transformer.fit_transform(dataset.values)
dataset = pd.DataFrame( values , columns = column_names)

dataset.head()

#NOTE: The dependent variables/class labels are now the first 4 columns

Unnamed: 0,Bipolar Type 1,Bipolar Type 2,Depression,Normal,Sadness,Euphoric,Exhausted,Sleep dissorder,Mood Swing,Suicidal thoughts,...,Authority Respect,Try-Explanation,Aggressive Response,Ignore & Move-On,Nervous Break-down,Admit Mistakes,Overthinking,Sexual Activity,Concentration,Optimisim
0,0.0,1.0,0.0,0.0,2,0,1,1,1,2,...,0,1,0,0,1,1,1,0.3,0.3,0.4
1,0.0,0.0,1.0,0.0,2,0,2,1,0,1,...,0,0,0,0,0,0,0,0.4,0.2,0.5
2,1.0,0.0,0.0,0.0,1,3,1,1,1,0,...,0,1,1,0,1,1,0,0.6,0.5,0.7
3,0.0,1.0,0.0,0.0,2,0,2,3,1,1,...,0,1,0,0,0,0,0,0.3,0.2,0.2
4,0.0,0.0,0.0,1.0,2,2,1,1,0,0,...,0,0,0,0,1,1,1,0.5,0.5,0.6


In [11]:
### Split the dataset

features = dataset.drop(columns = ["Normal", "Depression", "Bipolar Type 1", "Bipolar Type 2"], axis = 1).values
label = dataset["Normal"].values # We are predicting if the person is normal or has a disorder

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features, label, test_size = 0.2)

In [12]:
X_train
X_test
y_train
y_test

array([1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0,
       0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0],
      dtype=object)

In [13]:
### Multiple Regression Model
from sklearn.linear_model import LinearRegression

multiple_regressor = LinearRegression()
multiple_regressor.fit(X_train, y_train)
multiple_regressor_prediction = multiple_regressor.predict(X_test)

# print(np.concatenate((multiple_regressor_prediction.reshape(len(multiple_regressor_prediction), 1), y_test.reshape(len(y_test), 1)), axis=1))

In [14]:
### Polynomial Regression Model
from sklearn.preprocessing import PolynomialFeatures

polynomial_features_maker = PolynomialFeatures(degree = 2)
polynomial_features = polynomial_features_maker.fit_transform(X_train)

polynomial_regressor = LinearRegression()
polynomial_regressor.fit(polynomial_features, y_train)
polynomial_regressor_prediction = polynomial_regressor.predict(polynomial_features_maker.fit_transform(X_test))

# print(np.concatenate( (polynomial_regressor_prediction.reshape(len(polynomial_regressor_prediction), 1), y_test.reshape(len(y_test), 1)), axis = 1 ))

In [15]:
from sklearn.tree import DecisionTreeRegressor
tree_regressor = DecisionTreeRegressor()
tree_regressor.fit(X_train, y_train)
tree_regressor_prediction = tree_regressor.predict(X_test)

# print(np.concatenate( ( tree_regressor_prediction.reshape(len(tree_regressor_prediction), 1), y_test.reshape(len(y_test), 1) ), axis = 1 ))

In [16]:
### Random Forest Model
from sklearn.ensemble import RandomForestRegressor
forest_regressor = RandomForestRegressor()
forest_regressor.fit(X_train, y_train)
forest_regressor_prediction = forest_regressor.predict(X_test)

# print(np.concatenate( ( forest_regressor_prediction.reshape(len(forest_regressor_prediction), 1), y_test.reshape(len(y_test), 1) ), axis = 1 ))

In [17]:
### Support Vector Model

# apply feature scaling (Standardization formula)
from sklearn.preprocessing import StandardScaler
feature_scaler = StandardScaler()
label_scaler = StandardScaler()

scaled_features = feature_scaler.fit_transform(X_train)
scaled_label = label_scaler.fit_transform(y_train.reshape(-1,1))

from sklearn.svm import SVR
svr = SVR(kernel = 'rbf')
svr.fit(scaled_features, scaled_label)

svr_prediction = svr.predict(feature_scaler.transform(X_test))
svr_prediction = label_scaler.inverse_transform(svr_prediction.reshape(len(svr_prediction), 1))
# print(np.concatenate( ( svr_prediction.reshape(len(svr_prediction), 1), y_test.reshape(len(y_test), 1) ), axis = 1 ))

  y = column_or_1d(y, warn=True)


In [18]:
from sklearn.metrics import r2_score
print(f"Multiple Linear Regression: {r2_score(y_test, multiple_regressor_prediction)}")
print(f"Polynomial Linear Regression: {r2_score(y_test, polynomial_regressor_prediction)}")
print(f"Decision Tree Regression: {r2_score(y_test, tree_regressor_prediction)}")
print(f"Random Forest Regression: {r2_score(y_test, forest_regressor_prediction)}")
print(f"Support Vector Regression: {r2_score(y_test, svr_prediction)}")

Multiple Linear Regression: 0.596352916306616
Polynomial Linear Regression: 0.06294611475876866
Decision Tree Regression: 0.19327731092436984
Random Forest Regression: 0.5107428571428572
Support Vector Regression: 0.5924448483333319
