In [1]:
import pandas as pd

In [2]:
#import dataset
df = pd.read_csv('ocd_patient_dataset 1.csv')
df.head()

Unnamed: 0,Patient ID,Age,Gender,Ethnicity,Marital Status,Education Level,OCD Diagnosis Date,Days after diagnosis,Duration of Symptoms (months),Previous Diagnoses,Family History of OCD,Obsession Type,Compulsion Type,Y-BOCS Score (Obsessions),Y-BOCS Score (Compulsions),Total score,class,Depression Diagnosis,Anxiety Diagnosis,Medications
0,1018,32,Female,African,Single,Some College,7/15/2016,2993,203,MDD,No,Harm-related,Checking,17,10,27,high,Yes,Yes,SNRI
1,1188,57,Male,Hispanic,Divorced,College Degree,2/2/2018,2426,173,MDD,No,Contamination,Checking,3,4,7,low,No,No,Benzodiazepine
2,9861,38,Female,Hispanic,Single,College Degree,3/13/2017,2752,110,MDD,No,Contamination,Praying,12,16,28,high,Yes,No,SNRI
3,7905,73,Female,Hispanic,Divorced,High School,1/13/2017,2811,233,GAD,No,Religious,Counting,4,16,20,moderate,Yes,Yes,Benzodiazepine
4,2637,66,Female,Asian,Divorced,College Degree,8/14/2018,2233,73,Panic Disorder,No,Harm-related,Washing,0,12,12,low,No,Yes,SNRI


In [3]:
#check number of rows and columns
df.shape

(419, 20)

# Data Processing

## Check null values and drop them

In [4]:
df.isnull().sum()

Patient ID                       0
Age                              0
Gender                           0
Ethnicity                        0
Marital Status                   0
Education Level                  0
OCD Diagnosis Date               0
Days after diagnosis             0
Duration of Symptoms (months)    0
Previous Diagnoses               0
Family History of OCD            0
Obsession Type                   0
Compulsion Type                  0
Y-BOCS Score (Obsessions)        0
Y-BOCS Score (Compulsions)       0
Total score                      0
class                            0
Depression Diagnosis             0
Anxiety Diagnosis                0
Medications                      0
dtype: int64

In [5]:
#There is no null values. To be sure, drop null values and check again
df.dropna(inplace=True)

In [6]:
df.isnull().sum()

Patient ID                       0
Age                              0
Gender                           0
Ethnicity                        0
Marital Status                   0
Education Level                  0
OCD Diagnosis Date               0
Days after diagnosis             0
Duration of Symptoms (months)    0
Previous Diagnoses               0
Family History of OCD            0
Obsession Type                   0
Compulsion Type                  0
Y-BOCS Score (Obsessions)        0
Y-BOCS Score (Compulsions)       0
Total score                      0
class                            0
Depression Diagnosis             0
Anxiety Diagnosis                0
Medications                      0
dtype: int64

In [7]:
#check shape again to know the number of rows and columns after removing null values
df.shape

(419, 20)

Total score is the sum of Y-BOCS scores (Obsessions) and Y-BOCS scores(Compulsions) and class is defined as:
total score 0-15 = low, 
total score 16-23 = middle,
total score 24-40 = high

In [8]:
#check the data types
df.dtypes

Patient ID                        int64
Age                               int64
Gender                           object
Ethnicity                        object
Marital Status                   object
Education Level                  object
OCD Diagnosis Date               object
Days after diagnosis              int64
Duration of Symptoms (months)     int64
Previous Diagnoses               object
Family History of OCD            object
Obsession Type                   object
Compulsion Type                  object
Y-BOCS Score (Obsessions)         int64
Y-BOCS Score (Compulsions)        int64
Total score                       int64
class                            object
Depression Diagnosis             object
Anxiety Diagnosis                object
Medications                      object
dtype: object

## Change data types

In [9]:
df["Ethnicity"] = df["Ethnicity"].astype("category")
df["Marital Status"] = df["Marital Status"].astype("category")
df["Education Level"] = df["Education Level"].astype("category")
df["Previous Diagnoses"] = df["Previous Diagnoses"].astype("category")
df["Obsession Type"] = df["Obsession Type"].astype("category")
df["Compulsion Type"] = df["Compulsion Type"].astype("category")
df["Medications"] = df["Medications"].astype("category")

In [10]:
df['Family History of OCD'] = df['Family History of OCD'].map({'Yes': True, 'No': False})
df['Depression Diagnosis'] = df['Depression Diagnosis'].map({'Yes': True, 'No': False})
df['Anxiety Diagnosis'] = df['Anxiety Diagnosis'].map({'Yes': True, 'No': False})

In [11]:
df["Gender"] = df["Gender"].astype("category")

In [12]:
#check the data types again
df.dtypes

Patient ID                          int64
Age                                 int64
Gender                           category
Ethnicity                        category
Marital Status                   category
Education Level                  category
OCD Diagnosis Date                 object
Days after diagnosis                int64
Duration of Symptoms (months)       int64
Previous Diagnoses               category
Family History of OCD                bool
Obsession Type                   category
Compulsion Type                  category
Y-BOCS Score (Obsessions)           int64
Y-BOCS Score (Compulsions)          int64
Total score                         int64
class                              object
Depression Diagnosis                 bool
Anxiety Diagnosis                    bool
Medications                      category
dtype: object

In [13]:
df.shape

(419, 20)

## Set features for training 

Relevant Features are selected for model training. According to our finding from relevant research articles, age, delay or early diagnosis and treatment, presence of depression or anxiety, family history of OCD have impact on disease severity. Thus, these relevant features:
age, days after diagnosis, anxiety diagnosis, depression diagnosis and family history of OCD are used for the training of prediction model.

In [14]:
X = df[['Age','Depression Diagnosis','Days after diagnosis','Anxiety Diagnosis','Family History of OCD']]

In [15]:
type(X)

pandas.core.frame.DataFrame

In [16]:
X.head()

Unnamed: 0,Age,Depression Diagnosis,Days after diagnosis,Anxiety Diagnosis,Family History of OCD
0,32,True,2993,True,False
1,57,False,2426,False,False
2,38,True,2752,False,False
3,73,True,2811,True,False
4,66,False,2233,True,False


## Encoding

## encode binary features

In [17]:
X["Anxiety Diagnosis"] = X["Anxiety Diagnosis"].map({
                False:0,
                True:1
            })

X["Depression Diagnosis"] = X["Depression Diagnosis"].map({
                False:0,
                True:1
            })

X["Family History of OCD"] = X["Family History of OCD"].map({
                False:0,
                True:1
            })
X

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["Anxiety Diagnosis"] = X["Anxiety Diagnosis"].map({
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["Depression Diagnosis"] = X["Depression Diagnosis"].map({
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["Family History of OCD"] = X["Family History of OCD"].map({


Unnamed: 0,Age,Depression Diagnosis,Days after diagnosis,Anxiety Diagnosis,Family History of OCD
0,32,1,2993,1,0
1,57,0,2426,0,0
2,38,1,2752,0,0
3,73,1,2811,1,0
4,66,0,2233,1,0
...,...,...,...,...,...
414,24,0,2451,1,1
415,62,0,1383,1,0
416,58,0,3001,1,1
417,40,1,2387,1,1


In [18]:
#check data types
X.dtypes

Age                      int64
Depression Diagnosis     int64
Days after diagnosis     int64
Anxiety Diagnosis        int64
Family History of OCD    int64
dtype: object

# Feature scaling (Min Max normalization)

2 Numerical features: ages and days after diagnosis are normalized with min max normalization to set their values between 0 and 1.

In [19]:
from sklearn.preprocessing import MinMaxScaler

min_max_scaler = MinMaxScaler()
numerical_features = ['Age', 'Days after diagnosis']
X[numerical_features] = min_max_scaler.fit_transform(X[numerical_features])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[numerical_features] = min_max_scaler.fit_transform(X[numerical_features])


In [20]:
#check the normalized data after feature scaling
X

Unnamed: 0,Age,Depression Diagnosis,Days after diagnosis,Anxiety Diagnosis,Family History of OCD
0,0.245614,1,0.702967,1,0
1,0.684211,0,0.529520,0,0
2,0.350877,1,0.629244,0,0
3,0.964912,1,0.647293,1,0
4,0.842105,0,0.470480,1,0
...,...,...,...,...,...
414,0.105263,0,0.537167,1,1
415,0.771930,0,0.210462,1,0
416,0.701754,0,0.705414,1,1
417,0.385965,1,0.517589,1,1


In [21]:
#check shape
X.shape

(419, 5)

In [22]:
#check type
type(X)

pandas.core.frame.DataFrame

# Regression

## Set target variable for regression model

First, I will train the regression model to predict the total score. So, the target variable is total score.

In [23]:
#regression
y = df['Total score']

In [24]:
#check y
y

0      27
1       7
2      28
3      20
4      12
       ..
414    24
415    35
416    11
417    17
418    23
Name: Total score, Length: 419, dtype: int64

In [25]:
#check y shape
y.shape

(419,)

## Train and test the regression model 

In [26]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [27]:
#partition of features and target for training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [28]:
#check the number of training and testing
X_train.shape

(335, 5)

In [29]:
y_train.shape

(335,)

In [30]:
X_test.shape

(84, 5)

In [31]:
y_test.shape

(84,)

## Linear Regression 

In [32]:
#train a linear regression model and test it with testing data
model = LinearRegression()
model.fit(X_train, y_train)

In [33]:
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)  # Mean Squared Error
r2 = r2_score(y_test, y_pred)  # R-squared (coefficient of determination)
print(f"Linear Regression - MSE: {mse}, R-squared: {r2}")

Linear Regression - MSE: 95.96467974320423, R-squared: -0.015458138217490758


## Training Different Regression Models

Training data is trained with different regression models, models are tested and performance metrics are compared. 

In [34]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor

In [35]:
import time

In [36]:
models = {
    "LinearRegression": LinearRegression(),  
    'RandomForest': RandomForestRegressor(),
    "DecisionTree": DecisionTreeRegressor(), 
    "GradientBoosting": GradientBoostingRegressor(),
    "KNN": KNeighborsRegressor(n_neighbors=5),
    "Multi-level perceptron": MLPRegressor(hidden_layer_sizes=(64, 32), activation='relu', solver='adam',learning_rate_init=0.001, max_iter=2000, random_state=42)
    
}

# iterate over the dictionary to view the name and the classifier object as follows:
for name, regressor in models.items():
    print("The name of the regressor is:", name, " and it is a sklearn object:", regressor)

The name of the regressor is: LinearRegression  and it is a sklearn object: LinearRegression()
The name of the regressor is: RandomForest  and it is a sklearn object: RandomForestRegressor()
The name of the regressor is: DecisionTree  and it is a sklearn object: DecisionTreeRegressor()
The name of the regressor is: GradientBoosting  and it is a sklearn object: GradientBoostingRegressor()
The name of the regressor is: KNN  and it is a sklearn object: KNeighborsRegressor()
The name of the regressor is: Multi-level perceptron  and it is a sklearn object: MLPRegressor(hidden_layer_sizes=(64, 32), max_iter=2000, random_state=42)


In [37]:
results = pd.DataFrame({
                        "regressor_name": [],
                        "training_time": [],
                        "prediction_time": [],
                        "mse": [],
                        'r2': []
                        })
#to view the currently empty dataframe
results

Unnamed: 0,regressor_name,training_time,prediction_time,mse,r2


In [38]:
for name, regressor in models.items():

    
    t_start = time.time()
    regressor.fit(X_train,y_train)
    t_end = time.time()
    training_time = t_end - t_start
    
    t_start = time.time()
    y_pred = regressor.predict(X_test) 
    t_end = time.time()
    prediction_time = t_end - t_start

    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    this_result = pd.DataFrame({
                    "regressor_name": [name],
                    "training_time": [training_time],
                    "prediction_time": [prediction_time],
                    "mse": [mse],
                    "r2": [r2]
                    })
    # Append this dataframe'this_result' to the main dataframe 'results'
    results = pd.concat([results, this_result], axis=0, ignore_index=True)
    
#By this way, at each time of the loop, classifier name, times and performance metrics of the classifier is added into the main 'results' dataframe.

In [39]:
results

Unnamed: 0,regressor_name,training_time,prediction_time,mse,r2
0,LinearRegression,0.003999,0.002997,95.96468,-0.015458
1,RandomForest,0.252859,0.009992,103.751529,-0.097855
2,DecisionTree,0.002998,0.000999,152.642857,-0.615203
3,GradientBoosting,0.100943,0.001998,108.667176,-0.149871
4,KNN,0.002998,0.001999,107.71,-0.139742
5,Multi-level perceptron,3.203162,0.001,94.145849,0.003788


# Classification

## Set target variable for classification model

To train a classification model to predict high, middle or low class of patient's Y-BOCS scores, target variable, 'class' is set up.

In [40]:
#classification
y = df['class']

In [41]:
#check y
y

0          high
1           low
2          high
3      moderate
4           low
         ...   
414        high
415        high
416         low
417    moderate
418    moderate
Name: class, Length: 419, dtype: object

In [42]:
#check shape of y
y.shape

(419,)

In [43]:
#partition of features and target for training and testing of classification model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [44]:
#check the number of training and testing
X_train.shape

(335, 5)

In [45]:
y_train.shape

(335,)

In [46]:
X_test.shape

(84, 5)

In [47]:
y_test.shape

(84,)

## Train and test the Classification models

Different classification models with different algorithms are trained on training set and tested on testing set. Performance of models are evaluation. 

First, logistic regression model is trained and tested. 

In [48]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, classification_report

# Create a Logistic Regression model for multiclass classification
logreg = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)

# Fit the model to the training data
logreg.fit(X_train, y_train)



In [49]:
# Predict on the test set
y_pred = logreg.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        high       0.61      0.49      0.54        35
         low       0.42      0.28      0.33        29
    moderate       0.24      0.45      0.32        20

    accuracy                           0.40        84
   macro avg       0.42      0.40      0.40        84
weighted avg       0.46      0.40      0.42        84



## Training Different Classification Models

Then, train different models, test and show show their performance.

In [50]:
# Import classes for training different classification algorithms
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
# Models dictionary: Multiclass classification algorithms
models = {
    "DT_gini": DecisionTreeClassifier(criterion='gini', max_depth=5),  # Decision tree with gini criterion and max depth of 5
    'DT_entropy': DecisionTreeClassifier(criterion='entropy', max_depth=5),  # Decision tree with entropy criterion and same max depth
    "RF_gini": RandomForestClassifier(n_estimators=30, criterion='gini', max_depth=5),  # Random forest with gini criterion, 30 estimators, and 5 max depth
    "RF_entropy": RandomForestClassifier(n_estimators=30, criterion='entropy', max_depth=5),  # Random forest with entropy criterion, same estimators, and max depth
    "KNN_3neighbours": KNeighborsClassifier(n_neighbors=3),  # KNN with 3 neighbors
    "KNN_5neighbours": KNeighborsClassifier(n_neighbors=5),  # KNN with 5 neighbors
    "SVM_rbf_kernel": SVC(kernel='rbf', decision_function_shape='ovr'),  # SVM with rbf kernel and one-vs-rest multiclass
    "SVM_linear_kernel": SVC(kernel='linear', decision_function_shape='ovr'),  # SVM with linear kernel and one-vs-rest multiclass
    'SGD_hinge_loss': SGDClassifier(loss="hinge", penalty="l2", max_iter=100),  # SGD with hinge loss, l2 penalty, and max iteration of 100
    'SGD_log_loss': SGDClassifier(loss="log_loss", penalty="l2", max_iter=100)  # SGD with logistic loss, l2 penalty, and max iteration of 100
}

In [51]:
for name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)
    
    # Predict on the test set
    y_pred = model.predict(X_test)
    
    # Evaluate the model
    print(f"\n{'='*40}\nModel: {name}\n{'='*40}")
    
    # Accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.4f}")
    
    # Precision, Recall, F1-Score (macro average)
    precision = precision_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred, average='macro')
    f1 = f1_score(y_test, y_pred, average='macro')
    print(f"Precision (macro): {precision:.4f}")
    print(f"Recall (macro): {recall:.4f}")
    print(f"F1 Score (macro): {f1:.4f}")
    
    # Confusion Matrix
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    
    # Classification Report
    print("Classification Report:")
    print(classification_report(y_test, y_pred))


Model: DT_gini
Accuracy: 0.3929
Precision (macro): 0.4001
Recall (macro): 0.3776
F1 Score (macro): 0.3776
Confusion Matrix:
[[14 17  4]
 [ 4 14 11]
 [ 5 10  5]]
Classification Report:
              precision    recall  f1-score   support

        high       0.61      0.40      0.48        35
         low       0.34      0.48      0.40        29
    moderate       0.25      0.25      0.25        20

    accuracy                           0.39        84
   macro avg       0.40      0.38      0.38        84
weighted avg       0.43      0.39      0.40        84


Model: DT_entropy
Accuracy: 0.3929
Precision (macro): 0.3769
Recall (macro): 0.3729
F1 Score (macro): 0.3693
Confusion Matrix:
[[19  9  7]
 [ 8  8 13]
 [11  3  6]]
Classification Report:
              precision    recall  f1-score   support

        high       0.50      0.54      0.52        35
         low       0.40      0.28      0.33        29
    moderate       0.23      0.30      0.26        20

    accuracy                