OCD Patient Dataset: Demographics & Clinical Data Analysis

Import the Libraries and Read the File

In [147]:
# import the necessary library

import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.impute import SimpleImputer
import xgboost as xgb
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier, RidgeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, HistGradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.dummy import DummyClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

In [148]:
# Location of the raw OCD patient CSV export (absolute, machine-specific path)
DATA_PATH = r"D:\OCD Patient Dataset_ Demographics & Clinical Data.csv"

# Load the raw dataset from disk
data = pd.read_csv(DATA_PATH)

In [149]:
# Wrap the loaded table in a DataFrame
# (pd.read_csv already returns a DataFrame, so the contents are unchanged)

data = pd.DataFrame(data=data)

In [None]:
# Preview the first five rows of the dataset

data.head(n=5)

In [None]:
# Show the dataset dimensions as (number of rows, number of columns)

data.shape

In [None]:
# Show summary statistics (count, mean, std, min, quartiles, max) of the numeric columns

data.describe()

In [None]:
# Print the column names, non-null counts, dtypes and memory usage of the dataset

data.info()

In [None]:
# List the column labels of the dataset

data.columns

Data Preprocessing

1. Data Cleaning

In [None]:
# Count the missing values in every column.
# Note: columns that report missing values have to be either dropped or filled.

data.isna().sum()

In [None]:
# Fill the missing values of the two free-text columns with the placeholder 'unknown'.
# The filled columns are assigned back explicitly: data[[...]] returns a new frame,
# so calling fillna(..., inplace=True) on it only changes that temporary copy and
# leaves `data` untouched.

fill_columns = ['Previous Diagnoses', 'Medications']
data[fill_columns] = data[fill_columns].fillna('unknown')

2. Data Type Conversion

In [None]:
# Preview the object-typed columns cast to the categorical dtype.
# The converted frame is only displayed as the cell output; it is not assigned
# back, so `data` keeps its original dtypes.

categorical_preview_columns = [
    'Gender',
    'Ethnicity',
    'Marital Status',
    'Education Level',
    'OCD Diagnosis Date',
    'Previous Diagnoses',
    'Family History of OCD',
    'Obsession Type',
    'Compulsion Type',
    'Depression Diagnosis',
    'Anxiety Diagnosis',
    'Medications',
]
data[categorical_preview_columns].astype('category')

Data Visualisation

In [None]:
# Count plots for Gender and Ethnicity.
# An explicit figure and explicit axes replace the implicit pyplot state, so the
# figure has a defined size, every panel carries a title, and the cell ends with
# plt.show() instead of echoing the return value of plt.xticks().
palette = sns.color_palette("viridis")

fig = plt.figure(figsize = (14, 4))

# (slot in the 1x3 grid, column to count, rotation of the x tick labels);
# slot 2 stays empty, as in the original layout
for position, column, rotation in [(1, "Gender", 30), (3, "Ethnicity", 30)]:
    ax = fig.add_subplot(1, 3, position)
    sns.countplot(x = data[column], palette = palette, ax = ax)
    ax.set_title(column)
    ax.tick_params(axis = "x", labelrotation = rotation)

plt.show()

In [None]:
# Count plots for Marital Status and Education Level.
# An explicit figure and explicit axes replace the implicit pyplot state, so the
# figure has a defined size, every panel carries a title, and the cell ends with
# plt.show() instead of echoing the return value of plt.xticks().
palette = sns.color_palette("viridis")

fig = plt.figure(figsize = (14, 4))

# (slot in the 1x3 grid, column to count, rotation of the x tick labels);
# slot 2 stays empty, as in the original layout
for position, column, rotation in [(1, "Marital Status", 30), (3, "Education Level", 30)]:
    ax = fig.add_subplot(1, 3, position)
    sns.countplot(x = data[column], palette = palette, ax = ax)
    ax.set_title(column)
    ax.tick_params(axis = "x", labelrotation = rotation)

plt.show()

In [None]:
# Count plots for Family History of OCD and Obsession Type.
# An explicit figure and explicit axes replace the implicit pyplot state, so the
# figure has a defined size, every panel carries a title, and the cell ends with
# plt.show() instead of echoing the return value of plt.xticks().
palette = sns.color_palette("magma")

fig = plt.figure(figsize = (14, 4))

# (slot in the 1x3 grid, column to count, rotation of the x tick labels);
# slot 2 stays empty, as in the original layout
for position, column, rotation in [(1, "Family History of OCD", 30), (3, "Obsession Type", 90)]:
    ax = fig.add_subplot(1, 3, position)
    sns.countplot(x = data[column], palette = palette, ax = ax)
    ax.set_title(column)
    ax.tick_params(axis = "x", labelrotation = rotation)

plt.show()

In [None]:
# Count plots for Compulsion Type and Depression Diagnosis.
# An explicit figure and explicit axes replace the implicit pyplot state, so the
# figure has a defined size, every panel carries a title, and the cell ends with
# plt.show() instead of echoing the return value of plt.xticks().
palette = sns.color_palette("magma")

fig = plt.figure(figsize = (14, 4))

# (slot in the 1x3 grid, column to count, rotation of the x tick labels);
# slot 2 stays empty, as in the original layout
for position, column, rotation in [(1, "Compulsion Type", 90), (3, "Depression Diagnosis", 90)]:
    ax = fig.add_subplot(1, 3, position)
    sns.countplot(x = data[column], palette = palette, ax = ax)
    ax.set_title(column)
    ax.tick_params(axis = "x", labelrotation = rotation)

plt.show()

Pre-processing: Encoding the Categorical Columns

In [162]:
# Text-valued columns that are replaced by integer codes further down in this cell
columns_to_encode = [
    "Gender",
    "Ethnicity",
    "Marital Status",
    "Education Level",
    "Previous Diagnoses",
    "Family History of OCD",
    "Obsession Type",
    "Compulsion Type",
    "Depression Diagnosis",
    "Anxiety Diagnosis",
    "Medications",
]
data_col = data[columns_to_encode]

# Working frame that the encoding loop reads the original values from
df = pd.DataFrame(data_col)

def map_column_to_numbers(column, counter):
    """Replace the distinct values of ``column`` by consecutive integer codes.

    Codes are handed out in the order of ``column.value_counts().index``,
    starting at ``counter``. A value that does not appear in that index
    receives no code, so ``Series.map`` leaves it as NaN in the result.

    Parameters
    ----------
    column : pandas.Series
        Column whose values are encoded.
    counter : int
        Code assigned to the first distinct value.

    Returns
    -------
    tuple
        ``(encoded_column, next_counter)``, where ``next_counter`` is
        ``counter`` advanced by the number of codes that were assigned.
    """
    distinct_values = column.value_counts().index
    codes = {
        value: code
        for code, value in enumerate(distinct_values, start=counter)
    }
    return column.map(codes), counter + len(codes)

# Encode every selected column independently; the codes restart at 0 for each
# column and the returned running counter is not carried over to the next one
for column_name in df.columns:
    encoded_column, _next_code = map_column_to_numbers(df[column_name], 0)
    data[column_name] = encoded_column

# The diagnosis date is not used from here on
data = data.drop(columns=['OCD Diagnosis Date'])

In [None]:
# Report the skewness and kurtosis of the encoded 'Previous Diagnoses' column

previous_diagnoses = data['Previous Diagnoses']
skewness = previous_diagnoses.skew()
kurtosis = previous_diagnoses.kurt()

print("Skewness: %f" % skewness)
print("Kurtosis: %f" % kurtosis)

Training and Visualizing the data

In [164]:
# Impute the missing 'Previous Diagnoses' values with the column mean.
# NOTE(review): at this point the column holds integer category codes, so the
# mean is generally not a valid code — confirm whether strategy='most_frequent'
# is the intended behaviour.

num_cols = ['Previous Diagnoses']
num_imp = SimpleImputer(strategy='mean')

# Assign the imputed array directly (row by row). Wrapping it in a new DataFrame
# first gives it a fresh 0..n-1 index, and the assignment is then aligned on
# index labels — which silently mismatches as soon as `data` has any other index.
data[num_cols] = num_imp.fit_transform(data[num_cols])

In [165]:
# Discard every row that still contains at least one missing value
data = data.dropna()

In [None]:
# Preview the first five rows of the encoded and cleaned data

data.head(n=5)

In [167]:
# Names of the feature columns: every column of `data` except 'Medications'

train_feature = data.drop(columns=['Medications']).columns.tolist()

In [None]:
# Summary statistics of the feature columns, transposed to one row per feature
feature_summary = data[train_feature].describe().T

# Styled view: bar on the mean, colour gradients on the std and the median (50%)
(
    feature_summary.style
    .bar(subset=['mean'], color=px.colors.qualitative.G10[0])
    .background_gradient(subset=['std'], cmap='BuPu')
    .background_gradient(subset=['50%'], cmap='Reds')
)

In [None]:
# One 50-bin histogram per feature, each drawn in a figure of its own
for feat in train_feature:
    plt.figure(figsize=(15, 3))
    plt.subplot(1, 2, 1)  # first slot of a 1x2 grid
    data[feat].plot.hist(bins=50, color='Green')
    plt.title(feat + ' / train')
    plt.show()

In [None]:
# Report the skewness and kurtosis of every column.
# The numbers are formatted entirely inside the f-string: applying the % operator
# to an f-string that already contains the column name fails as soon as a column
# name itself contains a '%' character.
for col in data.columns:
    print(f"Skewness of {col} : {data[col].skew():f}")
    print(f"Kurtosis of {col} : {data[col].kurt():f}")

Feature Selection, Training the model and Visualisation of Data

In [None]:
# Split the frame into the feature matrix and the 'Medications' target
x = data.drop(columns=['Medications'])
y = data['Medications']

# Fit the estimators whose feature importances are inspected below
model = [xgb.XGBClassifier()]
model = [estimator.fit(x, y) for estimator in model]

for estimator in model:
    # Label by class name: slicing str(estimator) to a hard-coded width cut the
    # name short ("XGBClassifie") and needed one magic number per estimator.
    estimator_name = type(estimator).__name__
    print(estimator_name + ': \n', estimator.feature_importances_)

    # Show the 15 most important features, largest importance first in the ranking
    feat_importances = pd.Series(estimator.feature_importances_, index = x.columns)
    feat_importances.nlargest(15).plot(kind='barh', color='royalblue')

    plt.title(estimator_name + ' feature importances')
    plt.xlim(0, 0.5)
    plt.show()

In [None]:
# Pearson correlation between all columns of the encoded dataset
corr = data.corr(method='pearson')

fig, ax = plt.subplots(figsize=(15, 15))

# Draw on the axes created above. seaborn takes the tick labels from the row and
# column labels of `corr` and centres them on the cells; overriding the ticks at
# integer positions moved the labels from the cell centres onto the cell edges.
sns.heatmap(
    corr,
    cmap='RdBu',
    annot=True,
    fmt=".2f",
    xticklabels=True,   # label every column instead of letting seaborn thin them out
    yticklabels=True,
    ax=ax,
)
ax.set_title("Pearson correlation between columns")

plt.show()

In [174]:
# Split the frame into the feature matrix and the 'Medications' target
x = data.drop(columns=['Medications'])
y = data['Medications']

# Bind the scaler instance to its own name. Assigning it to `MinMaxScaler`
# shadows the imported class, so running this cell a second time fails with
# "'MinMaxScaler' object is not callable".
scaler = MinMaxScaler()

# Rebuild the scaled frame with the original labels so the features keep their
# column names and share the same index as `y`.
# NOTE(review): the scaler is fitted on all rows before the train/test split, so
# the value ranges of the test rows influence the scaling — consider fitting it
# on the training split only.
x = pd.DataFrame(scaler.fit_transform(x), columns=x.columns, index=x.index)

Data Modeling

In [175]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=2019,
)

In [None]:
# Print the shape of each part of the train/test split

split_parts = [
    ("Shape of x_train: ", x_train),
    ("Shape of x_eval: ", x_test),
    ("Shape of y_train: ", y_train),
    ("Shape of y_eval", y_test),
]
for caption, part in split_parts:
    print(caption, part.shape)

In [177]:
# Seed shared by every estimator that has a random component, so that repeated
# runs of the notebook produce the same scores
SEED = 2019

# Base estimators that are evaluated individually and combined in the voting ensemble
model_1 = SVC()
model_2 = LogisticRegression()
model_3 = SGDClassifier(random_state=SEED)
model_4 = xgb.XGBClassifier(objective='multi:softmax', random_state=SEED)
model_5 = KNeighborsClassifier()
model_6 = RandomForestClassifier(random_state=SEED)
model_7 = ExtraTreesClassifier(random_state=SEED)
model_8 = HistGradientBoostingClassifier(random_state=SEED)

In [178]:
# Majority-vote ('hard' voting) ensemble over the eight base estimators
voting_members = [
    ('SVM', model_1),
    ('Log', model_2),
    ('SGD', model_3),
    ('XGBoost', model_4),
    ('KNeighbors', model_5),
    ('RandomForest', model_6),
    ('ExtraTrees', model_7),
    ('HistGradientBoosting', model_8),
]
model = VotingClassifier(estimators=voting_members, voting='hard')

In [179]:
# Estimators to cross-validate and their display labels, kept in the same order.
# The two lists used to be misaligned: model_4 (XGBoost) was missing and the
# voting ensemble had no label of its own, so zip() paired every estimator after
# 'SGD' with the name of a different model.
model_list = [model_1, model_2, model_3, model_4, model_5, model_6, model_7, model_8, model]
model_name = ['SVC', 'Log', 'SGD', 'XGBoost', 'KNeighbors', 'RandomForest', 'ExtraTrees', 'HistGradientBoosting', 'Voting']

In [None]:
# 5-fold cross-validated accuracy (mean and standard deviation) of each listed
# estimator on the training split
for clf, label in zip(model_list, model_name):
    scores = cross_val_score(clf, x_train, y_train, scoring='accuracy', cv=5)
    mean_accuracy = scores.mean()
    accuracy_std = scores.std()
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (mean_accuracy, accuracy_std, label))

In [181]:
# Fit every base estimator on the training split. fit() trains the estimator in
# place and returns that same estimator, so the names model_1 ... model_8 keep
# referring to the fitted objects.
base_estimators = (
    model_1,
    model_2,
    model_3,
    model_4,
    model_5,
    model_6,
    model_7,
    model_8,
)
for base_estimator in base_estimators:
    base_estimator.fit(x_train, y_train)

# Fit the voting ensemble and keep the fitted ensemble under the name `Voting`
Voting = model.fit(x_train, y_train)

In [None]:
# Predict the held-out rows with the fitted voting ensemble and compute the
# fraction of correct predictions
y_pred = Voting.predict(x_test)
score = accuracy_score(y_true=y_test, y_pred=y_pred)

# Report the accuracy as a percentage

accuracy_percent = score * 100
print("Voting accuracy is: {0:.3f}%".format(accuracy_percent))

In [183]:
# Confusion matrix of the voting ensemble on the held-out rows
# (rows: true classes, columns: predicted classes)

cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

Data Visualisation

In [None]:
# Heat map of the confusion matrix of the voting ensemble.
# The axis labels name the actual target: the predicted quantity in this notebook
# is the encoded 'Medications' column, so the former "Digits" wording (left over
# from a different example) mislabelled the plot.
plt.figure(figsize = (8, 4))
sns.heatmap(cm, annot=True, fmt='.0f')
plt.xlabel("Predicted Medication")
plt.ylabel("True Medication")
plt.title("Voting classifier confusion matrix")
plt.show()