In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Load the OCD patient dataset; the relative path is resolved against the
# notebook's working directory.
DATA_PATH = 'ocd_patient_dataset.csv'
df = pd.read_csv(DATA_PATH)

<h1 style="font-size: 36px; font-family: Italic; font-weight: normal; background-color: black; color: pink; text-align: center; border-radius: 50px 55px; font-style: italic;">OCD Demographic Clinical Dataset</h1>

__SUMMARY__ : The dataset includes detailed information on patients diagnosed with OCD, encompassing demographic details, OCD diagnosis date, symptom duration, previous diagnoses, family history, obsession and compulsion types, Y-BOCS scores, comorbidities, and prescribed medications.
A variety of OCD symptom presentations are evident, with diverse Y-BOCS scores for obsessions and compulsions, and comorbid conditions including major depressive disorder (MDD), PTSD, panic disorder, and generalized anxiety disorder (GAD).
Medication usage spans SNRIs, SSRIs, and benzodiazepines, reflecting the diverse pharmacological management approaches employed for OCD patients.

`Glossary`:__Y-BOCS__ stands for Yale-Brown Obsessive Compulsive Scale. It is a widely used clinician-administered scale to assess the severity of obsessive-compulsive disorder (OCD) symptoms. The scale evaluates the severity of obsessions and compulsions, and it is often used to monitor changes in symptoms over the course of treatment.

In [None]:
# Display the DataFrame as loaded, for a first look at the raw data.
df

In [None]:
# Keep only the first occurrence of each row; a row counts as a repeat when
# every column matches an earlier row.
is_repeat = df.duplicated()
df = df[~is_repeat]

In [None]:
# Remove the 'Patient ID' column from the DataFrame
df = df.drop('Patient ID', axis='columns')


In [None]:
# Replace missing values with 0 in the symptom-duration and Y-BOCS score columns.
# 'Patient ID' is not listed: that column is dropped in an earlier cell, so an
# entry for it would have no effect.
# NOTE(review): a filled-in 0 is indistinguishable from a genuine 0 score or
# duration; confirm this is the intended treatment for these columns.
zero_fill_values = {
    'Duration of Symptoms (months)': 0,
    'Y-BOCS Score (Obsessions)': 0,
    'Y-BOCS Score (Compulsions)': 0,
}
df = df.fillna(zero_fill_values)
df

This code prints the dimensions of the DataFrame df, i.e., the number of rows and columns in the DataFrame.

In [None]:

# Report the DataFrame's dimensions as (rows, columns)
n_rows, n_cols = df.shape
print((n_rows, n_cols))


This code calculates the number of null or missing values in each column of the DataFrame 

In [None]:
# Count the missing (null) entries in each column
df.isna().sum()



This code finds the mode (the most frequently occurring value) of the 'Previous Diagnoses' column and of the 'Medications' column, stores each in its own variable, and then fills the missing values in both columns with the corresponding mode. The `inplace=True` parameter is intended to make the changes directly on the DataFrame.

In [None]:
# Impute missing categorical values with each column's most frequent value.
#
# The filled Series is assigned back to the column rather than calling
# df[col].fillna(..., inplace=True): the in-place form works on an
# intermediate object (chained assignment), which raises a FutureWarning in
# pandas 2.x and leaves `df` unchanged when Copy-on-Write is enabled.

# Most frequent value of 'Previous Diagnoses' (mode() may return several
# values on a tie; the first one is used)
previous_diagnoses_mode = df['Previous Diagnoses'].mode()[0]

# Most frequent value of 'Medications'
medications_mode = df['Medications'].mode()[0]

# NOTE(review): if a missing entry in these columns actually means "no previous
# diagnosis" / "no medication", filling with the mode invents a diagnosis or a
# prescription for those patients; confirm against the raw CSV.
df['Previous Diagnoses'] = df['Previous Diagnoses'].fillna(previous_diagnoses_mode)
df['Medications'] = df['Medications'].fillna(medications_mode)
df

This code removes leading whitespace (spaces at the beginning of a string) from the specified columns of the DataFrame `df` using the `str.lstrip()` method.

In [None]:

# Strip leading whitespace from each of the nine text columns listed here
text_columns = [
    'Education Level',
    'OCD Diagnosis Date',
    'Previous Diagnoses',
    'Family History of OCD',
    'Obsession Type',
    'Compulsion Type',
    'Depression Diagnosis',
    'Anxiety Diagnosis',
    'Medications',
]
for column_name in text_columns:
    df[column_name] = df[column_name].str.lstrip()

Here the Gender column is converted into numeric values: 1 for female and 0 for any other value (i.e. male).

In [None]:
def _gender_to_code(label):
    """Return 1 when ``label`` is 'female' (compared case-insensitively), else 0."""
    if label.lower() == 'female':
        return 1
    return 0


# Encode 'Gender' numerically: 'female' -> 1, every other value -> 0
df['Gender'] = df['Gender'].map(_gender_to_code)
df['Gender']


Converting the categorical __Previous Diagnoses__, __Education Level__, __Ethnicity__ and __Marital Status__ columns into numerical values.


In [None]:
# Integer code for each previous-diagnosis label (the code is the label's
# position in the list: MDD=0, GAD=1, Panic Disorder=2, PTSD=3)
previous_diagnoses_mapping = {
    label: code
    for code, label in enumerate(['MDD', 'GAD', 'Panic Disorder', 'PTSD'])
}

# Replace every label in 'Previous Diagnoses' with its code; values that are
# not keys of the mapping come out as NaN
df['Previous Diagnoses'] = df['Previous Diagnoses'].map(previous_diagnoses_mapping)


In [None]:
# Integer code for each education level (the code is the label's position in
# the list: High School=0 ... Graduate Degree=3)
education_level_mapping = {
    label: code
    for code, label in enumerate(
        ['High School', 'Some College', 'College Degree', 'Graduate Degree']
    )
}

# Replace every label in 'Education Level' with its code; values that are not
# keys of the mapping come out as NaN
df['Education Level'] = df['Education Level'].map(education_level_mapping)

In [None]:
# Integer code for each ethnicity label (the code is the label's position in
# the list: Caucasian=0, Asian=1, African=2, Hispanic=3)
ethnicity_mapping = {
    label: code
    for code, label in enumerate(['Caucasian', 'Asian', 'African', 'Hispanic'])
}

# Replace every label in 'Ethnicity' with its code; values that are not keys
# of the mapping come out as NaN
df['Ethnicity'] = df['Ethnicity'].map(ethnicity_mapping)


In [None]:
# Integer code for each marital status (the code is the label's position in
# the list: Single=0, Married=1, Divorced=2)
marital_status_mapping = {
    label: code
    for code, label in enumerate(['Single', 'Married', 'Divorced'])
}

# Replace every label in 'Marital Status' with its code; values that are not
# keys of the mapping come out as NaN
df['Marital Status'] = df['Marital Status'].map(marital_status_mapping)

Creating a customized count plot using seaborn to visualize the distribution of obsession types across the two genders.

In [None]:
# Apply the grid style and background colours BEFORE creating the figure:
# matplotlib reads the facecolor settings when a figure/axes is created, so
# setting them after plotting only affects later plots, not this one.
sns.set(style="whitegrid", rc={'axes.facecolor':'lightblue', 'figure.facecolor':'lightblue'})

# Create the figure
plt.figure(figsize=(8, 6))

# Count plot of obsession types per gender. 'Gender' holds the integer codes
# assigned in the encoding cell (1 = female, 0 = otherwise).
plot = sns.countplot(data=df, x='Gender', hue='Obsession Type', palette='Oranges')

# Add a title
plt.title('Obsession Type by Gender')

# Add data labels, one per bar. Labelling through the bar containers (instead
# of iterating plot.patches) avoids stray labels: plot.patches may also hold
# rectangles that are not bars, e.g. legend helper patches in recent seaborn.
for bars in plot.containers:
    plot.bar_label(bars, fmt='%.0f', padding=3)

# Show the plot
plt.show()


Obsession type by ethnicity, shown as a count plot with a legend to make it easier to read.

In [None]:
# Set the grid style and the background colours in ONE call, before the figure
# is created. Calling sns.set(style="whitegrid") after sns.set(rc=...) would
# reset the axes facecolor back to the style's default and undo the colours.
sns.set(style="whitegrid", rc={'axes.facecolor':'lightblue', 'figure.facecolor':'lightblue'})

# Create the figure
plt.figure(figsize=(10, 6))

# Count plot of obsession types per ethnicity. 'Ethnicity' holds the integer
# codes assigned by ethnicity_mapping earlier in the notebook.
plot = sns.countplot(data=df, x='Ethnicity', hue='Obsession Type', palette='coolwarm')

# Add a title
plt.title('Obsession Type by Ethnicity')

# Add data labels, one per bar. Labelling through the bar containers (instead
# of iterating plot.patches) avoids stray labels: plot.patches may also hold
# rectangles that are not bars, e.g. legend helper patches in recent seaborn.
for bars in plot.containers:
    plot.bar_label(bars, fmt='%.0f', padding=3)

# Display the plot
plt.show()

Obsession and compulsion types with reference to age, using a swarm plot.

In [None]:
# Light-blue background for both the axes and the surrounding figure
background = {'axes.facecolor':'lightblue', 'figure.facecolor':'lightblue'}
sns.set(rc=background)

# Swarm plot: one point per patient, age on the y-axis, grouped by obsession
# type and coloured by compulsion type
fig, ax = plt.subplots(figsize=(10, 6))
sns.swarmplot(data=df, x='Obsession Type', y='Age', hue='Compulsion Type', palette='icefire', ax=ax)

# Title and axis labels
ax.set_title('Obsession and Compulsion Types by Age')
ax.set_xlabel('Obsession Type')
ax.set_ylabel('Age')

# Legend for the hue (compulsion type)
ax.legend(title='Compulsion Type', title_fontsize='13', loc='upper right')

# Render the figure
plt.show()

Depression and Anxiety Diagnoses by Gender

In [None]:

# Hard-coded illustrative counts. These are NOT derived from the patient dataset.
# They are stored under their own name so the patient DataFrame `df` is left
# intact; assigning this dict to `df` would break every later cell that reads
# the dataset (e.g. df[['Age', 'Medications']]).
# NOTE(review): consider replacing these figures with counts computed from the
# 'Depression Diagnosis' / 'Anxiety Diagnosis' columns of the real data.
sample_counts = {
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female'],
    'Diagnosis': ['Depression', 'Depression', 'Anxiety', 'Anxiety', 'Depression', 'Anxiety'],
    'Count': [20, 30, 25, 35, 18, 22]
}
# Create a DataFrame from the sample data
df_sample = pd.DataFrame(sample_counts)

# Set the background color
sns.set(rc={'axes.facecolor':'grey', 'figure.facecolor':'grey'})

# Create the grouped bar plot from the sample frame. Male/Depression and
# Female/Anxiety each appear twice above, so barplot aggregates those rows
# (mean by default) rather than showing a single raw count.
plt.figure(figsize=(10, 6))
plot = sns.barplot(data=df_sample, x='Gender', y='Count', hue='Diagnosis', palette='pastel')

# Add a title
plt.title('Distribution of Depression and Anxiety Diagnosis by Gender')

# Display the plot
plt.show()

A scatter plot of age against medications.

In [None]:
# Take a reproducible random subset of 100 rows (fixed seed), keeping only the
# two columns used in the scatter plot
sample_data = df.loc[:, ['Age', 'Medications']].sample(n=100, random_state=42)

# Show the first rows of the subset as a sanity check
print(sample_data.head())


In [None]:

# Scatter plot of medication against age for the sampled rows, with the
# points coloured by age
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=sample_data, x='Age', y='Medications', hue='Age', palette='icefire', ax=ax)
ax.set_title('Age vs Medications Scatter Plot')
ax.set_xlabel('Age')
ax.set_ylabel('Medications')
plt.show()

Distribution of Anxiety Diagnoses by Education level

In [None]:


# Set the background color (done before the figure is created so that it
# applies to this plot)
sns.set(rc={'axes.facecolor':'grey', 'figure.facecolor':'lightblue'})


# Create the count plot for anxiety diagnosis by education level.
# 'Education Level' holds the integer codes assigned by education_level_mapping
# earlier in the notebook.
plt.figure(figsize=(10, 6))
plot = sns.countplot(data=df, x='Education Level',hue='Anxiety Diagnosis', palette='Oranges')

# Add a title
plt.title('Distribution of Anxiety Diagnosis by Education Level')

# Add axis titles
plt.xlabel('Education Level')
plt.ylabel('Count of Diagnosis')

# Add data labels, one per bar. Labelling through the bar containers (instead
# of iterating plot.patches) avoids stray labels: plot.patches may also hold
# rectangles that are not bars, e.g. legend helper patches in recent seaborn.
for bars in plot.containers:
    plot.bar_label(bars, fmt='%.0f', padding=3)


# Display the plot
plt.show()


__SUMMARY__: 

1. Both women and men have harm-related obsessions, while more women tend towards hoarding.
2. Caucasians and Hispanics tend more towards the Contamination type of OCD, while Asians tend mostly towards Harm-related OCD. Africans have Religious and Hoarding problems.
3. Most of the obsession and compulsion types are found between the ages of 50 and 60, with Harm-related and Hoarding between the ages of 30 and 40.
4. From ages 20 to 30 and 60 to 70, Benzodiazepine is the most used medication and SSRI is the least used.
5. Men have less depression and more anxiety, while women have more depression than anxiety. (Note: this point is read from the chart built on hard-coded sample counts, not from the patient dataset.)

