# Predicting Psychiatric Diagnosis Using Machine Learning: A Multi-Model Approach

### This Reasearch Project aim to use clinical data to predict the diagnosis (DIAGN) using various machine learning models, compare performance, and interpret the results

In [3]:
import matplotlib.pyplot as plt
import pandas as pd
import pylab as pl
import numpy as np
%matplotlib inline
print('modules are imported')

modules are imported


### Let's import our dataset

In [4]:
df = pd.read_excel(r'C:\Users\USER\Desktop\Research_Focus\Clinical_data.xlsx')

# take a look at the dataset
df.head()

Unnamed: 0,YEAR,AGE,SEX,OCCUP,MAR_STA,DUR_EPIS,P_PSY_HX,P_MED_HX,FAM_P_HX,P_SOC_HX,...,INT_GFK,INT_S_A_D,INT_CAL,INT_PROV,JUDGMT,INSIGHT,PSE,EEG,DIAGN,CLASS
0,2017,26,F,NURSE,WIDOW,0.5,RAPE,DIABETES,NO,YES,...,,,,,,PARTIAL,GOOD,NORMAL,PARANOID SCHIZ,SCHIZ
1,2016,28,F,UNEMPLOYED,MARRIED,48.0,,,,,...,,,,,POOR,POOR,GOOD,NORMAL,PARANOID SCHIZ,SCHIZ
2,2016,26,M,UNEMPLOYED,SINGLE,1.0,NO,SHORT-SIGHT,NO,NO,...,,,,,POOR,PARTIAL,GOOD,NORMAL,PARANOID SCHIZ,SCHIZ
3,2016,20,M,STUDENT,SINGLE,2.0,,,,YES,...,,,,,,,GOOD,NORMAL,PARANOID SCHIZ,SCHIZ
4,2016,34,F,UNEMPLOYED,SINGLE,6.0,,,,,...,,,,,,POOR,GOOD,NORMAL,PARANOID SCHIZ,SCHIZ


In [5]:
### Let's inspect our datasets
df.info()
df.describe()
df['DIAGN'].value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 664 entries, 0 to 663
Data columns (total 38 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   YEAR        664 non-null    int64 
 1   AGE         664 non-null    int64 
 2   SEX         664 non-null    object
 3   OCCUP       654 non-null    object
 4   MAR_STA     659 non-null    object
 5   DUR_EPIS    655 non-null    object
 6   P_PSY_HX    642 non-null    object
 7   P_MED_HX    630 non-null    object
 8   FAM_P_HX    639 non-null    object
 9   P_SOC_HX    644 non-null    object
 10  P_SEX_HX    620 non-null    object
 11  FOR_HX      637 non-null    object
 12  PREMOBD_HX  623 non-null    object
 13  MSE         658 non-null    object
 14  SPEECH      643 non-null    object
 15  MOOD        594 non-null    object
 16  AFFECT      604 non-null    object
 17  TH_FORM     490 non-null    object
 18  TH_STRM     476 non-null    object
 19  TH_CONTENT  464 non-null    object
 20  TH_POSS   

DIAGN
PARANOID SCHIZ                          143
DEPRESSIVE EPISODE                       37
HEBEPHRENIC SCHIZ                        32
SIMPLE SCHIZOPHRENIA                     19
BIPOLAR AFFECTION DISORDER               17
SUBSTANCE USE                            10
DISORGANIZED SCHIZOPHRENIA               10
REMISSION                                 9
ACUTE PSYCHOSIS                           9
SCHIZOAFFECTIVE DISORDER                  8
ORGANIC MENTAL DISORDER                   8
DEPRESSIVE                                7
DEPRESSIVE DISORDER                       7
DEPRESSION                                6
MBD                                       5
RELAPSED SCHIZOPHRENIA                    5
MENTAL&BEHAVIOURAL DISORDER               5
RESIDUAL SCHIZOPHRENIA                    4
INTELLECTUAL DIABILITY                    4
SEIZURE DISORDER                          4
UNDIFFERENTIATED SCHIZOPHRENIA            4
COMPLEX PARTIAL SEIZURE                   4
DEMENTIAL                 

### Handling Missing Data
#### We impute or drop columns/rows depending on missingness percentage

In [6]:
# Show columns with missing values
missing = df.isnull().sum()
missing[missing > 0].sort_values(ascending=False)

TH_POSS       380
INT_PROV      345
INT_CAL       337
INT_S_A_D     325
INT_GFK       297
DIAGN         249
TH_CONTENT    200
PERCEP        196
TH_STRM       188
MEM_ST        181
TH_FORM       174
MEM_LT        171
MEM_IR        155
CONC          120
ORIENT        115
ATTEN         109
JUDGMT         90
INSIGHT        73
MOOD           70
AFFECT         60
P_SEX_HX       44
PREMOBD_HX     41
P_MED_HX       34
PSE            27
FOR_HX         27
FAM_P_HX       25
P_PSY_HX       22
SPEECH         21
P_SOC_HX       20
EEG            19
OCCUP          10
DUR_EPIS        9
MSE             6
MAR_STA         5
dtype: int64

### Drop Irrecoverable Rows or Columns

In [7]:
# Drop rows with more than 50% missing values
threshold_row = df.shape[1] * 0.5
df = df[df.isnull().sum(axis=1) < threshold_row]

# Drop columns with more than 50% missing
threshold_col = df.shape[0] * 0.5
df = df.loc[:, df.isnull().sum(axis=0) < threshold_col]

df.head()
df.shape



(624, 37)

### Using AI-Driven Imputation Methods rather than the conventional 'mode' and 'median' approach. This approach involves training AI models to impute missing categorical data by learning from patterns in the complete data

### Grouping features into Thematic sets. 
#### Instead of imputing all the missing values at once; we analyze the data using smaller clinically related groups. Imputing each block separately helps to give better signal, better dimensionality, and higher completeness.

 1. Mental Status Examination (MSE) Group:
These assess the patient’s current mental state, such as mood, thought process, perception, memory, attention, etc.
Features:                                             
MOOD, AFFECT, INSIGHT, TH_FORM, TH_CONTENT, TH_STRM, PERCEP, SPEECH, MEM_LT, MEM_ST, MEM_IR,
CONC, ORIENT, ATTEN, JUDGMT                              |
2.  Psychiatric & Personal History Group:
Captures the patient's clinical, social, and personal history. 
Features: P_SEX_HX, P_MED_HX, PREMOBD_HX, P_PSY_HX, P_SOC_HX, FOR_HX, FAM_P_HX, EEG
3. Investigative / Clinical Interview Group:
These are insights or scores from clinical interviews or structured diagnostic assessments.
Features: TH_POSS, INT_PROV, INT_CAL, INT_S_A_D, INT_GFK
4. Diagnosis & Functionality Group:
These reflect final outcomes, functioning, and high-level status markers.
Features: DIAGN, MSE, DUR_EPIS, MAR_STA, OCCUP
Note: MSE here is a summary score or classification, not the full exam.

In [9]:
# Group 1: MSE Features
mse_cols = [
    'MOOD', 'AFFECT', 'INSIGHT', 'TH_FORM', 'TH_CONTENT', 'TH_STRM',
    'PERCEP', 'SPEECH', 'MEM_LT', 'MEM_ST', 'MEM_IR',
    'CONC', 'ORIENT', 'ATTEN', 'JUDGMT'
]

# Group 2: History Features
history_cols = [
    'P_SEX_HX', 'P_MED_HX', 'PREMOBD_HX', 'P_PSY_HX',
    'P_SOC_HX', 'FOR_HX', 'FAM_P_HX', 'EEG'
]

# Group 3: Clinical Interviews / Investigative
interview_cols = [
     'INT_PROV', 'INT_CAL', 'INT_S_A_D', 'INT_GFK'
]

# Group 4: Diagnosis and Outcome
diagnosis_cols = [
    'DIAGN', 'MSE', 'DUR_EPIS', 'MAR_STA', 'OCCUP'
]


### We can then extract these blocks for cleaning, imputation, and modeling.

In [10]:
df_mse = df[mse_cols]
df_hist = df[history_cols]
df_inv = df[interview_cols]
df_diag = df[diagnosis_cols]


In [11]:
df.to_excel("cleaned_clinical_dataset.xlsx", index=False)

In [12]:
import os
os.getcwd()  # Shows your current directory


'C:\\Users\\USER'

We’ll use:

Mode imputation for columns with low missingness (<10%)

RandomForestClassifier for moderate missingness (10%–40%)

Leave out or flag for very high missingness (>40%) unless crucial

In [13]:
#First, calculate missingness across the whole dataset:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OrdinalEncoder

# Load your cleaned dataset
df = pd.read_excel("cleaned_clinical_dataset.xlsx")  # Already dropped high-missing rows/cols

# Calculate missing % for all columns
missing_pct = df.isnull().mean() * 100


## 1. MSE Group Imputation

In [14]:
mse_cols = [
    'MOOD', 'AFFECT', 'INSIGHT', 'TH_FORM', 'TH_CONTENT', 'TH_STRM',
    'PERCEP', 'SPEECH', 'MEM_LT', 'MEM_ST', 'MEM_IR',
    'CONC', 'ORIENT', 'ATTEN', 'JUDGMT'
]

df_mse = df[mse_cols].copy()
encoder = OrdinalEncoder()
df_mse_encoded = pd.DataFrame(encoder.fit_transform(df_mse), columns=mse_cols)

# Impute group-wise based on missing level
for col in mse_cols:
    pct_missing = missing_pct[col]

    if pct_missing == 0:
        continue

    elif pct_missing < 10:  # Mode
        mode_val = df_mse_encoded[col].mode()[0]
        df_mse_encoded[col] = df_mse_encoded[col].fillna(mode_val)


    elif pct_missing < 40:  # Random Forest
        not_null_mask = df_mse_encoded[col].notnull()
        X_train = df_mse_encoded.loc[not_null_mask].drop(columns=[col])
        y_train = df_mse_encoded.loc[not_null_mask, col]
        X_pred = df_mse_encoded.loc[~not_null_mask].drop(columns=[col])

        if X_pred.shape[0] > 0:
            model = RandomForestClassifier(n_estimators=100, random_state=42)
            model.fit(X_train, y_train)
            df_mse_encoded.loc[~not_null_mask, col] = model.predict(X_pred)

    else:  # Very high missingness: replace with -1 or drop
        df_mse_encoded[col] = df_mse_encoded[col].fillna(-1)

# Decode
df_mse_imputed = pd.DataFrame(encoder.inverse_transform(df_mse_encoded), columns=mse_cols)


## 2. History Group Imputation

In [15]:
history_cols = [
    'P_SEX_HX', 'P_MED_HX', 'PREMOBD_HX', 'P_PSY_HX',
    'P_SOC_HX', 'FOR_HX', 'FAM_P_HX', 'EEG'
]

df_hist = df[history_cols].copy()
df_hist_encoded = pd.DataFrame(encoder.fit_transform(df_hist), columns=history_cols)

for col in history_cols:
    pct_missing = missing_pct[col]

    if pct_missing == 0:
        continue

    elif pct_missing < 20:
        df_hist_encoded[col] = df_hist_encoded[col].fillna(df_hist_encoded[col].mode()[0])

    elif pct_missing < 50:
        mask = df_hist_encoded[col].notnull()
        X_train = df_hist_encoded.loc[mask].drop(columns=[col])
        y_train = df_hist_encoded.loc[mask, col]
        X_pred = df_hist_encoded.loc[~mask].drop(columns=[col])

        if X_pred.shape[0] > 0:
            model = RandomForestClassifier(n_estimators=100, random_state=42)
            model.fit(X_train, y_train)
            df_hist_encoded.loc[~mask, col] = model.predict(X_pred)

    else:
        df_hist_encoded[col] = df_hist_encoded[col].fillna(-1)

df_hist_imputed = pd.DataFrame(encoder.inverse_transform(df_hist_encoded), columns=history_cols)


## 3. Investigative Group Imputation

In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer

df_inv = df[interview_cols].copy()
df_inv_encoded = df_inv.astype(str)  # Encode all as string for consistency

# Encode
from sklearn.preprocessing import OrdinalEncoder
encoder = OrdinalEncoder()
df_inv_encoded = pd.DataFrame(
    encoder.fit_transform(df_inv_encoded),
    columns=interview_cols
)

# Loop through each column for imputation
for col in interview_cols:
    not_null_mask = ~df_inv[col].isnull()
    
    # If very few non-missing values, skip model — use fallback
    if not not_null_mask.sum() or not_null_mask.sum() < 20:
        # Fallback to mode fill
        mode_val = df_inv_encoded[col].mode()[0]
        df_inv_encoded[col] = df_inv_encoded[col].fillna(mode_val)
        continue

    # Features and target
    X = df_inv_encoded.loc[not_null_mask].drop(columns=[col])
    y = df_inv_encoded.loc[not_null_mask, col]
    
    X_pred = df_inv_encoded.loc[~not_null_mask].drop(columns=[col])
    
    # If no data to predict on, skip
    if X_pred.empty:
        continue

    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X, y)

    y_pred = model.predict(X_pred)
    df_inv_encoded.loc[~not_null_mask, col] = y_pred

# Decode back
df_inv_imputed = pd.DataFrame(
    encoder.inverse_transform(df_inv_encoded),
    columns=interview_cols
)


## 4. Diagnosis/Outcome Group Imputation

In [17]:
diagnosis_cols = ['DIAGN', 'MSE', 'DUR_EPIS', 'MAR_STA', 'OCCUP']
df_diag = df[diagnosis_cols].copy()

# Convert all values to string (recommended before using OrdinalEncoder)
df_diag = df_diag.astype(str)

df_diag_encoded = pd.DataFrame(encoder.fit_transform(df_diag), columns=diagnosis_cols)

for col in diagnosis_cols:
    if missing_pct[col] < 20:
        df_diag_encoded[col] = df_diag_encoded[col].fillna(df_diag_encoded[col].mode()[0])
    else:
        df_diag_encoded[col] = df_diag_encoded[col].fillna(-1)

df_diag_imputed = pd.DataFrame(encoder.inverse_transform(df_diag_encoded), columns=diagnosis_cols)


Combine Imputed Blocks

In [18]:
df_imputed = pd.concat([df_mse_imputed, df_hist_imputed, df_inv_imputed, df_diag_imputed], axis=1)
df_imputed.shape

(624, 32)

Combine Imputed Groups with Remaining Data

In [19]:
# Original dataset (before splitting into groups)
df_original = pd.read_excel("cleaned_clinical_dataset.xlsx")

# Drop already imputed columns
columns_to_drop = mse_cols + history_cols + interview_cols + diagnosis_cols
df_remaining = df_original.drop(columns=columns_to_drop)

# Concatenate all imputed blocks and remaining columns
df_final = pd.concat([
    df_remaining.reset_index(drop=True),
    df_mse_imputed.reset_index(drop=True),
    df_hist_imputed.reset_index(drop=True),
    df_inv_imputed.reset_index(drop=True),
    df_diag_imputed.reset_index(drop=True)
], axis=1)


In [17]:
df_final.to_excel("final_cleaned_dataset.xlsx", index=False)
df_final.head()

Unnamed: 0,YEAR,AGE,SEX,PSE,CLASS,MOOD,AFFECT,INSIGHT,TH_FORM,TH_CONTENT,...,EEG,INT_PROV,INT_CAL,INT_S_A_D,INT_GFK,DIAGN,MSE,DUR_EPIS,MAR_STA,OCCUP
0,2017,26,F,GOOD,SCHIZ,NEUTRAL,DEPRESSED,PARTIAL,LOGICAL,PERSECUTORY DELUSION,...,NORMAL,GOOD,GOOD,GOOD,GOOD,PARANOID SCHIZ,KEMPT,0.5,WIDOW,NURSE
1,2016,28,F,GOOD,SCHIZ,HAPPY,REACTIVE,POOR,LOGICAL,PERSECUTORY DELUSION,...,NORMAL,GOOD,GOOD,GOOD,GOOD,PARANOID SCHIZ,KEMPT,48.0,MARRIED,UNEMPLOYED
2,2016,26,M,GOOD,SCHIZ,RELAXED,BLUNT,PARTIAL,LOGICAL,NORMAL,...,NORMAL,GOOD,GOOD,GOOD,GOOD,PARANOID SCHIZ,KEMPT,1.0,SINGLE,UNEMPLOYED
3,2016,20,M,GOOD,SCHIZ,FINE,REACTIVE,POOR,LOGICAL,OBSESSION,...,NORMAL,GOOD,GOOD,GOOD,GOOD,PARANOID SCHIZ,POOR EYE CONTACT,2.0,SINGLE,STUDENT
4,2016,56,F,GOOD,SCHIZ,HAPPY,RESTRICTED,PERSISTENT,LOGICAL,PERSECUTORY DELUSION,...,NORMAL,FAIR,FAIR,FAIR,FAIR,PARANOID SCHIZ,KEMPT,12.0,DIVORCED,CLEANER


In [20]:
missing_after = df_final.isnull().sum()
print(missing_after[missing_after > 0])


PSE    14
dtype: int64


To handle the missing values in PSE column:

In [21]:
df_final['PSE'] = df_final['PSE'].fillna(df_final['PSE'].mode()[0])
df_final.shape
df_final.head()

Unnamed: 0,YEAR,AGE,SEX,PSE,CLASS,MOOD,AFFECT,INSIGHT,TH_FORM,TH_CONTENT,...,EEG,INT_PROV,INT_CAL,INT_S_A_D,INT_GFK,DIAGN,MSE,DUR_EPIS,MAR_STA,OCCUP
0,2017,26,F,GOOD,SCHIZ,NEUTRAL,DEPRESSED,PARTIAL,LOGICAL,PERSECUTORY DELUSION,...,NORMAL,GOOD,GOOD,GOOD,GOOD,PARANOID SCHIZ,KEMPT,0.5,WIDOW,NURSE
1,2016,28,F,GOOD,SCHIZ,HAPPY,REACTIVE,POOR,LOGICAL,PERSECUTORY DELUSION,...,NORMAL,GOOD,GOOD,GOOD,GOOD,PARANOID SCHIZ,KEMPT,48.0,MARRIED,UNEMPLOYED
2,2016,26,M,GOOD,SCHIZ,RELAXED,BLUNT,PARTIAL,LOGICAL,NORMAL,...,NORMAL,GOOD,GOOD,GOOD,GOOD,PARANOID SCHIZ,KEMPT,1.0,SINGLE,UNEMPLOYED
3,2016,20,M,GOOD,SCHIZ,FINE,REACTIVE,POOR,LOGICAL,OBSESSION,...,NORMAL,GOOD,GOOD,GOOD,GOOD,PARANOID SCHIZ,POOR EYE CONTACT,2.0,SINGLE,STUDENT
4,2016,56,F,GOOD,SCHIZ,HAPPY,RESTRICTED,PERSISTENT,LOGICAL,PERSECUTORY DELUSION,...,NORMAL,FAIR,FAIR,FAIR,FAIR,PARANOID SCHIZ,KEMPT,12.0,DIVORCED,CLEANER


To inspect unique Dagnosis features

In [22]:
df_final['DIAGN'].value_counts(dropna=False)


DIAGN
nan                                     229
PARANOID SCHIZ                          136
DEPRESSIVE EPISODE                       37
HEBEPHRENIC SCHIZ                        31
SIMPLE SCHIZOPHRENIA                     18
BIPOLAR AFFECTION DISORDER               17
DISORGANIZED SCHIZOPHRENIA                9
REMISSION                                 9
SUBSTANCE USE                             8
ACUTE PSYCHOSIS                           8
SCHIZOAFFECTIVE DISORDER                  7
DEPRESSIVE                                7
DEPRESSIVE DISORDER                       7
ORGANIC MENTAL DISORDER                   7
DEPRESSION                                6
MENTAL&BEHAVIOURAL DISORDER               5
RELAPSED SCHIZOPHRENIA                    5
MBD                                       5
INTELLECTUAL DIABILITY                    4
RESIDUAL SCHIZOPHRENIA                    4
DEMENTIAL                                 4
COMPLEX PARTIAL SEIZURE                   4
UNDIFFERENTIATED SCHIZOPHR

In [23]:
# To handle the missing values in the Target variable:
df_final['DIAGN'] = df_final['DIAGN'].fillna('NO_DIAGNOSIS')
df_final['DIAGN'] = df_final['DIAGN'].astype(str).str.upper().str.strip()
df_final.to_excel("Updated_cleaned_dataset.xlsx", index=False)
df_final.head()

Unnamed: 0,YEAR,AGE,SEX,PSE,CLASS,MOOD,AFFECT,INSIGHT,TH_FORM,TH_CONTENT,...,EEG,INT_PROV,INT_CAL,INT_S_A_D,INT_GFK,DIAGN,MSE,DUR_EPIS,MAR_STA,OCCUP
0,2017,26,F,GOOD,SCHIZ,NEUTRAL,DEPRESSED,PARTIAL,LOGICAL,PERSECUTORY DELUSION,...,NORMAL,GOOD,GOOD,GOOD,GOOD,PARANOID SCHIZ,KEMPT,0.5,WIDOW,NURSE
1,2016,28,F,GOOD,SCHIZ,HAPPY,REACTIVE,POOR,LOGICAL,PERSECUTORY DELUSION,...,NORMAL,GOOD,GOOD,GOOD,GOOD,PARANOID SCHIZ,KEMPT,48.0,MARRIED,UNEMPLOYED
2,2016,26,M,GOOD,SCHIZ,RELAXED,BLUNT,PARTIAL,LOGICAL,NORMAL,...,NORMAL,GOOD,GOOD,GOOD,GOOD,PARANOID SCHIZ,KEMPT,1.0,SINGLE,UNEMPLOYED
3,2016,20,M,GOOD,SCHIZ,FINE,REACTIVE,POOR,LOGICAL,OBSESSION,...,NORMAL,GOOD,GOOD,GOOD,GOOD,PARANOID SCHIZ,POOR EYE CONTACT,2.0,SINGLE,STUDENT
4,2016,56,F,GOOD,SCHIZ,HAPPY,RESTRICTED,PERSISTENT,LOGICAL,PERSECUTORY DELUSION,...,NORMAL,FAIR,FAIR,FAIR,FAIR,PARANOID SCHIZ,KEMPT,12.0,DIVORCED,CLEANER


In [24]:
import streamlit as st
import pandas as pd

# Define Diagnostic Groups
diagnosis_map = {

    # Psychotic-Mood Disorders
    "SCHIZOAFFECTIVE DISORDER": "Psychotic-Mood Disorders",
    "BIPOLAR AFFECTION DISORDER": "Psychotic-Mood Disorders",
    "BIPOLAR DISORDER": "Psychotic-Mood Disorders",
    "MANIC EPISODE": "Psychotic-Mood Disorders",
    "ACUTE PSYCHOSIS": "Psychotic-Mood Disorders",
    
    # Pure Psychotic Disorders
    "PARANOID SCHIZ": "Psychotic Disorders",
    "HEBEPHRENIC SCHIZ": "Psychotic Disorders",
    "SIMPLE SCHIZOPHRENIA": "Psychotic Disorders",
    "DISORGANIZED SCHIZOPHRENIA": "Psychotic Disorders",
    "RELAPSED SCHIZOPHRENIA": "Psychotic Disorders",
    "RESIDUAL SCHIZOPHRENIA": "Psychotic Disorders",
    "UNDIFFERENTIATED SCHIZOPHRENIA": "Psychotic Disorders",
    "ORGANIC SCHIZOPHRENIA": "Psychotic Disorders",
    "PERSITENT DELUSIONAL DISORDER": "Psychotic Disorders",
    "DELUSIONAL DISORDER": "Psychotic Disorders",
    "SCHIZOTYPAL PERSONALITY DISORDER": "Psychotic Disorders",
    "CATATONIC SCHIZ": "Psychotic Disorders",
    "CHRONIC SCHIZOPHRENIA": "Psychotic Disorders",
    "PSYCHOTIC DISORDER": "Psychotic Disorders",
    "SIMPLE SCHIZ": "Psychotic Disorders",
    "SCHIZOPHRENIA WITH NEGATIVE SYMPTOMS": "Psychotic Disorders",
    
    # Pure Mood Disorders
    "DEPRESSIVE EPISODE": "Mood Disorders",
    "DEPRESSIVE": "Mood Disorders",
    "DEPRESSIVE DISORDER": "Mood Disorders",
    "DEPRESSION": "Mood Disorders",
    
    # Other categories (for completeness)
    "REMISSION": "Other",
    "SUBSTANCE USE": "Other",
    "ORGANIC MENTAL DISORDER": "Other",
    "MENTAL&BEHAVIOURAL DISORDER": "Other",
    "MBD": "Other",
    "INTELLECTUAL DIABILITY": "Other",
    "DEMENTIAL": "Other",
    "COMPLEX PARTIAL SEIZURE": "Other",
    "SEIZURE DISORDER": "Other",
    "BEHAVIOURAL DISORDER": "Other",
    "AUTISM": "Other",
    "SIEZURE": "Other",
    "GRIEF REACTION": "Other",
    "ANXIETY DISORDER": "Other",
    "ANAEMIA": "Other",
    "DRUG RELATED DISORDER": "Other",
    "BPD": "Other",
    "MENTAL RETARDATION": "Other",
    "INSOMNIA": "Other",
    "EPILEPSY": "Other",
    "ALZHEIMERS": "Other",
    "LATE ONSET": "Other",
}

# Apply mapping
df_final['DIAGN'] = df_final['DIAGN'].astype(str).str.upper().str.strip()

def map_diag(diag):
    for key in diagnosis_map:
        if key in diag:
            return diagnosis_map[key]
    return 'NO_DIAGNOSIS'

df_final['DIAGN_GROUP'] = df_final['DIAGN'].apply(map_diag)

In [25]:
df_final['DIAGN_GROUP'].value_counts()


DIAGN_GROUP
NO_DIAGNOSIS                229
Psychotic Disorders         227
Other                        77
Mood Disorders               57
Psychotic-Mood Disorders     34
Name: count, dtype: int64

In [26]:
# Drop rows with NO_DIAGNOSIS and Other
df_final = df_final[~df_final['DIAGN_GROUP'].isin(['NO_DIAGNOSIS', 'Other'])]

# Check the remaining categories
print("Remaining categories:")
print(df_final['DIAGN_GROUP'].value_counts())
print(f"\nTotal samples after filtering: {len(df_final)}")

Remaining categories:
DIAGN_GROUP
Psychotic Disorders         227
Mood Disorders               57
Psychotic-Mood Disorders     34
Name: count, dtype: int64

Total samples after filtering: 318


In [27]:
df = df_final.drop(columns=['DIAGN', 'CLASS'])
df.head()

Unnamed: 0,YEAR,AGE,SEX,PSE,MOOD,AFFECT,INSIGHT,TH_FORM,TH_CONTENT,TH_STRM,...,EEG,INT_PROV,INT_CAL,INT_S_A_D,INT_GFK,MSE,DUR_EPIS,MAR_STA,OCCUP,DIAGN_GROUP
0,2017,26,F,GOOD,NEUTRAL,DEPRESSED,PARTIAL,LOGICAL,PERSECUTORY DELUSION,NORMAL,...,NORMAL,GOOD,GOOD,GOOD,GOOD,KEMPT,0.5,WIDOW,NURSE,Psychotic Disorders
1,2016,28,F,GOOD,HAPPY,REACTIVE,POOR,LOGICAL,PERSECUTORY DELUSION,NORMAL,...,NORMAL,GOOD,GOOD,GOOD,GOOD,KEMPT,48.0,MARRIED,UNEMPLOYED,Psychotic Disorders
2,2016,26,M,GOOD,RELAXED,BLUNT,PARTIAL,LOGICAL,NORMAL,REDUCED,...,NORMAL,GOOD,GOOD,GOOD,GOOD,KEMPT,1.0,SINGLE,UNEMPLOYED,Psychotic Disorders
3,2016,20,M,GOOD,FINE,REACTIVE,POOR,LOGICAL,OBSESSION,NORMAL,...,NORMAL,GOOD,GOOD,GOOD,GOOD,POOR EYE CONTACT,2.0,SINGLE,STUDENT,Psychotic Disorders
4,2016,56,F,GOOD,HAPPY,RESTRICTED,PERSISTENT,LOGICAL,PERSECUTORY DELUSION,NORMAL,...,NORMAL,FAIR,FAIR,FAIR,FAIR,KEMPT,12.0,DIVORCED,CLEANER,Psychotic Disorders


In [28]:
df.to_excel("psychotic_mood_disorders_cleaned_dataset.xlsx", index=False)
df.head()

Unnamed: 0,YEAR,AGE,SEX,PSE,MOOD,AFFECT,INSIGHT,TH_FORM,TH_CONTENT,TH_STRM,...,EEG,INT_PROV,INT_CAL,INT_S_A_D,INT_GFK,MSE,DUR_EPIS,MAR_STA,OCCUP,DIAGN_GROUP
0,2017,26,F,GOOD,NEUTRAL,DEPRESSED,PARTIAL,LOGICAL,PERSECUTORY DELUSION,NORMAL,...,NORMAL,GOOD,GOOD,GOOD,GOOD,KEMPT,0.5,WIDOW,NURSE,Psychotic Disorders
1,2016,28,F,GOOD,HAPPY,REACTIVE,POOR,LOGICAL,PERSECUTORY DELUSION,NORMAL,...,NORMAL,GOOD,GOOD,GOOD,GOOD,KEMPT,48.0,MARRIED,UNEMPLOYED,Psychotic Disorders
2,2016,26,M,GOOD,RELAXED,BLUNT,PARTIAL,LOGICAL,NORMAL,REDUCED,...,NORMAL,GOOD,GOOD,GOOD,GOOD,KEMPT,1.0,SINGLE,UNEMPLOYED,Psychotic Disorders
3,2016,20,M,GOOD,FINE,REACTIVE,POOR,LOGICAL,OBSESSION,NORMAL,...,NORMAL,GOOD,GOOD,GOOD,GOOD,POOR EYE CONTACT,2.0,SINGLE,STUDENT,Psychotic Disorders
4,2016,56,F,GOOD,HAPPY,RESTRICTED,PERSISTENT,LOGICAL,PERSECUTORY DELUSION,NORMAL,...,NORMAL,FAIR,FAIR,FAIR,FAIR,KEMPT,12.0,DIVORCED,CLEANER,Psychotic Disorders


In [1]:
## So, we saved the above dataset for modeling. This is done in the Psychotic Mood Disorder Prediction 2 notebook. 