# Patient Data Analytics

## Importing Libraries

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
import joblib

## Data Loading and Initial Inspection

In [3]:
# Load the dataset
df = pd.read_csv("/content/OPMD-PATIENTS.csv")


df.info()
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   SL. NO.               999 non-null    int64  
 1   PATIENT'S NAME        999 non-null    object 
 2   AGE                   998 non-null    float64
 3   SEX                   998 non-null    object 
 4   SOCIOECONOMIC STATUS  999 non-null    object 
 5   DIAGNOSIS             999 non-null    object 
 6   HABBITS               999 non-null    object 
 7   DURATION              999 non-null    object 
dtypes: float64(1), int64(1), object(6)
memory usage: 62.6+ KB
   SL. NO.     PATIENT'S NAME    AGE SEX SOCIOECONOMIC STATUS  \
0         1   Manoj Kumar Rout  32.0   M   Lower Middle class   
1         2             Jeeban  37.0   M          Lower class   
2         3    Debidutta Nayak  40.0   M   Lower Middle class   
3         4  Abdul Amjad Khan   41.0   M          Upper class  

## Data Preprocessing and Cleaning

In [4]:
# Cleaning and Standardizing the 'DIAGNOSIS' Column
print("\nCleaning 'DIAGNOSIS' column...")
def clean_diagnosis(diagnosis_str):
    diagnosis_str = str(diagnosis_str).lower().strip()
    if 'oral submucous fibrosis' in diagnosis_str or 'osmf' in diagnosis_str or 'submucous fibrosis' in diagnosis_str or 'submucoous fibrosis' in diagnosis_str or 'submuous fibrosis' in diagnosis_str or 'sub fibrosis' in diagnosis_str or 'sub mucous fibrosis' in diagnosis_str or 'submucousfibrosis' in diagnosis_str or 'submucous fibrosi' in diagnosis_str or 'submucous firbosi' in diagnosis_str or 'submucous fibrrosis' in diagnosis_str or 'submucous fibrsosis' in diagnosis_str or 'submucous fiibrosis' in diagnosis_str or 'oal submucous fibrosis' in diagnosis_str or 'oralsubmucous fibrosis' in diagnosis_str:
        return 'oral submucous fibrosis'
    elif 'oral lichen planus' in diagnosis_str or 'lichen planus' in diagnosis_str or 'erosive lichen planus' in diagnosis_str or 'pigmented lichen planus' in diagnosis_str or 'reticular lichen planus' in diagnosis_str or 'oral lichen planua' in diagnosis_str or 'oral lichen plaanus' in diagnosis_str or 'oral liichen planus' in diagnosis_str:
        return 'oral lichen planus'
    elif 'leukoplakia' in diagnosis_str or 'leukkoplakia' in diagnosis_str or 'leukoplkia' in diagnosis_str:
        return 'leukoplakia'
    elif 'erythroplakia' in diagnosis_str or 'eruthroplakia' in diagnosis_str:
        return 'erythroplakia'
    elif 'frictional keratosis' in diagnosis_str:
        return 'frictional keratosis'
    elif 'tobacco pouch keratosis' in diagnosis_str:
        return 'tobacco pouch keratosis'
    elif 'smoker' in diagnosis_str and 'palate' in diagnosis_str:
        return 'smoker\'s palate'
    elif 'erythroleukoplakia' in diagnosis_str:
        return 'erythroleukoplakia'
    else:
        return 'other'

df['DIAGNOSIS'] = df['DIAGNOSIS'].apply(clean_diagnosis)
print("Unique DIAGNOSIS values after cleaning:")
print(df['DIAGNOSIS'].unique())

# Standardizing 'SOCIOECONOMIC STATUS' Column
print("\nStandardizing 'SOCIOECONOMIC STATUS' column...")
def standardize_socioeconomic_status(status_str):
    """
    Standardize socioeconomic status to four specific categories:
    - Upper Class
    - Lower Class
    - Upper Middle Class
    - Lower Middle Class
    """
    status_str = str(status_str).lower().strip()
    if 'upper class' in status_str:
        return 'Upper Class'
    elif 'lower class' in status_str:
        return 'Lower Class'
    elif 'upper middle class' in status_str:
        return 'Upper Middle Class'
    elif 'lower middle class' in status_str:
        return 'Lower Middle Class'
    else:
        return 'Lower Middle Class'

df['SOCIOECONOMIC STATUS'] = df['SOCIOECONOMIC STATUS'].apply(standardize_socioeconomic_status)
print("Unique SOCIOECONOMIC STATUS values after standardization:")
print(df['SOCIOECONOMIC STATUS'].unique())
print("SOCIOECONOMIC STATUS value counts:")
print(df['SOCIOECONOMIC STATUS'].value_counts())

# Standardizing 'SEX' Column
print("\nStandardizing 'SEX' column...")
def standardize_sex(sex_str):
    """
    Standardize sex to only M and F
    """
    sex_str = str(sex_str).upper().strip()
    if sex_str in ['M', 'MALE']:
        return 'M'
    elif sex_str in ['F', 'FEMALE']:
        return 'F'
    else:
        return 'M'

df['SEX'] = df['SEX'].apply(standardize_sex)
print("Unique SEX values after standardization:")
print(df['SEX'].unique())

# Handling Missing Values & Fill missing 'AGE' with median & Fill missing 'SEX' with mode
print("\nHandling missing values...")
df['AGE'] = df['AGE'].fillna(df['AGE'].median())
df['SEX'] = df['SEX'].fillna(df['SEX'].mode()[0])
print(df.isnull().sum())

# Encoding Categorical Features
# Strip whitespace from column names before encoding to avoid issues
df.columns = df.columns.str.strip()

# Perform encoding on specified categorical columns
df = pd.get_dummies(df, columns=['SEX', 'SOCIOECONOMIC STATUS', 'HABBITS', 'DURATION'])
print("DataFrame head after encoding:")
print(df.head())


Cleaning 'DIAGNOSIS' column...
Unique DIAGNOSIS values after cleaning:
['oral submucous fibrosis' 'oral lichen planus' 'leukoplakia'
 'frictional keratosis' 'tobacco pouch keratosis' 'erythroplakia'
 "smoker's palate"]

Standardizing 'SOCIOECONOMIC STATUS' column...
Unique SOCIOECONOMIC STATUS values after standardization:
['Lower Middle Class' 'Lower Class' 'Upper Class' 'Upper Middle Class']
SOCIOECONOMIC STATUS value counts:
SOCIOECONOMIC STATUS
Lower Middle Class    586
Upper Middle Class    165
Lower Class           158
Upper Class            90
Name: count, dtype: int64

Standardizing 'SEX' column...
Unique SEX values after standardization:
['M' 'F']

Handling missing values...
SL. NO.                 0
PATIENT'S NAME          0
AGE                     0
SEX                     0
SOCIOECONOMIC STATUS    0
DIAGNOSIS               0
HABBITS                 0
DURATION                0
dtype: int64
DataFrame head after encoding:
   SL. NO.     PATIENT'S NAME   AGE                DIA

## Feature Selection and Engineering

In [5]:
# Drop identifier columns as they are not useful for prediction
print("Dropping identifier columns ('SL. NO.', 'PATIENT'S NAME')...")
df = df.drop(['SL. NO.', 'PATIENT\'S NAME'], axis=1)
print("DataFrame head after feature selection:")
print(df.head())

# Separate features (X) and target (y)
X = df.drop('DIAGNOSIS', axis=1)
y = df['DIAGNOSIS']

print("Final feature columns:")
print(X.columns.tolist())

Dropping identifier columns ('SL. NO.', 'PATIENT'S NAME')...
DataFrame head after feature selection:
    AGE                DIAGNOSIS  SEX_F  SEX_M  \
0  32.0  oral submucous fibrosis  False   True   
1  37.0       oral lichen planus  False   True   
2  40.0              leukoplakia  False   True   
3  41.0  oral submucous fibrosis  False   True   
4  31.0  oral submucous fibrosis  False   True   

   SOCIOECONOMIC STATUS_Lower Class  SOCIOECONOMIC STATUS_Lower Middle Class  \
0                             False                                     True   
1                              True                                    False   
2                             False                                     True   
3                             False                                    False   
4                             False                                    False   

   SOCIOECONOMIC STATUS_Upper Class  SOCIOECONOMIC STATUS_Upper Middle Class  \
0                             False  

## Model Development and Comparison

In [6]:
# Apply SMOTE to handle class imbalance
print("Applying SMOTE for class imbalance...")
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)
print("Class distribution after SMOTE:")
print(y_resampled.value_counts())

# Split data into training and testing sets
print("Splitting data into training and testing sets...")
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# 4.1. Random Forest Classifier
print("\n--- Training Random Forest Classifier ---")
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Random Forest Classification Report:\n", classification_report(y_test, y_pred_rf))

# 4.2. Logistic Regression
print("\n--- Training Logistic Regression ---")
lr_model = LogisticRegression(random_state=42, max_iter=1000) # Increased max_iter for convergence
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))
print("Logistic Regression Classification Report:\n", classification_report(y_test, y_pred_lr))

# 4.3. Support Vector Machine (SVC)
print("\n--- Training Support Vector Machine (SVC) ---")
svc_model = SVC(random_state=42)
svc_model.fit(X_train, y_train)
y_pred_svc = svc_model.predict(X_test)
print("SVC Accuracy:", accuracy_score(y_test, y_pred_svc))
print("SVC Classification Report:\n", classification_report(y_test, y_pred_svc))

Applying SMOTE for class imbalance...
Class distribution after SMOTE:
DIAGNOSIS
oral submucous fibrosis    436
oral lichen planus         436
leukoplakia                436
frictional keratosis       436
tobacco pouch keratosis    436
erythroplakia              436
smoker's palate            436
Name: count, dtype: int64
Splitting data into training and testing sets...

--- Training Random Forest Classifier ---
Random Forest Accuracy: 0.7839607201309329
Random Forest Classification Report:
                          precision    recall  f1-score   support

          erythroplakia       0.94      0.98      0.96        84
   frictional keratosis       0.85      0.93      0.89        82
            leukoplakia       0.65      0.73      0.69        81
     oral lichen planus       0.76      0.62      0.69        96
oral submucous fibrosis       0.56      0.58      0.57        93
        smoker's palate       1.00      0.95      0.98        88
tobacco pouch keratosis       0.76      0.74    

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Accuracy: 0.5073649754500819
Logistic Regression Classification Report:
                          precision    recall  f1-score   support

          erythroplakia       0.62      0.69      0.65        84
   frictional keratosis       0.53      0.59      0.56        82
            leukoplakia       0.33      0.20      0.25        81
     oral lichen planus       0.40      0.22      0.28        96
oral submucous fibrosis       0.47      0.80      0.59        93
        smoker's palate       0.64      0.77      0.70        88
tobacco pouch keratosis       0.40      0.29      0.34        87

               accuracy                           0.51       611
              macro avg       0.48      0.51      0.48       611
           weighted avg       0.48      0.51      0.48       611


--- Training Support Vector Machine (SVC) ---
SVC Accuracy: 0.24877250409165302
SVC Classification Report:
                          precision    recall  f1-score   support

          eryt

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Feature Importance Analysis (from Random Forest)

In [7]:
feature_importances = pd.Series(rf_model.feature_importances_, index=X.columns).sort_values(ascending=False)
print("Feature Importances (from Random Forest):\n", feature_importances)

Feature Importances (from Random Forest):
 AGE                                        0.270119
HABBITS_Paan                               0.040877
SOCIOECONOMIC STATUS_Upper Middle Class    0.039131
DURATION_1 year                            0.038496
HABBITS_Cigarette                          0.038318
SEX_F                                      0.037682
DURATION_2 years                           0.034470
SOCIOECONOMIC STATUS_Lower Middle Class    0.033650
SOCIOECONOMIC STATUS_Lower Class           0.033440
HABBITS_Bidi & Alcohol                     0.032069
HABBITS_cigarette                          0.031840
DURATION_7 year                            0.031035
HABBITS_Gutka                              0.030303
SEX_M                                      0.029817
DURATION_2.5 years                         0.029089
DURATION_1.5 years                         0.027905
SOCIOECONOMIC STATUS_Upper Class           0.027750
DURATION_5 years                           0.025707
DURATION_6 months    

## Save Model and Feature Information

In [8]:
# Save the best performing model (Random Forest) and feature columns
joblib.dump(rf_model, "random_forest_model.pkl")
joblib.dump(X_train.columns.tolist(), "feature_columns.pkl")
print("Model and feature columns saved successfully!")

Model and feature columns saved successfully!
