In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV
import joblib

In [2]:
# Load the processed dataset from the sibling Data directory (path is relative
# to the notebook's working directory).
processed_csv_path = '../Data/processed_data.csv'
df = pd.read_csv(processed_csv_path)

In [3]:
# Feature matrix: every column except the target ('treatment') and the other
# fields that are excluded from modelling.
excluded_cols = ['comments', 'treatment', 'Age', 'state']
X = df.drop(columns=excluded_cols)

# Target: 1 for 'Yes', 0 for 'No'. Series.map turns any label that is missing
# from the mapping into NaN, so fail here with a clear message instead of
# letting NaN targets reach the estimator.
y = df['treatment'].map({'Yes': 1, 'No': 0})
unmapped = y.isna()
if unmapped.any():
    bad_labels = df.loc[unmapped, 'treatment'].unique().tolist()
    raise ValueError(
        f"Unexpected 'treatment' labels (expected 'Yes' or 'No'): {bad_labels}"
    )

In [4]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible. No `stratify` argument is passed, so the class proportions of
# the two parts are not forced to match.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Names of the training columns whose dtype is object, category or bool;
# these are the columns handed to the ordinal encoder.
categorical_dtypes = ['object', 'category', 'bool']
categorical_cols = list(X_train.select_dtypes(include=categorical_dtypes).columns)

In [6]:
# Ordinal-encode the categorical columns; categories that were not seen during
# fit are encoded as -1 instead of raising at transform time.
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
encoder = ColumnTransformer(
    transformers=[('cat', ordinal_encoder, categorical_cols)],
    # Spelled out for readers: this is the ColumnTransformer default, so any
    # column not listed in categorical_cols is discarded before the model.
    remainder='drop',
)

In [7]:
# Chain the encoder and the classifier so the same preprocessing is applied
# both when fitting and when predicting. The seed makes the forest reproducible.
forest = RandomForestClassifier(random_state=42)
rf_pipeline = Pipeline([
    ('preprocessor', encoder),
    ('classifier', forest),
])

In [8]:
# Fit the whole pipeline on the training split, then predict the held-out rows.
# Pipeline.fit returns the fitted pipeline itself, so the calls can be chained.
y_pred = rf_pipeline.fit(X_train, y_train).predict(X_test)

In [9]:
# Persist the fitted pipeline (encoder + classifier) to disk.
# NOTE(review): the file name contains a space and the spelling "pipline";
# it is kept as-is because anything that loads the model presumably uses this
# exact path -- confirm before renaming.
model_path = '../Models/classification pipline.pkl'
joblib.dump(rf_pipeline, model_path)

['../Models/classification pipline.pkl']

In [10]:
# Evaluate the held-out predictions: compute each metric once, then report it.
test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
test_accuracy = accuracy_score(y_test, y_pred)

print("✅ Confusion Matrix:\n", test_confusion)
print("\n✅ Classification Report:\n", test_report)
print("✅ Accuracy Score:", test_accuracy)

✅ Confusion Matrix:
 [[ 84  29]
 [ 25 113]]

✅ Classification Report:
               precision    recall  f1-score   support

           0       0.77      0.74      0.76       113
           1       0.80      0.82      0.81       138

    accuracy                           0.78       251
   macro avg       0.78      0.78      0.78       251
weighted avg       0.78      0.78      0.78       251

✅ Accuracy Score: 0.7848605577689243
