In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, accuracy_score

In [None]:
# Load the raw PCOS dataset into a DataFrame.
# NOTE(review): absolute path — looks like a hosted-runtime location; confirm
# before running elsewhere.
csv_path = '/content/pcos_prediction_dataset.csv'
df = pd.read_csv(csv_path)

In [None]:
# Peek at three randomly chosen rows (unseeded, so the rows differ per run).
df.sample(n=3)

Unnamed: 0,Country,Age,BMI,Menstrual Regularity,Hirsutism,Acne Severity,Family History of PCOS,Insulin Resistance,Lifestyle Score,Stress Levels,Urban/Rural,Socioeconomic Status,Awareness of PCOS,Fertility Concerns,Undiagnosed PCOS Likelihood,Ethnicity,Diagnosis
63373,Uzbekistan,25,Overweight,Regular,No,Mild,No,No,4,High,Rural,Middle,No,No,0.108533,Caucasian,No
66258,Ecuador,34,Normal,Regular,Yes,,Yes,Yes,3,Medium,Urban,Middle,Yes,No,0.210803,African,No
97618,Bangladesh,27,Overweight,Regular,Yes,,No,No,8,Medium,Urban,Low,Yes,No,0.104777,Asian,No


In [None]:
# Count missing values per column.
df.isna().sum()

Unnamed: 0,0
Country,0
Age,0
BMI,0
Menstrual Regularity,0
Hirsutism,0
Acne Severity,60085
Family History of PCOS,0
Insulin Resistance,0
Lifestyle Score,0
Stress Levels,0


In [None]:
# Replace missing 'Acne Severity' entries with an explicit "Unknown" category
# so the column can be one-hot encoded without dropping rows.
# NOTE(review): roughly half the rows are missing this value; if the raw file
# spells "no acne" as the text None, the CSV reader may have parsed it as NaN —
# confirm against the source file.
df = df.fillna({'Acne Severity': "Unknown"})

In [None]:
# Discard every row that still has at least one missing value.
df = df.dropna(axis=0, how='any')

In [None]:
# Print the column names, non-null counts and dtypes of the cleaned frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120000 entries, 0 to 119999
Data columns (total 17 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Country                      120000 non-null  object 
 1   Age                          120000 non-null  int64  
 2   BMI                          120000 non-null  object 
 3   Menstrual Regularity         120000 non-null  object 
 4   Hirsutism                    120000 non-null  object 
 5   Acne Severity                120000 non-null  object 
 6   Family History of PCOS       120000 non-null  object 
 7   Insulin Resistance           120000 non-null  object 
 8   Lifestyle Score              120000 non-null  int64  
 9   Stress Levels                120000 non-null  object 
 10  Urban/Rural                  120000 non-null  object 
 11  Socioeconomic Status         120000 non-null  object 
 12  Awareness of PCOS            120000 non-null  object 
 13 

In [None]:
# Columns that are not used as model inputs further down.
unused_columns = [
    'Awareness of PCOS',
    'Fertility Concerns',
    'Country',
    'Lifestyle Score',
    'Ethnicity',
    'Undiagnosed PCOS Likelihood',
]
df = df.drop(columns=unused_columns)

In [None]:
# Spot-check five random rows after dropping the unused columns.
df.sample(n=5)

Unnamed: 0,Age,BMI,Menstrual Regularity,Hirsutism,Acne Severity,Family History of PCOS,Insulin Resistance,Stress Levels,Urban/Rural,Socioeconomic Status,Diagnosis
27355,17,Overweight,Regular,No,Mild,Yes,No,Medium,Urban,Middle,No
16171,32,Normal,Regular,No,Mild,No,No,Medium,Urban,Low,No
65960,37,Normal,Irregular,No,Unknown,Yes,No,Low,Urban,High,No
53041,16,Obese,Regular,No,Unknown,No,No,Low,Urban,Middle,No
30937,24,Obese,Irregular,Yes,Moderate,Yes,No,Medium,Urban,Middle,No


In [None]:
# Selecting features and target
feature_columns = [
    'Age',
    'BMI',
    'Menstrual Regularity',
    'Hirsutism',
    'Acne Severity',
    'Family History of PCOS',
    'Insulin Resistance',
    'Stress Levels',
    'Urban/Rural',
    'Socioeconomic Status',
]
X = df[feature_columns]
# Single brackets give a 1-D Series. The double-bracket form produced a
# one-column DataFrame, which scikit-learn treats as a column vector and
# warns about when fitting a classifier.
y = df['Diagnosis']

In [None]:
# Display the feature frame (pandas truncates the middle rows in the output).
X

Unnamed: 0,Age,BMI,Menstrual Regularity,Hirsutism,Acne Severity,Family History of PCOS,Insulin Resistance,Stress Levels,Urban/Rural,Socioeconomic Status
0,26,Overweight,Regular,Yes,Severe,Yes,Yes,Low,Rural,High
1,16,Underweight,Regular,Yes,Unknown,No,Yes,High,Rural,Middle
2,41,Normal,Regular,No,Moderate,No,No,Medium,Urban,Middle
3,27,Normal,Irregular,No,Mild,No,No,Low,Urban,High
4,26,Overweight,Irregular,Yes,Unknown,No,No,Medium,Urban,Middle
...,...,...,...,...,...,...,...,...,...,...
119995,28,Normal,Regular,No,Moderate,Yes,No,Low,Urban,Middle
119996,35,Overweight,Regular,No,Unknown,No,No,Low,Rural,High
119997,16,Normal,Regular,No,Moderate,No,No,Medium,Rural,Low
119998,15,Obese,Regular,Yes,Unknown,Yes,Yes,Medium,Rural,High


In [None]:
# Split the feature names by how they are preprocessed: the string-valued
# columns are listed as categorical, 'Age' as the only numeric one.
categorical_features = [
    'BMI',
    'Menstrual Regularity',
    'Hirsutism',
    'Acne Severity',
    'Family History of PCOS',
    'Insulin Resistance',
    'Stress Levels',
    'Urban/Rural',
    'Socioeconomic Status',
]
numercial_features = ['Age']

In [None]:
# column transformer for OneHotEncoding
# One-hot encodes the categorical columns and passes the numeric ones through
# untouched. handle_unknown="ignore" makes a category that was not present
# when the encoder was fitted encode as all zeros instead of raising during
# transform(), so the saved pipeline does not crash on an unseen label.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
        ("num", "passthrough", numercial_features),
    ]
)

In [None]:
# Chain preprocessing and the classifier so both are fitted (and later saved)
# as a single estimator.
forest = RandomForestClassifier(
    n_estimators=300,
    max_depth=10,
    min_samples_split=5,
    class_weight='balanced',
    random_state=42,
)
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", forest),
    ]
)

In [None]:
# Split data
# Hold out 20% of the rows for evaluation. stratify=y keeps the Yes/No ratio
# identical in both partitions; the positive class is a small minority, so an
# unstratified split can shift its share between train and test.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1, stratify=y
)
# np.ravel hands the classifier a flat 1-D label array whether y is a Series
# or a one-column frame, which avoids scikit-learn's column-vector warning.
pipeline.fit(X_train, np.ravel(y_train))

  return fit_method(estimator, *args, **kwargs)


In [None]:
# Score the held-out rows and report the fraction predicted correctly.
y_pred = pipeline.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.6731666666666667

In [None]:
# Per-class precision, recall and F1 for the held-out predictions.
from sklearn.metrics import classification_report

report_text = classification_report(y_test, y_pred)
print(report_text)

              precision    recall  f1-score   support

          No       0.90      0.72      0.80     21570
         Yes       0.10      0.27      0.14      2430

    accuracy                           0.67     24000
   macro avg       0.50      0.50      0.47     24000
weighted avg       0.82      0.67      0.73     24000



In [None]:
import pickle

# Serialize the fitted pipeline (preprocessing + classifier) to a file in the
# current working directory.
pickle_model_path = 'model.pkl'
with open(pickle_model_path, 'wb') as model_file:
    pickle.dump(pipeline, model_file)

In [None]:
# Print the installed scikit-learn version.
# NOTE(review): presumably recorded so the pickled model can be loaded with a
# matching version later — confirm.
import sklearn
print(sklearn.__version__)

1.6.1
