In [1]:
#import modules

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np

In [2]:
#Loding data into the kernels 

file_path_with_infertility="PCOS_infertility.csv"
file_path_without_infertility="PCOS_data_without_infertility.xlsx"

PCOS_inf = pd.read_csv(file_path_with_infertility)
PCOS_woinf = pd.read_excel(file_path_without_infertility, sheet_name="Full_new")

In [3]:
#Merging the two files as per patient file no. 
data = pd.merge(PCOS_woinf, PCOS_inf, on='Patient File No.', suffixes=('', '_y'), how='left')

#Dropping the repeated features after merging
data_premium =data.drop(['Unnamed: 44', 'Sl. No_y', 'PCOS (Y/N)_y', '  I   beta-HCG(mIU/mL)_y',
      'II    beta-HCG(mIU/mL)_y', 'AMH(ng/mL)_y'], axis=1)




In [4]:
#Dealing with categorical values.
#In this database the type objects are numeric values saved as strings.
#So I am just converting it into a numeric value.

data_premium["AMH(ng/mL)"] = pd.to_numeric(data_premium["AMH(ng/mL)"], errors='coerce')
data_premium["II    beta-HCG(mIU/mL)"] = pd.to_numeric(data_premium["II    beta-HCG(mIU/mL)"], errors='coerce')

#Dealing with missing values. 
#Filling NA values with the median of that feature.

data_premium['Marraige Status (Yrs)'].fillna(data_premium['Marraige Status (Yrs)'].median(),inplace=True)
data_premium['II    beta-HCG(mIU/mL)'].fillna(data_premium['II    beta-HCG(mIU/mL)'].median(),inplace=True)
data_premium['AMH(ng/mL)'].fillna(data_premium['AMH(ng/mL)'].median(),inplace=True)
data_premium['Fast food (Y/N)'].fillna(data_premium['Fast food (Y/N)'].median(),inplace=True)

#Clearing up the extra space in the column names (optional)
data_premium.columns = [col.strip() for col in data_premium.columns]

In [5]:
#Bootsrapping the data using sample
data_premium = data_premium.sample(frac=1, replace=True, random_state=42)


#Dropping the columns that are not relevant to PCOS
data_standard = data_premium.drop(['Blood Group','BP _Systolic (mmHg)','BP _Diastolic (mmHg)','Follicle No. (L)','Follicle No. (R)','Avg. F size (L) (mm)','Avg. F size (R) (mm)','Endometrium (mm)','Hb(g/dl)','Vit D3 (ng/mL)','PRG(ng/mL)','RBS(mg/dl)','TSH (mIU/L)','AMH(ng/mL)','PRL(ng/mL)','RR (breaths/min)', 'II    beta-HCG(mIU/mL)', 
       'FSH(mIU/mL)', 'LH(mIU/mL)', 'FSH/LH','I   beta-HCG(mIU/mL)'], axis=1)

#Splitting the data into training and testing sets, 80% of the data will be used for training and 20% for testing.
X = data_standard.drop(["PCOS (Y/N)","Patient File No.", "Sl. No"], axis=1)
y = data_standard['PCOS (Y/N)']

data_standard = data_standard.drop(["PCOS (Y/N)","Patient File No.", "Sl. No"], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print(data_standard.columns)

(432, 20) (109, 20) (432,) (109,)
Index(['Age (yrs)', 'Weight (Kg)', 'Height(Cm)', 'BMI', 'Pulse rate(bpm)',
       'Cycle(R/I)', 'Cycle length(days)', 'Marraige Status (Yrs)',
       'Pregnant(Y/N)', 'No. of aborptions', 'Hip(inch)', 'Waist(inch)',
       'Waist:Hip Ratio', 'Weight gain(Y/N)', 'hair growth(Y/N)',
       'Skin darkening (Y/N)', 'Hair loss(Y/N)', 'Pimples(Y/N)',
       'Fast food (Y/N)', 'Reg.Exercise(Y/N)'],
      dtype='object')


In [6]:
#normalising all the features
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
import numpy as np

#Creating a random forest classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Initialize KFold with 5 splits
kf = KFold(n_splits=5, shuffle=True, random_state=42)

accuracies = []

# Ensure X and y are numpy arrays
X_np = np.array(X)
y_np = np.array(y)

for train_index, test_index in kf.split(X_np):
    X_train, X_test = X_np[train_index], X_np[test_index]
    y_train, y_test = y_np[train_index], y_np[test_index]

    # Train the model
    rf.fit(X_train, y_train)

    # Predict the test set
    y_pred = rf.predict(X_test)

    # Calculate the accuracy of the model
    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)

# Print the mean accuracy
print("Mean Accuracy: ", np.mean(accuracies))



Mean Accuracy:  0.9223921168875296


In [8]:
# Assuming you have already trained the model and have the `rf` object

# Step 1: Prepare the test input
# Create a dictionary with the same features as X
test_input = {
    'Age (yrs)': [28],
    'Weight (Kg)': [58],
    'Height(Cm)': [152],
    'BMI': [19.3],
    'Pulse rate(bpm)': [78],
    'Cycle(R/I)': [2],
    'Cycle length(days)': [5],
    'Marraige Status (Yrs)': [7],
    'Pregnant(Y/N)': [0],
    'No. of aborptions': [0],
    'Hip(inch)': [36],
    'Waist(inch)': [30],
    'Waist:Hip Ratio': [0.833],
    'Weight gain(Y/N)': [0],
    'hair growth(Y/N)': [0],
    'Skin darkening (Y/N)': [0],
    'Hair loss(Y/N)': [0],
    'Pimples(Y/N)': [0],
    'Fast food (Y/N)': [1],
    'Reg.Exercise(Y/N)': [0]
}

# Convert the dictionary to a DataFrame
test_input_df = pd.DataFrame(test_input)

# Step 2: Preprocess the test input
# Apply the same scaling that was applied to the training data
test_input_scaled = scaler.transform(test_input_df)

# Step 3: Make a prediction
prediction = rf.predict(test_input_scaled)

# Output the prediction
print("Prediction: ", prediction)

prediction_proba = rf.predict_proba(test_input_scaled)
print("Prediction Probabilities:", prediction_proba)

Prediction:  [0]
Prediction Probabilities: [[0.78 0.22]]


In [9]:
from joblib import dump
from sklearn.impute import SimpleImputer

# Define the imputer
imputer = SimpleImputer(strategy='median')

# Fit the imputer on the data
imputer.fit(data_standard)

# Save your model and preprocessing objects
dump(rf, 'model_new.joblib')
dump(scaler, 'scaler_new.joblib')
dump(imputer, 'imputer_new.joblib')

# Assuming input_data should have the same columns as data_standard
input_data = pd.DataFrame(columns=data_standard.columns)

print("Columns used for imputer:", data_standard.columns)
print("Columns from Streamlit input:", input_data.columns)


import json

feature_columns = [
    'Age (yrs)', 'Weight (Kg)', 'Height(Cm)', 'BMI', 'Pulse rate(bpm)',
    'Cycle(R/I)', 'Cycle length(days)', 'Marraige Status (Yrs)',
    'Pregnant(Y/N)', 'No. of aborptions', 'Hip(inch)', 'Waist(inch)',
    'Waist:Hip Ratio', 'Weight gain(Y/N)', 'hair growth(Y/N)',
    'Skin darkening (Y/N)', 'Hair loss(Y/N)', 'Pimples(Y/N)',
    'Fast food (Y/N)', 'Reg.Exercise(Y/N)'
]

with open("feature_columns.json", "w") as f:
    json.dump(feature_columns, f, indent=4)


Columns used for imputer: Index(['Age (yrs)', 'Weight (Kg)', 'Height(Cm)', 'BMI', 'Pulse rate(bpm)',
       'Cycle(R/I)', 'Cycle length(days)', 'Marraige Status (Yrs)',
       'Pregnant(Y/N)', 'No. of aborptions', 'Hip(inch)', 'Waist(inch)',
       'Waist:Hip Ratio', 'Weight gain(Y/N)', 'hair growth(Y/N)',
       'Skin darkening (Y/N)', 'Hair loss(Y/N)', 'Pimples(Y/N)',
       'Fast food (Y/N)', 'Reg.Exercise(Y/N)'],
      dtype='object')
Columns from Streamlit input: Index(['Age (yrs)', 'Weight (Kg)', 'Height(Cm)', 'BMI', 'Pulse rate(bpm)',
       'Cycle(R/I)', 'Cycle length(days)', 'Marraige Status (Yrs)',
       'Pregnant(Y/N)', 'No. of aborptions', 'Hip(inch)', 'Waist(inch)',
       'Waist:Hip Ratio', 'Weight gain(Y/N)', 'hair growth(Y/N)',
       'Skin darkening (Y/N)', 'Hair loss(Y/N)', 'Pimples(Y/N)',
       'Fast food (Y/N)', 'Reg.Exercise(Y/N)'],
      dtype='object')


In [10]:
# Assuming you have already trained the model and have the `rf` object

# Step 1: Prepare the test input
# Create a dictionary with the same features as X
test_input = {
    'Age (yrs)': [28],
    'Weight (Kg)': [58],
    'Height(Cm)': [152],
    'BMI': [19.3],
    'Pulse rate(bpm)': [78],
    'Cycle(R/I)': [2],
    'Cycle length(days)': [5],
    'Marraige Status (Yrs)': [7],
    'Pregnant(Y/N)': [0],
    'No. of aborptions': [0],
    'Hip(inch)': [36],
    'Waist(inch)': [30],
    'Waist:Hip Ratio': [0.833],
    'Weight gain(Y/N)': [0],
    'hair growth(Y/N)': [0],
    'Skin darkening (Y/N)': [0],
    'Hair loss(Y/N)': [0],
    'Pimples(Y/N)': [0],
    'Fast food (Y/N)': [1],
    'Reg.Exercise(Y/N)': [0]
}

# Convert the dictionary to a DataFrame
test_input_df = pd.DataFrame(test_input)

# Step 2: Preprocess the test input
# Apply the same scaling that was applied to the training data
test_input_scaled = scaler.transform(test_input_df)

# Step 3: Make a prediction
prediction = rf.predict(test_input_scaled)

# Output the prediction
print("Prediction: ", prediction)

prediction_proba = rf.predict_proba(test_input_scaled)
print("Prediction Probabilities:", prediction_proba)

Prediction:  [0]
Prediction Probabilities: [[0.78 0.22]]
