# **PCOS Lifestyle Impact Prediction with Gradio Interface**

# This notebook implements a series of machine learning models to predict the likelihood of PCOS based on lifestyle factors. 

# The models include Random Forest, Decision Trees, Logistic Regression, and MLP.

# We also use Gradio to provide real-time predictions and visualize the results interactively.

# Import necessary libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc, classification_report, precision_recall_curve
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.impute import SimpleImputer
import gradio as gr
from keras.src.layers import Dense, Dropout
from keras import Sequential

# **Load the Dataset**

In [None]:
# Read the cleaned dataset from disk and report its (rows, columns) size.
dataset_path = "input/Cleaned-Data.csv"
data = pd.read_csv(dataset_path)
print("Initial dataset shape:", data.shape)

# **Data Preprocessing and Validation**

In [None]:
# Summary statistics first, then dtypes / non-null counts, then a preview
# of the first rows (rendered as the cell output).
summary_stats = data.describe()
print(summary_stats)
data.info()
data.head()

# Check for missing values

In [None]:
# Count the null entries in every column of the raw dataset.
missing_per_column = data.isnull().sum()
print("\nMissing values per column:")
print(missing_per_column)

# Impute missing values with median for numerical columns

In [None]:
# Replace missing numeric entries with the column median.
# Only numeric columns are touched; non-numeric columns are left as they are.
imputer = SimpleImputer(strategy='median')
numerical_cols = data.select_dtypes(include=[np.number]).columns
numeric_imputed = imputer.fit_transform(data[numerical_cols])
data[numerical_cols] = numeric_imputed

# Check again after imputation

In [None]:
# Re-count nulls per column now that the numeric columns have been imputed.
remaining_missing = data.isnull().sum()
print("\nMissing values after imputation:")
print(remaining_missing)

# **Label Encoding Categorical Features**

In [None]:
# Work on a copy so the loaded frame keeps its original string values.
data_encoded = data.copy()

# Object-dtype predictor columns; the 'PCOS' target is excluded because it is
# encoded separately.
predictor_frame = data.drop(columns=['PCOS'])
categorical_cols = predictor_frame.select_dtypes(include=['object']).columns

# Use separate LabelEncoder for each column to avoid conflicts

In [None]:
# Keep every fitted encoder, keyed by column name, so the exact same
# string -> integer mapping can be reapplied (or inverted) later, e.g. when
# encoding new inputs at prediction time.
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    # Fit on the untouched values in `data` (data_encoded started as a copy of
    # it) so that re-running this cell does not refit on already-encoded integers.
    data_encoded[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# Convert target variable 'PCOS' to numerical (1/0)

In [None]:
# Any label other than exactly 'Yes' / 'No' maps to NaN.
pcos_label_to_int = {'Yes': 1, 'No': 0}
data_encoded['PCOS'] = data_encoded['PCOS'].map(pcos_label_to_int)

# **Critical: Target Validation**

In [None]:
# Class balance of the encoded target (value_counts leaves NaN entries out).
print("\nTarget distribution after encoding:")
target_counts = data_encoded['PCOS'].value_counts()
print(target_counts)

# **Feature Engineering**

# Calculate BMI

In [None]:
# Height is converted from feet to metres (1 ft = 0.3048 m); BMI = kg / m^2.
height_in_metres = data_encoded['Height_ft'] * 0.3048
data_encoded.loc[:, 'Height_m'] = height_in_metres
data_encoded.loc[:, 'BMI'] = data_encoded['Weight_kg'] / (height_in_metres ** 2)

# Calculate Nutritional Score

In [None]:
# Weight applied to each diet column: positive weights raise the score,
# negative weights lower it, and a weight of 0 leaves it unchanged.
nutrition_weights = {
    'Diet_Bread_Cereals': 1,
    'Diet_Milk_Products': 1,
    'Diet_Fruits': 2,
    'Diet_Vegetables': 2,
    'Diet_Starchy_Vegetables': -1,
    'Diet_NonStarchy_Vegetables': 2,
    'Diet_Fats': -2,
    'Diet_Sweets': -3,
    'Diet_Fried_Food': -3,
    'Diet_Tea_Coffee': 0,
}

# Nutritional_Score is the weighted sum of the diet columns for each row.
weight_vector = pd.Series(nutrition_weights)
diet_frame = data_encoded[list(nutrition_weights)]
data_encoded.loc[:, 'Nutritional_Score'] = diet_frame.dot(weight_vector)

# **Data Splitting**

In [None]:
# Target is the encoded 'PCOS' column; every other column is a feature.
target = data_encoded['PCOS']
features = data_encoded.drop(columns=['PCOS'])

# Check the target for missing values before the stratified split

In [None]:
# Number of rows whose target is NaN.
target_missing = target.isnull().sum()
print("Missing values in target variable (PCOS):")
print(target_missing)

# Remove rows where the target variable 'PCOS' is NaN, then make a stratified train/test split to maintain class distribution

In [None]:
# Rows without a target value cannot be used for training or stratification.
data_cleaned = data_encoded.dropna(subset=['PCOS'])
target_cleaned = data_cleaned['PCOS']
features_cleaned = data_cleaned.drop(columns=['PCOS'])

# 80/20 split; stratifying on the target keeps the class ratio the same in
# both partitions, and the fixed seed makes the split repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=target_cleaned)
X_train, X_test, y_train, y_test = train_test_split(
    features_cleaned, target_cleaned, **split_options
)

# **Feature Scaling**

In [None]:
# Standardise the numeric feature columns. The scaler is fitted on the
# training partition only and then applied unchanged to the test partition.
numerical_cols = features.select_dtypes(include=[np.number]).columns
scaler = StandardScaler()
scaler.fit(X_train[numerical_cols])

X_train[numerical_cols] = scaler.transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

# **Model Training and Hyperparameter Tuning**

In [None]:
# Baseline estimators keyed by display name; insertion order is the order in
# which they are later trained and reported.
models = {}
models["Random Forest"] = RandomForestClassifier(random_state=42)
models["Decision Tree"] = DecisionTreeClassifier(random_state=42)
models["Logistic Regression"] = LogisticRegression(max_iter=1000, random_state=42)
models["MLP"] = MLPClassifier(hidden_layer_sizes=(50,), max_iter=1000, random_state=42)

# Hyperparameter tuning using GridSearchCV for Random Forest

In [None]:
# Candidate Random Forest settings: 3 x 3 x 3 = 27 combinations.
param_grid_rf = dict(
    n_estimators=[50, 100, 200],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5, 10],
)

# 5-fold cross-validated search on the training data, scored by accuracy.
rf_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid_rf,
    cv=5,
    scoring='accuracy',
)
rf_search.fit(X_train, y_train)

# The winning configuration (GridSearchCV exposes it as best_estimator_).
best_rf = rf_search.best_estimator_

# MLP Hyperparameter Tuning

In [None]:
# Candidate MLP settings: three hidden-layer layouts x two alpha values.
param_grid_mlp = dict(
    hidden_layer_sizes=[(50,), (100,), (50, 30)],
    alpha=[0.0001, 0.001],
)

# 5-fold cross-validated search on the training data.
mlp_base = MLPClassifier(max_iter=1000, random_state=42)
mlp_search = GridSearchCV(mlp_base, param_grid_mlp, cv=5)
mlp_search.fit(X_train, y_train)
best_mlp = mlp_search.best_estimator_

# **Evaluation of All Models**

In [None]:
# Train each baseline model on the training split and report its accuracy and
# per-class metrics on the held-out test split.
for name, model in models.items():
    y_pred = model.fit(X_train, y_train).predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    print(f"\n{name} Accuracy: {test_accuracy:.4f}")
    print(f"Classification Report for {name}:\n", report)

# Final Evaluation with the best model

In [None]:
# Test-set predictions and accuracy of the two tuned estimators.
y_pred_rf = best_rf.predict(X_test)
y_pred_mlp = best_mlp.predict(X_test)

rf_test_accuracy = accuracy_score(y_test, y_pred_rf)
mlp_test_accuracy = accuracy_score(y_test, y_pred_mlp)

print("\nBest Random Forest Test Accuracy:", rf_test_accuracy)
print("\nBest MLP Test Accuracy:", mlp_test_accuracy)

# **Model Evaluation Metrics**

# Confusion Matrix for Random Forest

In [None]:
# Confusion matrix of the tuned Random Forest on the test set.
cm_rf = confusion_matrix(y_test, y_pred_rf)
cm_display = ConfusionMatrixDisplay(cm_rf)
cm_display.plot()
plt.title('Random Forest Confusion Matrix')
plt.show()

# ROC-AUC for Random Forest

In [None]:
# Column 1 of predict_proba is the score for the second class in classes_.
rf_positive_scores = best_rf.predict_proba(X_test)[:,1]
fpr, tpr, _ = roc_curve(y_test, rf_positive_scores)
roc_auc = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
# Dashed diagonal as a reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.title('Receiver Operating Characteristic (ROC) - Random Forest')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.show()

# Precision-Recall Curve

In [None]:
# Precision/recall trade-off of the tuned Random Forest on the test set.
# Column 1 of predict_proba is the score for the second class in classes_.
precision, recall, _ = precision_recall_curve(y_test, best_rf.predict_proba(X_test)[:,1])

# Own figure and labelled axes so the plot can be read on its own,
# matching the ROC cell.
plt.figure()
plt.plot(recall, precision, marker='.')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve - Random Forest')
plt.show()

# **Exploratory Data Analysis (EDA)**

# Correlation Heatmap

In [None]:
# Pairwise correlations between a hand-picked set of columns and the target.
heatmap_columns = [
    'Age', 'Weight_kg', 'Diet_Sweets', 'Diet_Fried_Food',
    'Family_History_PCOS', 'Menstrual_Irregularity', 'Hormonal_Imbalance',
    'Exercise_Frequency', 'Exercise_Duration', 'Sleep_Hours', 'PCOS',
]
correlation_matrix = data_encoded[heatmap_columns].corr()

plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Heatmap')
plt.show()

# Pairplot for key features

In [None]:
# Scatter-matrix of the engineered features and age, coloured by the target.
pairplot_columns = ['BMI', 'Nutritional_Score', 'Age', 'PCOS']
sns.pairplot(data_encoded[pairplot_columns], hue='PCOS')
plt.show()

# **Deep Learning Model - TensorFlow/Keras**

In [None]:
# Feed-forward binary classifier: 128 -> dropout(0.3) -> 64 -> 1 (sigmoid).
# The input width is the number of feature columns in X_train.
network_layers = [
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid'),
]
model = Sequential(network_layers)

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# validation_split=0.2 holds back part of the training data for validation;
# the per-epoch metrics are kept in `history` for plotting.
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2)

# Plot training history

In [None]:
# Per-epoch accuracy on the training data and on the validation split.
plt.figure()
plt.plot(history.history['accuracy'], label='Train Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
# Label the axes so the figure can be read without the surrounding code.
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('Model Training Progress')
plt.legend()
plt.show()

# **Gradio Interface for Real-Time Prediction**

In [None]:
def predict_pcos(Weight_kg, Exercise_Duration, Hormonal_Imbalance, Conception_Difficulty,
                 Insulin_Resistance, Exercise_Benefit, Sleep_Hours, Hirsutism, Age, Exercise_Type,
                 Hyperandrogenism, Exercise_Frequency, ID):
    """Estimate the PCOS probability for one person from the Gradio form values.

    The scaler and the tuned Random Forest were fitted on the complete encoded
    feature set, while the form only collects a subset of it. The input row is
    therefore built the same way the training data was:

    1. start from the training mean of every scaled column (read from the
       fitted scaler), so features the form does not ask for get a neutral value;
    2. overwrite the supplied features, label-encoding the categorical ones with
       the same mapping used during training;
    3. recompute BMI from the supplied weight when the BMI/Height_m columns exist;
    4. scale the row and put the columns in the order the model was fitted with.

    Parameters
    ----------
    Weight_kg, Exercise_Duration, Sleep_Hours, Age, Exercise_Frequency, ID
        Values from the numeric form fields.
    Hormonal_Imbalance, Conception_Difficulty, Insulin_Resistance,
    Exercise_Benefit, Hirsutism, Exercise_Type, Hyperandrogenism
        Values from the dropdown fields.

    Returns
    -------
    dict
        ``{"PCOS Probability": p}`` where ``p`` is the model's probability for
        the positive class, as a plain ``float``.

    Raises
    ------
    ValueError
        If a field is empty, a feature is unknown to the fitted scaler, or a
        categorical value was never seen in the training data.
    """
    # NOTE(review): "ID" is fed to the model only because the training features
    # never dropped it; an identifier is unlikely to be a meaningful predictor.
    form_values = {
        "Weight_kg": Weight_kg,
        "Exercise_Duration": Exercise_Duration,
        "Hormonal_Imbalance": Hormonal_Imbalance,
        "Conception_Difficulty": Conception_Difficulty,
        "Insulin_Resistance": Insulin_Resistance,
        "Exercise_Benefit": Exercise_Benefit,
        "Sleep_Hours": Sleep_Hours,
        "Hirsutism": Hirsutism,
        "Age": Age,
        "Exercise_Type": Exercise_Type,
        "Hyperandrogenism": Hyperandrogenism,
        "Exercise_Frequency": Exercise_Frequency,
        "ID": ID,
    }

    # Columns (and their order) the scaler was fitted on.
    scaled_columns = list(scaler.feature_names_in_)

    # One unscaled row holding the training mean of every column.
    row = pd.DataFrame([scaler.mean_], columns=scaled_columns)

    for feature, value in form_values.items():
        if feature not in scaled_columns:
            raise ValueError(f"Feature '{feature}' was not part of the training data.")
        if value is None:
            raise ValueError(f"Please provide a value for '{feature}'.")

        if feature in categorical_cols:
            # LabelEncoder assigns codes from the sorted unique values, so
            # refitting on the original column reproduces the training mapping.
            encoder = LabelEncoder().fit(data[feature])
            try:
                value = encoder.transform([value])[0]
            except ValueError as exc:
                raise ValueError(
                    f"'{value}' is not a known value for '{feature}'; "
                    f"expected one of {list(encoder.classes_)}."
                ) from exc

        row.loc[0, feature] = float(value)

    # Keep the engineered BMI consistent with the supplied weight; height is not
    # collected by the form, so the training-mean height is used.
    if "BMI" in scaled_columns and "Height_m" in scaled_columns:
        row.loc[0, "BMI"] = row.loc[0, "Weight_kg"] / (row.loc[0, "Height_m"] ** 2)

    # Scale, then order the columns exactly as the model saw them during fit.
    scaled_row = pd.DataFrame(scaler.transform(row), columns=scaled_columns)
    model_input = scaled_row[list(best_rf.feature_names_in_)]

    # Column 1 of predict_proba is the positive class (PCOS == 1).
    probability = best_rf.predict_proba(model_input)[0][1]

    return {"PCOS Probability": float(probability)}


def _yes_no_dropdown(label):
    """Build a dropdown whose only choices are "Yes" and "No"."""
    return gr.Dropdown(label=label, choices=["Yes", "No"])


# One input component per predict_pcos parameter, in the same order as the
# function signature (Gradio passes the values positionally).
form_inputs = [
    gr.Number(label="Weight (kg)"),
    gr.Number(label="Exercise Duration (minutes)"),
    _yes_no_dropdown("Hormonal Imbalance"),
    _yes_no_dropdown("Conception Difficulty"),
    _yes_no_dropdown("Insulin Resistance"),
    gr.Dropdown(label="Exercise Benefit", choices=["Low", "Moderate", "High"]),
    gr.Number(label="Sleep Hours"),
    _yes_no_dropdown("Hirsutism"),
    gr.Number(label="Age"),
    gr.Dropdown(label="Exercise Type", choices=["Cardio", "Strength"]),
    _yes_no_dropdown("Hyperandrogenism"),
    gr.Number(label="Exercise Frequency (times per week)"),
    gr.Number(label="ID"),
]

iface = gr.Interface(
    fn=predict_pcos,
    inputs=form_inputs,
    outputs="json",
    title="PCOS Risk Predictor",
)

iface.launch(debug=True, inline=True)