In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import LabelEncoder

# Load the cleaned cervical-cancer dataset from the local project folder.
df = pd.read_csv(r"C:\Users\mayur\Downloads\cervical cancer_project\cervical_cleaned_upedated.csv")

# Convert the percentage columns to float.
# The columns are expected to hold text with a '%' sign; the sign is stripped
# before conversion. If a column was already parsed as numeric (for example a
# re-exported CSV without '%' signs), the `.str` accessor would raise, so the
# string step is skipped for numeric columns.
for pct_col in ('Mortality_Rate', 'Screening_Rate'):
    if not pd.api.types.is_numeric_dtype(df[pct_col]):
        # regex=False: '%' is a literal character, and the default value of
        # `regex` differs between pandas versions.
        df[pct_col] = df[pct_col].str.replace('%', '', regex=False)
    df[pct_col] = df[pct_col].astype(float)

# Integer-code the two categorical columns. One encoder per column is kept so
# the fitted label <-> code mapping stays available after this step.
le_region = LabelEncoder()
le_vacc = LabelEncoder()
df['Region_Encoded'] = le_region.fit_transform(df['Region'])
df['HPV_Vaccination_Encoded'] = le_vacc.fit_transform(df['HPV_Vaccination_Status'])

# Predictor matrix (column order matters: the models are fitted on it) and
# the regression target.
feature_cols = ['Screening_Rate', 'HPV_Vaccination_Encoded', 'Region_Encoded', 'Year']
X = df[feature_cols]
y = df['Mortality_Rate']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Candidate regressors, keyed by the display name used in the report.
models = dict([
    ('Linear Regression', LinearRegression()),
    ('Random Forest', RandomForestRegressor(n_estimators=100, random_state=42)),
])

# name -> {'MAE': ..., 'R²': ...} measured on the held-out test split.
results = {}

print("=== MODEL PERFORMANCE ===")
for name, model in models.items():
    # Fit on the training split, then score on the unseen test split.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    results[name] = {'MAE': mae, 'R²': r2}

    # One print call; the emitted text is the same three lines preceded by a
    # blank line.
    print(f"\n{name}\nMAE: {mae:.3f}\nR²: {r2:.3f}")

# Use Random Forest as the model for all downstream predictions.
# The forest stored in `models` was already fitted on the training split in
# the evaluation loop, so it is reused here instead of training a second,
# identically configured estimator. This also guarantees that the metrics
# reported above describe exactly the model used below.
rf = models['Random Forest']

# Predictions on the held-out test set.
y_test_pred = rf.predict(X_test)

# Assemble an actual-vs-predicted table for the test rows. `assign` returns a
# new frame, so X_test itself is left untouched. 'Country' and 'Year' are
# looked up in `df` by the test rows' index labels.
test_results = X_test.assign(
    Country=df.loc[X_test.index, 'Country'].values,
    Year=df.loc[X_test.index, 'Year'].values,
    Actual_Mortality_Rate=y_test.values,
    Predicted_Mortality_Rate=y_test_pred,
)

# Signed error (actual - predicted) and its magnitude.
residual = test_results['Actual_Mortality_Rate'] - test_results['Predicted_Mortality_Rate']
test_results['Error'] = residual
test_results['Error_Absolute'] = test_results['Error'].abs()

# Largest misses first, so the worst predictions are at the top.
test_results = test_results.sort_values(by='Error_Absolute', ascending=False)

# Persist the full table (row index omitted).
test_results.to_csv(r'C:\Users\mayur\Downloads\cervical cancer_project\predicted_mortality_rates.csv', index=False)
print("\n✅ Predictions saved to: predicted_mortality_rates.csv")

# Show the ten rows with the largest absolute error.
report_cols = ['Country', 'Year', 'Actual_Mortality_Rate', 'Predicted_Mortality_Rate', 'Error']
print("\n=== TOP 10 PREDICTIONS (by error) ===")
print(test_results.head(10)[report_cols])

# Rank the predictors by the fitted forest's importance scores, highest first.
importances = pd.Series(data=rf.feature_importances_, index=X.columns)
feature_importance = importances.sort_values(ascending=False)
print("\n=== FEATURE IMPORTANCE (Random Forest) ===", feature_importance, sep="\n")

# ===== BONUS: Predict Future Scenario =====
print("\n=== BONUS: What-if Prediction ===")
# Scenario: India raises its screening rate to 40% in 2025 while every other
# factor stays as in its most recent recorded year.
#
# The categorical codes are read from India's own rows rather than typed in
# by hand: LabelEncoder numbers classes in sorted order, so a hard-coded
# 0 / 1 can silently refer to a different category than intended.
india_rows = df[df['Country'] == 'India']
if india_rows.empty:
    raise ValueError("No rows with Country == 'India' found; cannot build the what-if scenario")

# Most recent observation for India = baseline for the comparison.
india_latest = india_rows.sort_values('Year').iloc[-1]
baseline_year = int(india_latest['Year'])
baseline_mortality = float(india_latest['Mortality_Rate'])

# Column names and order match the features the model was trained on.
new_data = pd.DataFrame({
    'Screening_Rate': [40.0],
    'HPV_Vaccination_Encoded': [int(india_latest['HPV_Vaccination_Encoded'])],
    'Region_Encoded': [int(india_latest['Region_Encoded'])],
    'Year': [2025]
})

predicted_mortality = rf.predict(new_data)[0]
print(f"🔮 If India increases screening to 40% in 2025, predicted mortality rate: {predicted_mortality:.2f}%")
print(f"📉 Current India mortality ({baseline_year}): {baseline_mortality:.1f}% → Potential reduction: {baseline_mortality - predicted_mortality:.2f} percentage points")

=== MODEL PERFORMANCE ===

Linear Regression
MAE: 2.987
R²: 0.932

Random Forest
MAE: 1.400
R²: 0.971

✅ Predictions saved to: predicted_mortality_rates.csv

=== TOP 10 PREDICTIONS (by error) ===
      Country  Year  Actual_Mortality_Rate  Predicted_Mortality_Rate     Error
12      China  2015                   49.1                 56.705000 -7.605000
10      India  2024                   65.0                 59.958000  5.042000
0       India  2014                   60.0                 55.929000  4.071000
44  Australia  2014                   32.0                 35.765000 -3.765000
4       India  2018                   60.8                 59.787000  1.013000
62    Germany  2021                   36.7                 35.771667  0.928333
42     Africa  2023                   72.6                 71.741000  0.859000
95     Europe  2021                   34.2                 35.030167 -0.830167
64    Germany  2023                   36.6                 35.927000  0.673000
97     Europe 