In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import SelectKBest, mutual_info_classif, f_classif

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.cluster import KMeans # This is the missing import


from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, roc_curve
from sklearn.datasets import make_classification

import warnings
warnings.filterwarnings('ignore')


In [12]:
# Load dataset
file_path = '/content/sample_data/preprocessed_telco_churn.csv'
df = pd.read_csv(file_path)

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import chi2_contingency, ttest_ind, mannwhitneyu
import warnings
warnings.filterwarnings('ignore')



print("="*80)
print("COMPREHENSIVE STATISTICAL ANALYSIS - TELCO CHURN DATASET")
print("="*80)
print(f"Dataset Shape: {df.shape}")
print(f"Memory Usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

# 1. BASIC DATA OVERVIEW
print("\n" + "="*50)
print("1. DATASET OVERVIEW")
print("="*50)

print("\nDataset Info:")
print(df.info())

print("\nFirst 5 rows:")
print(df.head())

print("\nMissing Values:")
missing_data = df.isnull().sum()
if missing_data.sum() > 0:
    print(missing_data[missing_data > 0])
else:
    print("No missing values found!")

print("\nDuplicate Rows:", df.duplicated().sum())

# 2. TARGET VARIABLE ANALYSIS
print("\n" + "="*50)
print("2. TARGET VARIABLE ANALYSIS")
print("="*50)

# Assuming 'Churn' is the target variable (adjust if different)
target_cols = [col for col in df.columns if 'churn' in col.lower() or 'target' in col.lower()]
if not target_cols:
    # If no obvious target column, let's check unique value counts for binary columns
    binary_cols = []
    for col in df.columns:
        if df[col].nunique() == 2:
            binary_cols.append(col)
    print(f"Potential target columns (binary): {binary_cols}")
    if binary_cols:
        target_col = binary_cols[0]  # Take first binary column as target
    else:
        target_col = df.columns[-1]  # Take last column as target
else:
    target_col = target_cols[0]

print(f"Target Variable: {target_col}")
print("\nTarget Distribution:")
target_counts = df[target_col].value_counts()
print(target_counts)
print(f"\nTarget Proportions:")
print(df[target_col].value_counts(normalize=True))

# Check class imbalance
minority_class_ratio = min(target_counts) / sum(target_counts)
print(f"\nClass Imbalance Ratio: {minority_class_ratio:.3f}")
if minority_class_ratio < 0.3:
    print("⚠️  SIGNIFICANT CLASS IMBALANCE DETECTED!")
    print("This could be contributing to your precision-recall issues.")

# 3. FEATURE ANALYSIS
print("\n" + "="*50)
print("3. FEATURE ANALYSIS")
print("="*50)

# Separate numerical and categorical features
numerical_features = df.select_dtypes(include=[np.number]).columns.tolist()
if target_col in numerical_features:
    numerical_features.remove(target_col)

categorical_features = df.select_dtypes(include=['object', 'category']).columns.tolist()
if target_col in categorical_features:
    categorical_features.remove(target_col)

print(f"Numerical Features ({len(numerical_features)}): {numerical_features}")
print(f"Categorical Features ({len(categorical_features)}): {categorical_features}")

# 4. NUMERICAL FEATURES STATISTICAL ANALYSIS
if numerical_features:
    print("\n" + "="*50)
    print("4. NUMERICAL FEATURES ANALYSIS")
    print("="*50)

    print("\nDescriptive Statistics:")
    print(df[numerical_features].describe())

    print("\nSkewness Analysis:")
    skewness_data = []
    for col in numerical_features:
        skew_val = stats.skew(df[col].dropna())
        skewness_data.append({
            'Feature': col,
            'Skewness': skew_val,
            'Interpretation': 'Highly Skewed' if abs(skew_val) > 1 else
                           'Moderately Skewed' if abs(skew_val) > 0.5 else 'Normal'
        })

    skew_df = pd.DataFrame(skewness_data)
    print(skew_df)

    # Outlier Detection using IQR method
    print("\nOutlier Analysis (IQR Method):")
    outlier_data = []
    for col in numerical_features:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)][col]
        outlier_data.append({
            'Feature': col,
            'Outliers_Count': len(outliers),
            'Outliers_Percentage': (len(outliers) / len(df)) * 100,
            'Lower_Bound': lower_bound,
            'Upper_Bound': upper_bound
        })

    outlier_df = pd.DataFrame(outlier_data)
    print(outlier_df)

# 5. CORRELATION ANALYSIS
if len(numerical_features) > 1:
    print("\n" + "="*50)
    print("5. CORRELATION ANALYSIS")
    print("="*50)

    # Calculate correlation matrix
    corr_matrix = df[numerical_features].corr()

    print("\nHighly Correlated Feature Pairs (|correlation| > 0.7):")
    high_corr_pairs = []
    for i in range(len(corr_matrix.columns)):
        for j in range(i+1, len(corr_matrix.columns)):
            corr_val = corr_matrix.iloc[i, j]
            if abs(corr_val) > 0.7:
                high_corr_pairs.append({
                    'Feature_1': corr_matrix.columns[i],
                    'Feature_2': corr_matrix.columns[j],
                    'Correlation': corr_val
                })

    if high_corr_pairs:
        high_corr_df = pd.DataFrame(high_corr_pairs)
        print(high_corr_df)
        print("\n⚠️  High multicollinearity detected! Consider feature selection.")
    else:
        print("No highly correlated feature pairs found.")

# 6. TARGET-FEATURE RELATIONSHIP ANALYSIS
print("\n" + "="*50)
print("6. TARGET-FEATURE RELATIONSHIP ANALYSIS")
print("="*50)

# For numerical features
if numerical_features:
    print("\nNumerical Features vs Target:")
    target_relationship_data = []

    for feature in numerical_features:
        # Group by target and calculate statistics
        grouped_stats = df.groupby(target_col)[feature].agg(['mean', 'std', 'median'])

        # Perform statistical test (t-test or Mann-Whitney U)
        group_0 = df[df[target_col] == df[target_col].unique()[0]][feature].dropna()
        group_1 = df[df[target_col] == df[target_col].unique()[1]][feature].dropna()

        # Check for normality (simplified)
        if len(group_0) > 5000 or len(group_1) > 5000:
            # Use Mann-Whitney U for large samples or non-normal data
            statistic, p_value = mannwhitneyu(group_0, group_1, alternative='two-sided')
            test_type = "Mann-Whitney U"
        else:
            # Use t-test
            statistic, p_value = ttest_ind(group_0, group_1)
            test_type = "T-test"

        target_relationship_data.append({
            'Feature': feature,
            'Test_Type': test_type,
            'P_Value': p_value,
            'Significant': 'Yes' if p_value < 0.05 else 'No',
            'Effect_Size': abs(group_0.mean() - group_1.mean()) / np.sqrt((group_0.var() + group_1.var()) / 2)
        })

    target_rel_df = pd.DataFrame(target_relationship_data)
    print(target_rel_df)

# For categorical features
if categorical_features:
    print("\nCategorical Features vs Target (Chi-Square Test):")
    cat_target_data = []

    for feature in categorical_features:
        # Create contingency table
        contingency_table = pd.crosstab(df[feature], df[target_col])

        # Perform Chi-square test
        if contingency_table.shape[0] > 1 and contingency_table.shape[1] > 1:
            chi2, p_value, dof, expected = chi2_contingency(contingency_table)

            # Calculate Cramér's V (effect size for categorical associations)
            n = contingency_table.sum().sum()
            cramers_v = np.sqrt(chi2 / (n * (min(contingency_table.shape) - 1)))

            cat_target_data.append({
                'Feature': feature,
                'Chi2_Statistic': chi2,
                'P_Value': p_value,
                'Significant': 'Yes' if p_value < 0.05 else 'No',
                'Cramers_V': cramers_v,
                'Effect_Size': 'Large' if cramers_v > 0.25 else 'Medium' if cramers_v > 0.15 else 'Small'
            })

    if cat_target_data:
        cat_target_df = pd.DataFrame(cat_target_data)
        print(cat_target_df)

# 7. FEATURE IMPORTANCE RANKING
print("\n" + "="*50)
print("7. FEATURE IMPORTANCE SUMMARY")
print("="*50)

# Combine all significant features
important_features = []

if numerical_features and 'target_rel_df' in locals():
    sig_num_features = target_rel_df[target_rel_df['Significant'] == 'Yes'].sort_values('Effect_Size', ascending=False)
    for _, row in sig_num_features.iterrows():
        important_features.append({
            'Feature': row['Feature'],
            'Type': 'Numerical',
            'Importance_Score': row['Effect_Size'],
            'P_Value': row['P_Value']
        })

if categorical_features and 'cat_target_df' in locals():
    sig_cat_features = cat_target_df[cat_target_df['Significant'] == 'Yes'].sort_values('Cramers_V', ascending=False)
    for _, row in sig_cat_features.iterrows():
        important_features.append({
            'Feature': row['Feature'],
            'Type': 'Categorical',
            'Importance_Score': row['Cramers_V'],
            'P_Value': row['P_Value']
        })

if important_features:
    importance_df = pd.DataFrame(important_features).sort_values('Importance_Score', ascending=False)
    print("Top Features by Statistical Significance:")
    print(importance_df)

    print(f"\n🎯 RECOMMENDATIONS FOR BALANCED MODEL:")
    print("="*50)
    top_features = importance_df.head(10)['Feature'].tolist()
    print(f"1. Focus on these top {len(top_features)} features: {top_features}")

    if minority_class_ratio < 0.3:
        print("2. Address class imbalance using:")
        print("   - SMOTE (Synthetic Minority Oversampling)")
        print("   - Class weights in your model")
        print("   - Stratified sampling")

    print("3. For better precision-recall balance:")
    print("   - Use ROC-AUC and PR-AUC as evaluation metrics")
    print("   - Try ensemble methods (Random Forest, XGBoost)")
    print("   - Optimize threshold using validation set")
    print("   - Use cross-validation with stratification")

# 8. DATA QUALITY ASSESSMENT
print("\n" + "="*50)
print("8. DATA QUALITY ASSESSMENT")
print("="*50)

print("Data Quality Summary:")
quality_issues = []

# Check for constant features
constant_features = [col for col in df.columns if df[col].nunique() <= 1]
if constant_features:
    quality_issues.append(f"Constant features found: {constant_features}")

# Check for high cardinality categorical features
if categorical_features:
    high_card_features = [col for col in categorical_features if df[col].nunique() > 50]
    if high_card_features:
        quality_issues.append(f"High cardinality categorical features: {high_card_features}")

# Check for potential data leakage (features perfectly correlated with target)
if len(numerical_features) > 0:
    target_corr = df[numerical_features + [target_col]].corr()[target_col].abs()
    perfect_corr = target_corr[target_corr > 0.95].drop(target_col)
    if len(perfect_corr) > 0:
        quality_issues.append(f"Potential data leakage detected: {perfect_corr.index.tolist()}")

if quality_issues:
    for issue in quality_issues:
        print(f"⚠️  {issue}")
else:
    print("✅ No major data quality issues detected!")

print("\n" + "="*80)
print("ANALYSIS COMPLETE!")
print("="*80)
print("💡 Next Steps:")
print("1. Use the feature importance ranking for feature selection")
print("2. Address class imbalance if detected")
print("3. Consider feature engineering based on statistical insights")
print("4. Use cross-validation with proper stratification")
print("5. Optimize model threshold for balanced precision-recall")

COMPREHENSIVE STATISTICAL ANALYSIS - TELCO CHURN DATASET
Dataset Shape: (7043, 31)
Memory Usage: 1.67 MB

1. DATASET OVERVIEW

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 31 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   gender                                 7043 non-null   int64  
 1   SeniorCitizen                          7043 non-null   int64  
 2   tenure                                 7043 non-null   int64  
 3   MonthlyCharges                         7043 non-null   float64
 4   TotalCharges                           7043 non-null   float64
 5   Churn                                  7043 non-null   int64  
 6   Partner_Yes                            7043 non-null   int64  
 7   Dependents_Yes                         7043 non-null   int64  
 8   PhoneService_Yes                       7043 non-null   int64  
 9  

In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, average_precision_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE
import warnings

warnings.filterwarnings('ignore')



# 2. Implement Feature Selection and Data Quality steps
df = df.drop_duplicates()

recommended_features = [
    'tenure', 'MonthlyCharges', 'InternetService_Fiber optic',
    'Contract_One year', 'Contract_Two year', 'PaymentMethod_Electronic check',
    'InternetService_No', 'SeniorCitizen', 'Partner_Yes',
    'Dependents_Yes', 'PaperlessBilling_Yes', 'PaymentMethod_Credit card (automatic)'
]
target_variable = 'Churn'

X = df[recommended_features]
y = df[target_variable]

# 3. Handle Class Imbalance using SMOTE
print("Original class distribution:", y.value_counts())
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)
print("Resampled class distribution:", y_res.value_counts())

# 4. Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42, stratify=y_res)

# 5. Train the Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
model.fit(X_train, y_train)

# 6. Make predictions on the test set
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

# 7. Evaluate the model
print("\n" + "="*50)
print("MODEL EVALUATION")
print("="*50)

roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"ROC-AUC Score: {roc_auc:.4f}")

pr_auc = average_precision_score(y_test, y_pred_proba)
print(f"PR-AUC Score: {pr_auc:.4f}")

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Original class distribution: Churn
0    5164
1    1857
Name: count, dtype: int64
Resampled class distribution: Churn
0    5164
1    5164
Name: count, dtype: int64

MODEL EVALUATION
ROC-AUC Score: 0.8970
PR-AUC Score: 0.8906

Confusion Matrix:
[[830 203]
 [171 862]]

Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.80      0.82      1033
           1       0.81      0.83      0.82      1033

    accuracy                           0.82      2066
   macro avg       0.82      0.82      0.82      2066
weighted avg       0.82      0.82      0.82      2066



In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
import xgboost as xgb
import warnings

warnings.filterwarnings('ignore')

# You may need to run this command in your environment first:
# !pip install scikit-learn imbalanced-learn xgboost pandas


# 2. Implement Feature Selection and Data Quality steps
df = df.drop_duplicates()

recommended_features = [
    'tenure', 'MonthlyCharges', 'InternetService_Fiber optic',
    'Contract_One year', 'Contract_Two year', 'PaymentMethod_Electronic check',
    'InternetService_No', 'SeniorCitizen', 'Partner_Yes',
    'Dependents_Yes', 'PaperlessBilling_Yes', 'PaymentMethod_Credit card (automatic)'
]
target_variable = 'Churn'

X = df[recommended_features]
y = df[target_variable]

# 3. Handle Class Imbalance using SMOTE
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)

# 4. Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42, stratify=y_res)

# A helper function to evaluate a model and print a full report
def evaluate_model(model, X_test, y_test, model_name):
    print(f"\n{'='*50}")
    print(f"EVALUATION: {model_name}")
    print(f"{'='*50}")

    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    accuracy = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    pr_auc = average_precision_score(y_test, y_pred_proba)

    print(f"Accuracy: {accuracy:.4f}")
    print(f"ROC-AUC Score: {roc_auc:.4f}")
    print(f"PR-AUC Score: {pr_auc:.4f}")

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

# 5. Train and evaluate the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
rf_model.fit(X_train, y_train)
evaluate_model(rf_model, X_test, y_test, "Random Forest Classifier")

# 6. Train and evaluate the XGBoost model
xgb_model = xgb.XGBClassifier(n_estimators=100, use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_model.fit(X_train, y_train)
evaluate_model(xgb_model, X_test, y_test, "XGBoost Classifier")


EVALUATION: Random Forest Classifier
Accuracy: 0.8190
ROC-AUC Score: 0.8970
PR-AUC Score: 0.8906

Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.80      0.82      1033
           1       0.81      0.83      0.82      1033

    accuracy                           0.82      2066
   macro avg       0.82      0.82      0.82      2066
weighted avg       0.82      0.82      0.82      2066


EVALUATION: XGBoost Classifier
Accuracy: 0.8166
ROC-AUC Score: 0.8889
PR-AUC Score: 0.8723

Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.77      0.81      1033
           1       0.79      0.86      0.82      1033

    accuracy                           0.82      2066
   macro avg       0.82      0.82      0.82      2066
weighted avg       0.82      0.82      0.82      2066



In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import warnings

warnings.filterwarnings('ignore')

# You may need to run these commands first if you don't have them:
# !pip install torch torchvision torchaudio
# !pip install scikit-learn imbalanced-learn pandas


# 2. Implement Feature Selection and Data Quality steps
df = df.drop_duplicates()

recommended_features = [
    'tenure', 'MonthlyCharges', 'InternetService_Fiber optic',
    'Contract_One year', 'Contract_Two year', 'PaymentMethod_Electronic check',
    'InternetService_No', 'SeniorCitizen', 'Partner_Yes',
    'Dependents_Yes', 'PaperlessBilling_Yes', 'PaymentMethod_Credit card (automatic)'
]
target_variable = 'Churn'

X = df[recommended_features]
y = df[target_variable]

# 3. Handle Class Imbalance using SMOTE
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)

# 4. Standardize numerical features for the neural network
scaler = StandardScaler()
X_res[['tenure', 'MonthlyCharges']] = scaler.fit_transform(X_res[['tenure', 'MonthlyCharges']])

# 5. Split data and convert to PyTorch Tensors
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42, stratify=y_res)

X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).unsqueeze(1)
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).unsqueeze(1)

# Create DataLoader for batch training
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)

# 6. Define the Neural Network Model
class ChurnModel(nn.Module):
    def __init__(self, input_features):
        super(ChurnModel, self).__init__()
        self.layer_1 = nn.Linear(input_features, 64)
        self.relu = nn.ReLU()
        self.layer_2 = nn.Linear(64, 32)
        self.layer_out = nn.Linear(32, 1)

    def forward(self, x):
        x = self.relu(self.layer_1(x))
        x = self.relu(self.layer_2(x))
        x = self.layer_out(x)
        return x

# Instantiate the model, loss function, and optimizer
input_features = X_train.shape[1]
model = ChurnModel(input_features)
loss_fn = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# 7. Training loop
epochs = 100
for epoch in range(epochs):
    model.train()
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        y_pred_logits = model(batch_X)
        loss = loss_fn(y_pred_logits, batch_y)
        loss.backward()
        optimizer.step()

    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')

# 8. Evaluation
model.eval()
with torch.no_grad():
    y_pred_logits = model(X_test_tensor)
    y_pred_proba = torch.sigmoid(y_pred_logits)
    y_pred_class = (y_pred_proba > 0.5).float()

    y_test_np = y_test.values
    y_pred_proba_np = y_pred_proba.numpy().flatten()
    y_pred_class_np = y_pred_class.numpy().flatten()

    accuracy = accuracy_score(y_test_np, y_pred_class_np)
    roc_auc = roc_auc_score(y_test_np, y_pred_proba_np)
    pr_auc = average_precision_score(y_test_np, y_pred_proba_np)

    print("\n" + "="*50)
    print("PYTORCH MODEL EVALUATION")
    print("="*50)
    print(f"Accuracy: {accuracy:.4f}")
    print(f"ROC-AUC Score: {roc_auc:.4f}")
    print(f"PR-AUC Score: {pr_auc:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test_np, y_pred_class_np))

Epoch [10/100], Loss: 0.1647
Epoch [20/100], Loss: 0.9590
Epoch [30/100], Loss: 0.1272
Epoch [40/100], Loss: 0.3250
Epoch [50/100], Loss: 0.1305
Epoch [60/100], Loss: 0.2831
Epoch [70/100], Loss: 0.1227
Epoch [80/100], Loss: 0.5427
Epoch [90/100], Loss: 0.3434
Epoch [100/100], Loss: 0.4192

PYTORCH MODEL EVALUATION
Accuracy: 0.7948
ROC-AUC Score: 0.8716
PR-AUC Score: 0.8509

Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.78      0.79      1033
           1       0.78      0.81      0.80      1033

    accuracy                           0.79      2066
   macro avg       0.80      0.79      0.79      2066
weighted avg       0.80      0.79      0.79      2066



In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
import warnings

warnings.filterwarnings('ignore')



# A helper function to evaluate a model and print a full report
def evaluate_model(model, X_test, y_test, model_name):
    print(f"\n{'='*50}")
    print(f"EVALUATION: {model_name}")
    print(f"{'='*50}")

    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    accuracy = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    pr_auc = average_precision_score(y_test, y_pred_proba)

    print(f"Accuracy: {accuracy:.4f}")
    print(f"ROC-AUC Score: {roc_auc:.4f}")
    print(f"PR-AUC Score: {pr_auc:.4f}")

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))



# 2. Preprocess data
df = df.drop_duplicates()
recommended_features = [
    'tenure', 'MonthlyCharges', 'InternetService_Fiber optic',
    'Contract_One year', 'Contract_Two year', 'PaymentMethod_Electronic check',
    'InternetService_No', 'SeniorCitizen', 'Partner_Yes',
    'Dependents_Yes', 'PaperlessBilling_Yes', 'PaymentMethod_Credit card (automatic)'
]
target_variable = 'Churn'
X = df[recommended_features]
y = df[target_variable]
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)
scaler = StandardScaler()
X_res[['tenure', 'MonthlyCharges']] = scaler.fit_transform(X_res[['tenure', 'MonthlyCharges']])
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42, stratify=y_res)

# 3. Train and evaluate Logistic Regression models
# a. Logistic Regression (no penalty)
lr_model = LogisticRegression(random_state=42, solver='liblinear')
lr_model.fit(X_train, y_train)
evaluate_model(lr_model, X_test, y_test, "Logistic Regression")

# b. Logistic Regression with Lasso (L1) penalty
lasso_lr_model = LogisticRegression(penalty='l1', solver='liblinear', C=1.0, random_state=42)
lasso_lr_model.fit(X_train, y_train)
evaluate_model(lasso_lr_model, X_test, y_test, "Logistic Regression (Lasso)")

# c. Logistic Regression with Ridge (L2) penalty
ridge_lr_model = LogisticRegression(penalty='l2', solver='liblinear', C=1.0, random_state=42)
ridge_lr_model.fit(X_train, y_train)
evaluate_model(ridge_lr_model, X_test, y_test, "Logistic Regression (Ridge)")

# d. Logistic Regression with Elastic Net penalty
elastic_lr_model = LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.5, random_state=42)
elastic_lr_model.fit(X_train, y_train)
evaluate_model(elastic_lr_model, X_test, y_test, "Logistic Regression (Elastic Net)")


EVALUATION: Logistic Regression
Accuracy: 0.7778
ROC-AUC Score: 0.8585
PR-AUC Score: 0.8361

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.71      0.76      1033
           1       0.74      0.85      0.79      1033

    accuracy                           0.78      2066
   macro avg       0.78      0.78      0.78      2066
weighted avg       0.78      0.78      0.78      2066


EVALUATION: Logistic Regression (Lasso)
Accuracy: 0.7788
ROC-AUC Score: 0.8584
PR-AUC Score: 0.8359

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.71      0.76      1033
           1       0.75      0.85      0.79      1033

    accuracy                           0.78      2066
   macro avg       0.78      0.78      0.78      2066
weighted avg       0.78      0.78      0.78      2066


EVALUATION: Logistic Regression (Ridge)
Accuracy: 0.7778
ROC-AUC Score: 0.8585
PR-AUC Score: 0.8361

Cl