In [7]:
import pandas as pd

file_name="dataset_task1.csv"

df=pd.read_csv(file_name)
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [8]:
# Check how many duplicate rows exist
duplicates = df.duplicated().sum()
print("Number of duplicate rows:", duplicates)


Number of duplicate rows: 0


In [9]:
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns


In [10]:
for col in numeric_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
    print(f"{col} has {outliers.shape[0]} outliers")


SeniorCitizen has 1142 outliers
tenure has 0 outliers
MonthlyCharges has 0 outliers


In [11]:
df['SeniorCitizen'].value_counts()


SeniorCitizen
0    5901
1    1142
Name: count, dtype: int64

In [12]:
#The outlier method is flagging 1 as an outlier probably. But its not.

In [13]:
from sklearn.impute import SimpleImputer
import pandas as pd

# We drop the target column(1 column) and name the target column y. The remaining is X
X=df.drop('Churn', axis=1)
y=df['Churn']

# Imputation(filling missing values) is not required here because no null values.

In [14]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test= train_test_split(X,y,test_size=0.2,random_state=42,stratify=y)

In [15]:
# Tells us the distribution of the target column. How many are no and how many are yes in this case

y.value_counts()

Churn
No     5174
Yes    1869
Name: count, dtype: int64

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE




categorical_cols= X.select_dtypes(include=['object']).columns
numerical_cols= X.select_dtypes(exclude=['object']).columns

preprocessor= ColumnTransformer(
    transformers=[
        ('num',StandardScaler(),numerical_cols),
        ('cat',OneHotEncoder(drop='first',handle_unknown='ignore', sparse_output=False), categorical_cols)
    ]
)

# We see from y.value_counts() that there is an imbalance in the dataset. No's are more than yes. So we use SMOTE to make synthetic samples to balance the dataset.

pipeline=Pipeline(steps=[
    ('preprocessor',preprocessor),
    ('smote', SMOTE(random_state=42)),
])

X_train_processed = pipeline.named_steps['preprocessor'].fit_transform(X_train)
X_test_processed = pipeline.named_steps['preprocessor'].transform(X_test)

X_resampled, y_resampled = pipeline.named_steps['smote'].fit_resample(
    pipeline.named_steps['preprocessor'].fit_transform(X_train), y_train
)




In [24]:
from collections import Counter
from sklearn.pipeline import Pipeline


# After applying SMOTE
X_resampled, y_resampled = pipeline.named_steps['smote'].fit_resample(
    pipeline.named_steps['preprocessor'].fit_transform(X_train), y_train
)

print("Original y_train:", Counter(y_train))
print("After SMOTE:", Counter(y_resampled))



Original y_train: Counter({'No': 4139, 'Yes': 1495})
After SMOTE: Counter({'No': 4139, 'Yes': 4139})


In [29]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(class_weight='balanced', random_state=42)
rf.fit(X_resampled, y_resampled)

y_pred_rf = rf.predict(X_test_processed)

print(confusion_matrix(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))




[[911 124]
 [175 199]]
              precision    recall  f1-score   support

          No       0.84      0.88      0.86      1035
         Yes       0.62      0.53      0.57       374

    accuracy                           0.79      1409
   macro avg       0.73      0.71      0.72      1409
weighted avg       0.78      0.79      0.78      1409



In [38]:
from xgboost import XGBClassifier

# Encode target as 0/1
y_resampled_numeric = y_resampled.map({'No': 0, 'Yes': 1})
y_test_numeric = y_test.map({'No': 0, 'Yes': 1})

xgb = XGBClassifier(scale_pos_weight=1)  # Since we already used SMOTE
xgb.fit(X_resampled, y_resampled_numeric)

y_pred_xgb = xgb.predict(X_test_processed)

y_pred_labels = ['Yes' if p == 1 else 'No' for p in y_pred_xgb]

print(confusion_matrix(y_test, y_pred_labels))
print(classification_report(y_test, y_pred_labels))


[[876 159]
 [154 220]]
              precision    recall  f1-score   support

          No       0.85      0.85      0.85      1035
         Yes       0.58      0.59      0.58       374

    accuracy                           0.78      1409
   macro avg       0.72      0.72      0.72      1409
weighted avg       0.78      0.78      0.78      1409



In [41]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

y_pred_rf = rf.predict(X_test_processed)          # Yes / No output
y_proba_rf = rf.predict_proba(X_test_processed)   # [[prob_No, prob_Yes], ...]

y_pred_xgb = xgb.predict(X_test_processed)
y_proba_xgb = xgb.predict_proba(X_test_processed)

y_pred_xgb_mapped = ["Yes" if p == 1 else "No" for p in y_pred_xgb]

def evaluate_model(name, y_true, y_pred, y_proba=None):
    print(f"\n------ {name} ------")
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Precision (Yes):", precision_score(y_true, y_pred, pos_label="Yes"))
    print("Recall (Yes):", recall_score(y_true, y_pred, pos_label="Yes"))
    print("F1-score (Yes):", f1_score(y_true, y_pred, pos_label="Yes"))
    if y_proba is not None:
        print("ROC-AUC:", roc_auc_score((y_true=="Yes").astype(int), y_proba[:,1]))

# Random Forest
evaluate_model("Random Forest", y_test, y_pred_rf, y_proba_rf)

# XGBoost
evaluate_model("XGBoost", y_test, y_pred_xgb_mapped, y_proba_xgb)



------ Random Forest ------
Accuracy: 0.7877927608232789
Precision (Yes): 0.6160990712074303
Recall (Yes): 0.5320855614973262
F1-score (Yes): 0.5710186513629842
ROC-AUC: 0.8265261308739571

------ XGBoost ------
Accuracy: 0.7778566359119943
Precision (Yes): 0.5804749340369393
Recall (Yes): 0.5882352941176471
F1-score (Yes): 0.5843293492695883
ROC-AUC: 0.8312265881319589


In [42]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import confusion_matrix, classification_report

qda = QuadraticDiscriminantAnalysis(reg_param=0.1)
qda.fit(X_resampled, y_resampled)

X_test_processed = preprocessor.transform(X_test)

# Predict on test set
y_pred = qda.predict(X_test_processed)

# Evaluation
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))




Confusion Matrix:
[[527 508]
 [ 34 340]]

Classification Report:
              precision    recall  f1-score   support

          No       0.94      0.51      0.66      1035
         Yes       0.40      0.91      0.56       374

    accuracy                           0.62      1409
   macro avg       0.67      0.71      0.61      1409
weighted avg       0.80      0.62      0.63      1409

