In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [2]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [4]:
# Folder holding the churn CSV files. Defaults to the original local folder;
# set the CHURN_DATA_DIR environment variable to run the notebook on another
# machine without editing this cell.
DATA_DIR = os.environ.get("CHURN_DATA_DIR", r"D:\Final_tasks\classification")

# The dataset ships as two separate files: one is used for fitting, the other
# for evaluation (file names suggest an 80/20 split -- TODO confirm).
train_df = pd.read_csv(os.path.join(DATA_DIR, "churn-bigml-80.csv"))
test_df = pd.read_csv(os.path.join(DATA_DIR, "churn-bigml-20.csv"))

In [5]:
# Name of the label column. It is cast to int in place in both frames so the
# metric functions used later receive an integer target.
TARGET = "Churn"
for frame in (train_df, test_df):
    frame[TARGET] = frame[TARGET].astype(int)


In [6]:
# Separate each frame into its feature matrix (every column except the label)
# and its label vector.
X_train, y_train = train_df.drop(columns=TARGET), train_df[TARGET]
X_test, y_test = test_df.drop(columns=TARGET), test_df[TARGET]

In [7]:
# Partition the feature columns by dtype: numeric columns feed the numeric
# branch of the preprocessor, every other column feeds the categorical branch.
numeric_cols = X_train.select_dtypes(include="number").columns.to_list()
categorical_cols = X_train.select_dtypes(exclude="number").columns.to_list()

In [8]:
# Numeric branch: fill missing values with the column median, then standardize.
numeric_transform = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. Categories not seen during fit are ignored at
# transform time instead of raising.
categorical_transform = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group through its own branch.
preprocess = ColumnTransformer(transformers=[
    ("num", numeric_transform, numeric_cols),
    ("cat", categorical_transform, categorical_cols),
])


In [9]:
# Fixed seed for the tree-based estimators. Without it, the decision tree and
# random forest are re-randomized on every fit and the reported scores change
# from run to run.
RANDOM_STATE = 42

# Candidate classifiers, keyed by the label shown in the results table.
models = {
    "LogisticRegression": LogisticRegression(max_iter=500),
    "Decision Tree": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "Random Forest": RandomForestClassifier(random_state=RANDOM_STATE),
}

In [16]:
# Accumulator for the per-model score rows built by the evaluation loop.
results = list()

In [17]:
# Start from an empty list every time this cell runs. Without the reset,
# re-running the cell appends a second copy of every model's row to `results`
# and the summary table shows duplicates.
results = []

for name, model in models.items():
    # Chain preprocessing and estimator so the imputers, scaler and encoder
    # are fitted on the training data only and then applied to the test data.
    pipe = Pipeline([
        ("preprocess", preprocess),
        ("model", model),
    ])

    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    # Scores on the held-out test frame. Precision, recall and F1 are called
    # with scikit-learn's default arguments.
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Row layout: [model label, accuracy, precision, recall, f1].
    results.append([name, acc, prec, rec, f1])

In [19]:
# Tabulate the score rows collected in `results` (one row per model) and
# print the table under a banner.
summary_df = pd.DataFrame(
    data=results,
    columns=["Model", "Accuracy", "Precision", "Recall", "F1-Score"],
)
print("\n*** MODEL PERFORMANCE SUMMARY ***\n", summary_df, sep="\n")


*** MODEL PERFORMANCE SUMMARY ***

                Model  Accuracy  Precision    Recall  F1-Score
0  LogisticRegression  0.857571   0.500000  0.242105  0.326241
1       Decision Tree  0.926537   0.739583  0.747368  0.743455
2       Random Forest  0.946027   0.983607  0.631579  0.769231
