In [30]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from imblearn.over_sampling import SMOTE
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import joblib


df = pd.read_csv('/Users/laraturunc/Desktop/churn data/Telco-Customer-Churn-Dataset.csv')


df.drop('customerID', axis=1, inplace=True)
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df.fillna(df['TotalCharges'].median(), inplace=True)

categorical_cols = df.select_dtypes(include=['object']).columns.drop('Churn')
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns


scaler = StandardScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])


encoder = OneHotEncoder(sparse=False, drop='first')
encoded_categorical_data = encoder.fit_transform(df[categorical_cols])
encoded_categorical_df = pd.DataFrame(encoded_categorical_data, columns=encoder.get_feature_names_out(categorical_cols))

df = df.drop(categorical_cols, axis=1)
df = pd.concat([df.reset_index(drop=True), encoded_categorical_df.reset_index(drop=True)], axis=1)


df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})


X = df.drop('Churn', axis=1)
y = df['Churn']


X, y = SMOTE().fit_resample(X, y)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


lgbm_model = LGBMClassifier()
lgbm_model.fit(X_train, y_train)


predictions = lgbm_model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)
f1 = f1_score(y_test, predictions)


print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')


joblib.dump(lgbm_model, 'telco_customer_churn_lightgbm_model.pkl')




[LightGBM] [Info] Number of positive: 3643, number of negative: 3600
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001844 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2545
[LightGBM] [Info] Number of data points in the train set: 7243, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.502968 -> initscore=0.011874
[LightGBM] [Info] Start training from score 0.011874
Accuracy: 0.8557165861513688
Precision: 0.8477842003853564
Recall: 0.8621815806662312
F1 Score: 0.854922279792746


['telco_customer_churn_lightgbm_model.pkl']

In [31]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

base_learners = [
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('gb', GradientBoostingClassifier(n_estimators=100, random_state=42)),
    ('svc', make_pipeline(StandardScaler(), SVC(probability=True)))
]


meta_learner = LogisticRegression()


stacking_clf = StackingClassifier(estimators=base_learners, final_estimator=meta_learner, cv=5)


stacking_clf.fit(X_train, y_train)

predictions = stacking_clf.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)
f1 = f1_score(y_test, predictions)


print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')





Accuracy: 0.863768115942029
Precision: 0.8611473272490222
Recall: 0.8628347485303723
F1 Score: 0.8619902120717781


In [34]:
joblib.dump(stacking_clf, 'telco_customer_churn_model_high_acc.pkl')

['telco_customer_churn_model_high_acc.pkl']