In [14]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


In [2]:
# Read the customer export into the working frame `df`; the cells below clean
# and extend this same object.
df = pd.read_csv(filepath_or_buffer='telecom_customer_churn.csv')

# Print the column inventory (non-null counts and dtypes) to see which fields need cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 38 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Customer ID                        7043 non-null   object 
 1   Gender                             7043 non-null   object 
 2   Age                                7043 non-null   int64  
 3   Married                            7043 non-null   object 
 4   Number of Dependents               7043 non-null   int64  
 5   City                               7043 non-null   object 
 6   Zip Code                           7043 non-null   int64  
 7   Latitude                           7043 non-null   float64
 8   Longitude                          7043 non-null   float64
 9   Number of Referrals                7043 non-null   int64  
 10  Tenure in Months                   7043 non-null   int64  
 11  Offer                              3166 non-null   objec

In [3]:
# Count rows that exactly repeat an earlier row; the cell displays that count.
df.duplicated(keep='first').sum()

0

In [4]:
# Label missing 'Offer' entries with the literal string 'None' so the column
# carries no NaN into the one-hot encoding step.
df.loc[df['Offer'].isna(), 'Offer'] = 'None'

In [5]:
# For customers flagged as having no phone service, pin the phone-only fields
# to fixed values. The mask is computed once and reused for both columns.
no_phone = df['Phone Service'] == 'No'
df.loc[no_phone, 'Multiple Lines'] = 'No'
df.loc[no_phone, 'Avg Monthly Long Distance Charges'] = 0

In [6]:
# Add-on columns that this cleaning step ties to the 'Internet Service' flag.
internet_cols = [
    'Online Security', 'Online Backup', 'Device Protection Plan', 'Premium Tech Support',
    'Streaming TV', 'Streaming Movies', 'Streaming Music', 'Unlimited Data',
]

# Set every add-on to 'No' for customers without internet service. A single
# vectorized assignment builds the row mask once instead of once per column
# and leaves no stray loop variable in the notebook namespace.
df.loc[df['Internet Service'] == 'No', internet_cols] = 'No'

In [7]:
# Customers without internet service get the explicit category 'None' as their
# 'Internet Type'; every other row keeps its existing value.
df['Internet Type'] = df['Internet Type'].mask(df['Internet Service'] == 'No', 'None')

In [8]:
# Replace missing download volumes with 0.
# NOTE(review): this fills every NaN, not only rows without internet service —
# confirm no internet customers have a genuinely missing value here.
df.loc[df['Avg Monthly GB Download'].isna(), 'Avg Monthly GB Download'] = 0

In [9]:
# Binary target: 1 when 'Customer Status' is exactly 'Churned', 0 for any other status.
df['Churn'] = df['Customer Status'].eq('Churned').astype(int)

In [10]:
# Model inputs, grouped by how they are encoded later in the notebook.

# Numeric columns: used as-is (only cast to float with the rest of the matrix).
features_num = [
    'Age',
    'Number of Dependents',
    'Number of Referrals',
    'Tenure in Months',
    'Avg Monthly Long Distance Charges',
    'Avg Monthly GB Download',
    'Monthly Charge',
]

# Categorical columns: one-hot encoded before modelling. Order matters because
# it determines the column order of the design matrix.
features_cat = [
    'Gender',
    'Married',
    'Offer',
    'Phone Service',
    'Multiple Lines',
    'Internet Type',
    'Online Security',
    'Online Backup',
    'Device Protection Plan',
    'Premium Tech Support',
    'Streaming TV',
    'Streaming Movies',
    'Streaming Music',
    'Unlimited Data',
    'Contract',
    'Paperless Billing',
    'Payment Method',
]

In [11]:
# Predictor frame: the numeric columns followed by the categorical ones.
X = df.loc[:, [*features_num, *features_cat]]
# Target vector: the 0/1 churn flag.
y = df.loc[:, 'Churn']

In [12]:
# Build the design matrix directly from `df` instead of overwriting `X` in
# terms of itself. The original `X = get_dummies(X, ...)` raised KeyError when
# the cell was run a second time, because the categorical columns had already
# been consumed; this form yields the same matrix and can be re-run safely.
#
# drop_first=True drops one dummy level per categorical column. The final cast
# turns the integer and indicator columns into floats for the estimator.
X = pd.get_dummies(
    df[features_num + features_cat],
    columns=features_cat,
    drop_first=True,
).astype(float)

# Fail here, with a clear message, rather than inside the estimator's fit call.
assert not X.isna().any().any(), "X still contains missing values; revisit the cleaning cells"

In [16]:
# Hold out 20% of the rows for evaluation. `stratify=y` keeps the churn ratio
# the same in both parts, and the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [20]:
# Fit the classifier on the training part; `fit` returns the estimator itself.
# The raised iteration cap gives the solver room to converge.
model = LogisticRegression(max_iter=10000).fit(X_train, y_train)

# Score the held-out rows and print per-class precision / recall / F1.
y_pred = model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.87      0.89      0.88      1035
           1       0.69      0.64      0.66       374

    accuracy                           0.83      1409
   macro avg       0.78      0.77      0.77      1409
weighted avg       0.82      0.83      0.82      1409



In [23]:
import joblib

# Refit the estimator on every labelled row before exporting it.
# NOTE: the classification report printed earlier describes the fit on the
# training split only, not this final refit.
model.fit(X, y)
joblib.dump(model, "model.pkl")

# Persist the column order of `X`, the frame the exported model was actually
# fitted on (the original read it from `X_train`, a different object than the
# one used for the final fit). Presumably inference code uses this list to
# rebuild the same one-hot layout — confirm against the consumer.
training_columns = X.columns.tolist()
joblib.dump(training_columns, "training_columns.pkl")


['training_columns.pkl']