In [1]:
import pandas as pd

In [18]:
from google.colab import drive

# Mount Google Drive inside the Colab runtime; the trained models are later
# written beneath this mount point.
drive.mount("/content/gdrive")

Mounted at /content/gdrive


## Loading Dataset

In [2]:
# Read the telecom churn CSV from the Colab working directory into a DataFrame.
df = pd.read_csv('/content/telecom_churn.csv')

In [3]:
# Display the loaded DataFrame; Jupyter renders a truncated preview
# (first and last rows) rather than every record.
df

Unnamed: 0,Churn,AccountWeeks,ContractRenewal,DataPlan,DataUsage,CustServCalls,DayMins,DayCalls,MonthlyCharge,OverageFee,RoamMins
0,0,128,1,1,2.70,1,265.1,110,89.0,9.87,10.0
1,0,107,1,1,3.70,1,161.6,123,82.0,9.78,13.7
2,0,137,1,0,0.00,0,243.4,114,52.0,6.06,12.2
3,0,84,0,0,0.00,2,299.4,71,57.0,3.10,6.6
4,0,75,0,0,0.00,3,166.7,113,41.0,7.42,10.1
...,...,...,...,...,...,...,...,...,...,...,...
3328,0,192,1,1,2.67,2,156.2,77,71.7,10.78,9.9
3329,0,68,1,0,0.34,3,231.1,57,56.4,7.67,9.6
3330,0,28,1,0,0.00,2,180.8,109,56.0,14.44,14.1
3331,0,184,0,0,0.00,2,213.8,105,50.0,7.98,5.0


## EDA and Data Cleaning

In [4]:
# Count the rows in each Churn class to see how balanced the target is.
df["Churn"].value_counts()

0    2850
1     483
Name: Churn, dtype: int64

In [5]:
# Number of missing values per column (isna is the same check as isnull).
df.isna().sum()

Churn              0
AccountWeeks       0
ContractRenewal    0
DataPlan           0
DataUsage          0
CustServCalls      0
DayMins            0
DayCalls           0
MonthlyCharge      0
OverageFee         0
RoamMins           0
dtype: int64

### Standardization

In [6]:
from sklearn.preprocessing import StandardScaler

In [7]:
# Separate the label from the predictors: y is the Churn column,
# X is every remaining column.
y = df["Churn"]
X = df.drop(columns="Churn")

In [8]:
# Standardize every feature column with StandardScaler.
# NOTE(review): the scaler is fit on all rows of X, so any train/test split
# taken from X_scaled afterwards has already seen test-set statistics.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

### Classification

In [19]:
import joblib

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [10]:
# Split the raw (unscaled) features into train and test sets first, then fit
# the scaler on the training rows only. Fitting StandardScaler on the full
# dataset before splitting lets the test rows' mean/variance shape the
# training features (data leakage), which can bias the test metrics.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling parameters from the training rows, then apply the same
# transform to both partitions so the test set stays genuinely unseen.
train_scaler = StandardScaler().fit(X_train_raw)
X_train = train_scaler.transform(X_train_raw)
X_test = train_scaler.transform(X_test_raw)

In [11]:
# Baseline classifier: logistic regression with default settings, fitted on
# the (still imbalanced) training split.
model = LogisticRegression().fit(X_train, y_train)

# Predicted churn labels for the held-out test rows.
y_pred = model.predict(X_test)

In [20]:
# Persist the fitted baseline model to the mounted Google Drive.
# NOTE(review): only the estimator is written; the feature scaler used to
# prepare its inputs is not saved here, so a consumer of this file must
# reproduce the same scaling before calling predict.
joblib.dump(value=model, filename="/content/gdrive/My Drive/prediction1.joblib")

['/content/gdrive/My Drive/prediction1.joblib']

In [12]:
# Per-class precision / recall / F1 of the baseline model on the test split.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.87      0.98      0.92       566
           1       0.62      0.18      0.28       101

    accuracy                           0.86       667
   macro avg       0.75      0.58      0.60       667
weighted avg       0.83      0.86      0.82       667



Here we can see that the recall for class 1 (churn) is very low (0.18), because the dataset is imbalanced: only 483 of the 3,333 customers churned.

### Handling the Imbalanced Dataset

In [13]:
from imblearn.over_sampling import RandomOverSampler

In [14]:
# Re-create the train/test split for the oversampling experiment. The raw
# features are split first and the scaler is fit on the training rows only;
# scaling the whole dataset before splitting would leak the test rows'
# mean/variance into the training features.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Apply the training-set scaling parameters to both partitions.
train_scaler = StandardScaler().fit(X_train_raw)
X_train = train_scaler.transform(X_train_raw)
X_test = train_scaler.transform(X_test_raw)

In [15]:
# Balance the classes in the training split with RandomOverSampler.
# Only the training data is resampled; the test split is left untouched.
oversampler = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = oversampler.fit_resample(
    X=X_train,
    y=y_train,
)

# Confirm that both classes now have the same number of training rows.
class_counts = y_train_resampled.value_counts()
print(class_counts)

0    2284
1    2284
Name: Churn, dtype: int64


### Classification Model

In [22]:
# Second classifier: the same default logistic regression, this time fitted
# on the oversampled (class-balanced) training data.
model1 = LogisticRegression().fit(X_train_resampled, y_train_resampled)

# Predicted churn labels for the test rows; this overwrites the baseline
# model's y_pred.
y_pred = model1.predict(X_test)

In [23]:
# Persist the model trained on the oversampled data to the mounted Google Drive.
# NOTE(review): only the estimator is written; the feature scaler used to
# prepare its inputs is not saved here.
joblib.dump(value=model1, filename="/content/gdrive/My Drive/prediction2.joblib")

['/content/gdrive/My Drive/prediction2.joblib']

In [17]:
# Per-class precision / recall / F1 of the oversampled model on the test split.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)


              precision    recall  f1-score   support

           0       0.95      0.78      0.86       566
           1       0.39      0.79      0.52       101

    accuracy                           0.78       667
   macro avg       0.67      0.79      0.69       667
weighted avg       0.87      0.78      0.81       667

