In [38]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_csv("Telecom Churn Prediction.csv")

In [3]:
df.drop("customerID", axis=1, inplace=True)

In [4]:
# Define a mapping dictionary

df['gender'] = df['gender'].map({'Male': 0, 'Female': 1})
df['MultipleLines'] = df['MultipleLines'].map({'No': 0, 'Yes': 1, 'No phone service': 2})
df['Contract'] = df['Contract'].map({'Month-to-month': 0, 'One year': 1, 'Two year': 2})
df['InternetService'] = df['InternetService'].map({'No': 0, 'DSL': 1, 'Fiber optic': 2})
df['PaymentMethod'] = df['PaymentMethod'].map({'Electronic check': 0, 'Mailed check': 1, 'Bank transfer (automatic)': 2, 'Credit card (automatic)': 3})
df['TotalCharges']=df['TotalCharges'].replace({' ': 0})
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'])

for col in ['Partner', 'Dependents', 'PhoneService', 'PaperlessBilling', 'Churn']:
    df[col] = df[col].map({'No': 0, 'Yes': 1})
for col in ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']:
    df[col] = df[col].map({'No': 0, 'Yes': 1, 'No internet service': 2})

In [5]:
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,1,0,1,0,1,0,2,1,0,1,0,0,0,0,0,1,0,29.85,29.85,0
1,0,0,0,0,34,1,0,1,1,0,1,0,0,0,1,0,1,56.95,1889.5,0
2,0,0,0,0,2,1,0,1,1,1,0,0,0,0,0,1,1,53.85,108.15,1
3,0,0,0,0,45,0,2,1,1,0,1,1,0,0,1,0,2,42.3,1840.75,0
4,1,0,0,0,2,1,0,2,0,0,0,0,0,0,0,1,0,70.7,151.65,1


In [8]:
# Using min max scaler technique on tenure, monthlycharges and total charges column

scaler = MinMaxScaler()
df[['tenure', 'MonthlyCharges', 'TotalCharges']] = scaler.fit_transform(df[['tenure', 'MonthlyCharges', 'TotalCharges']])

In [10]:
# Splitting the data

X = df.drop("Churn", axis=1)
y = df['Churn']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.7, random_state=100)

In [18]:
smote = SMOTE(random_state=42)
X_oversampled, y_oversampled = smote.fit_resample(X_train, y_train)
print("Shape of X_oversampled:", X_oversampled.shape)
print("Shape of y_oversampled:", y_oversampled.shape)

Shape of X_oversampled: (3120, 19)
Shape of y_oversampled: (3120,)


In [22]:
# Logistic regression using hyperparameter
logistic = LogisticRegression(max_iter=1000)

In [23]:
# Testing data
logistic.fit(X_train, y_train)

predict = logistic.predict(X_test)

accuracy = accuracy_score(y_test, predict)
print(f"Testing Accuracy: {accuracy * 100:.2f}%")

Testing Accuracy: 79.42%


In [28]:
# Logistic regression using regularization type L1 and regularization strength (C)
# The 'liblinear' solver is a good choice for small datasets and binary classification.

logistic = LogisticRegression(penalty='l1', C=0.01, solver='liblinear')

In [29]:
# Testing data
logistic.fit(X_train, y_train)

predict = logistic.predict(X_test)

accuracy = accuracy_score(y_test, predict)
print(f"Testing Accuracy: {accuracy * 100:.2f}%")

Testing Accuracy: 73.29%


In [34]:
# Logistic regression using regularization type L2 and regularization strength (C) and some hyperparameters

logistic = LogisticRegression(max_iter=1000, penalty='l2', C=1.0, solver='lbfgs', tol=1e-4, class_weight=None, random_state=42)

In [35]:
# Testing data
logistic.fit(X_train, y_train)

predict = logistic.predict(X_test)

accuracy = accuracy_score(y_test, predict)
print(f"Testing Accuracy: {accuracy * 100:.2f}%")

Testing Accuracy: 79.42%


In [32]:
# Logistic regression using regularization type elastic net and regularization strength (C)
# The solver 'saga' supports Elastic net regularization.

logistic = LogisticRegression(penalty='elasticnet', C=0.01, solver='saga', l1_ratio=0.5)

In [33]:
# Testing data
logistic.fit(X_train, y_train)

predict = logistic.predict(X_test)

accuracy = accuracy_score(y_test, predict)
print(f"Testing Accuracy: {accuracy * 100:.2f}%")

Testing Accuracy: 77.51%


In [40]:
# Initialize the Decision Tree model
descision_obj = DecisionTreeClassifier()

# Train the model with the training data
descision_obj.fit(X_train, y_train)

# Make predictions on the test data
predictions = descision_obj.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 71.63%


In [45]:
dt_classifier = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, random_state=42)

# Train the model with the training data
dt_classifier.fit(X_train, y_train)

# Make predictions on the test data
predictions = dt_classifier.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 71.53%


In [56]:
# Initialize the Random Forest model
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)  # 100 trees in the forest

# Train the model with the training data
rf_clf.fit(X_train, y_train)

# Make predictions on the test data
predictions = rf_clf.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 78.22%


In [46]:
rf_classifier = RandomForestClassifier(n_estimators=100, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, random_state=42)

# Train the model with the training data
rf_classifier.fit(X_train, y_train)

# Make predictions on the test data
predictions = rf_classifier.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 78.22%


In [42]:
# Initialize the SVM model
svm_model = SVC()

# Train the model with the training data
svm_model.fit(X_train, y_train)

# Make predictions on the test data
predictions = svm_model.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy * 100:.2f}%")


Accuracy: 78.81%


In [47]:
svm_classifier = SVC(C=1.0, kernel='rbf', gamma='scale', random_state=42)

# Train the model with the training data
svm_classifier.fit(X_train, y_train)

# Make predictions on the test data
predictions = svm_classifier.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 78.81%


In [44]:
knn = KNeighborsClassifier()

# Train the model with the training data
knn.fit(X_train, y_train)

# Make predictions on the test data
predictions = knn.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 74.57%


In [55]:
knn_classifier = KNeighborsClassifier(n_neighbors=10, weights='uniform', algorithm='auto')

# Train the model with the training data
knn_classifier.fit(X_train, y_train)

# Make predictions on the test data
predictions = knn_classifier.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 76.80%
