In [1]:
#same dataset as the week 3 one
import numpy as np
import scipy as sc
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder



df = pd.read_csv('Churn_DataSet.csv')

print(df.columns.tolist())

['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn']


In [2]:
#ex2
#Make the necessary preprocessing for developing the models

#remove duplicate values
print(df.size)
df.drop_duplicates(inplace=True)
print(df.size)


#check for missing values
missing_df_values = df.isnull().sum()
print(missing_df_values)

# Encode the target variable 'Churn' to 0 and 1
label_encoder = LabelEncoder()
df['Churn'] = label_encoder.fit_transform(df['Churn'])

# Handle missing values in the 'TotalCharges' column (if any)
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')  # Handle non-numeric values gracefully

# Identify categorical columns for one-hot encoding
nominal_categorical_columns = ['InternetService', 'Contract', 'PaymentMethod']
binary_categorical_columns = ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
                             'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
                             'StreamingTV', 'StreamingMovies', 'PaperlessBilling', 'SeniorCitizen']

# One-hot encoding for nominal categorical columns
df_encoded = pd.get_dummies(df, columns=nominal_categorical_columns, drop_first=True)

# Label encoding for binary categorical columns
label_encoder = LabelEncoder()
for col in binary_categorical_columns:
    df_encoded[col] = label_encoder.fit_transform(df[col])


140640
140200
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64


In [3]:
#ex3
# Split the data into features (X) and the target variable (y)
print(df_encoded)
X = df_encoded.drop('Churn', axis=1)
y = df_encoded['Churn']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)


      gender  SeniorCitizen  Partner  Dependents  tenure  PhoneService  \
0          0              0        1           0       1             0   
1          1              0        0           0      34             1   
2          1              0        0           0       2             1   
3          1              0        0           0      45             0   
4          0              0        0           0       2             1   
...      ...            ...      ...         ...     ...           ...   
7027       1              0        1           1      24             1   
7028       0              0        1           1      72             1   
7029       0              0        1           1      11             0   
7030       1              1        1           0       4             1   
7031       1              0        0           0      66             1   

      MultipleLines  OnlineSecurity  OnlineBackup  DeviceProtection  ...  \
0                 0               0

In [4]:
#ex4
#a) Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

logistic_model = LogisticRegression(max_iter=1000)
logistic_scores = cross_val_score(logistic_model, X_train, y_train, cv=10, scoring='accuracy')
mean_accuracy = logistic_scores.mean()
print("Logistic Regression Mean Accuracy:", mean_accuracy)


Logistic Regression Mean Accuracy: 0.8018904379933792


In [5]:
#ex4
#b) Decision Tree
from sklearn.tree import DecisionTreeClassifier

decision_tree_model = DecisionTreeClassifier()
decision_tree_scores = cross_val_score(decision_tree_model, X_train, y_train, cv=10, scoring='accuracy')
mean_accuracy = decision_tree_scores.mean()
print("Decision Tree Mean Accuracy:", mean_accuracy)


Decision Tree Mean Accuracy: 0.726642475171887


In [6]:
#ex4
#c)Naives Bayes
from sklearn.naive_bayes import GaussianNB

naive_bayes_model = GaussianNB()
naive_bayes_scores = cross_val_score(naive_bayes_model, X_train, y_train, cv=10, scoring='accuracy')
mean_accuracy = naive_bayes_scores.mean()
print("Naive Bayes Mean Accuracy:", mean_accuracy)


Naive Bayes Mean Accuracy: 0.7487538197097021


In [7]:
#ex4
#d) KNN
from sklearn.neighbors import KNeighborsClassifier

knn_model = KNeighborsClassifier()
knn_scores = cross_val_score(knn_model, X_train, y_train, cv=10, scoring='accuracy')
mean_accuracy = knn_scores.mean()
print("KNN Mean Accuracy:", mean_accuracy)


KNN Mean Accuracy: 0.7655092946269417


In [8]:
#ex5
#e)SVM
from sklearn.svm import SVC

svm_model = SVC()
svm_scores = cross_val_score(svm_model, X_train, y_train, cv=10, scoring='accuracy')
mean_accuracy = svm_scores.mean()
print("SVM Mean Accuracy:", mean_accuracy)


SVM Mean Accuracy: 0.735021645021645


In [9]:
#5. Evaluate the performance of various algorithms on the test set.
from sklearn.metrics import accuracy_score

# Assuming you've trained the models earlier (logistic_model, decision_tree_model, naive_bayes_model, knn_model, svm_model)
# Evaluate each model on the test set
models = [logistic_model, decision_tree_model, naive_bayes_model, knn_model, svm_model]
model_names = ["Logistic Regression", "Decision Tree", "Naive Bayes", "KNN", "SVM"]

for model, model_name in zip(models, model_names):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} Test Accuracy: {accuracy:.2f}") 

Logistic Regression Test Accuracy: 0.81
Decision Tree Test Accuracy: 0.72
Naive Bayes Test Accuracy: 0.75
KNN Test Accuracy: 0.76
SVM Test Accuracy: 0.74


In [10]:
#6. Create a MLP network and optimize its configuration to predict the Churn: Yes/No attribute.
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
mlp_model_churn = MLPClassifier()
parameters = {
    'hidden_layer_sizes': [(50, 50), (100, 100), (50, 100, 50)],
    'activation': ['logistic', 'tanh', 'relu'],
    'max_iter': [500, 1000],
}

grid_search = GridSearchCV(mlp_model_churn, parameters, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

best_mlp_model_churn = grid_search.best_estimator_
best_mlp_model_churn.fit(X_train, y_train)
accuracy = best_mlp_model_churn.score(X_test, y_test)
print(f"Best MLP Classifier for Churn Test Accuracy: {accuracy:.2f}")


Best MLP Classifier for Churn Test Accuracy: 0.78


In [11]:
#ex7
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

mlp_model_monthly_charges = MLPRegressor()
parameters = {
    'hidden_layer_sizes': [(50, 50), (100, 100), (50, 100, 50)],
    'activation': ['logistic', 'tanh', 'relu'],
    'max_iter': [500, 1000],
}

grid_search = GridSearchCV(mlp_model_monthly_charges, parameters, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

best_mlp_model_monthly_charges = grid_search.best_estimator_
best_mlp_model_monthly_charges.fit(X_train, y_train)
y_pred = best_mlp_model_monthly_charges.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Best MLP Regressor for MonthlyCharges Mean Squared Error: {mse:.2f}")


Best MLP Regressor for MonthlyCharges Mean Squared Error: 0.15
