In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

import pickle

In [None]:
# Load the Telco customer-churn CSV from the current working directory.
df = pd.read_csv(filepath_or_buffer="WA_Fn-UseC_-Telco-Customer-Churn.csv")

# Data cleaning

In [None]:
# DataFrame.info() writes its summary straight to stdout and returns None, so it
# is called on its own; wrapping it in print() would emit a stray "None".
df.info()
print("\n\n")
print(df.head())

In [None]:
# Per-column count of missing values in the raw frame.
df.isna().sum()

In [None]:
# List the distinct values of every column so placeholder categories
# (e.g. "No internet service") are easy to spot.
for column_name in df.columns:
    distinct_values = df[column_name].unique()
    print(column_name)
    print(distinct_values, "\n")

In [None]:
# Encode the binary Yes/No columns as 1/0.
# The "No internet service" / "No phone service" placeholders are mapped to NaN.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
# NOTE(review): rows containing these NaNs are removed by the later dropna() —
# confirm that discarding those customers is intended.
colToReplace = ['Partner', 'Dependents', 'PhoneService', 'MultipleLines',
                'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'Churn']
df[colToReplace] = df[colToReplace].replace(
    {"Yes": 1, "No": 0, "No internet service": np.nan, "No phone service": np.nan}
)
print(df.head())

In [None]:
# Parse TotalCharges as numbers; entries that cannot be parsed become NaN.
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")
df.isna().sum()

In [None]:

# Discard every row that still holds at least one missing value.
df.dropna(axis=0, how="any", inplace=True)

# Keep all columns except the first one.
df2 = df.iloc[:, 1:]
df2.head()
# One-hot encode the categorical columns that remain.
new_df = pd.get_dummies(data=df2)
new_df.head()

In [None]:
# Cross-check: count the missing values left in each column of the encoded frame.
print(new_df.isna().sum())

# Correlation and feature selection

In [None]:
# Pairwise correlation between every column of the encoded frame.
corr = new_df.corr()

# Draw the matrix as a heatmap on a fixed [-1, 1] colour scale.
fig, ax = plt.subplots(figsize=(20, 20))
cax = ax.matshow(corr, cmap='coolwarm', vmin=-1, vmax=1)
fig.colorbar(cax)
ticks = np.arange(0, len(new_df.columns), 1)
ax.set_xticks(ticks)
ax.set_xticklabels(new_df.columns, rotation=90)
ax.set_yticks(ticks)
ax.set_yticklabels(new_df.columns)

# Write the rounded correlation value into each cell.
# corr.iloc[i, j] is a single positional lookup; the chained corr.iloc[i][j]
# relied on integer keys being treated as positions on a label-indexed Series,
# which pandas has deprecated.
for i in range(new_df.shape[1]):
    for j in range(new_df.shape[1]):
        ax.text(j, i, round(corr.iloc[i, j], 2), ha="center", va="center", color="w")

plt.show()

In [None]:
# The ten columns with the largest correlation to Churn
# (the Churn column itself is part of this list).
top_features = new_df.corr().nlargest(10, 'Churn')['Churn']
print(top_features)
print()
# tolist must be called; printing the bare attribute shows the bound method.
print(top_features.index.tolist())

# **Evaluating the Algorithms**
■ Logistic Regression
■ K-Nearest Neighbors (KNN)
■ Support Vector Machines (SVM)—Linear
■ Support Vector Machines (SVM) RBF Kernels

In [None]:
# Feature matrix: the columns chosen for modelling.
feature_columns = [
    'Contract_Month-to-month',
    'TechSupport_No',
    'PaymentMethod_Electronic check',
    'InternetService_Fiber optic',
    'PaperlessBilling_Yes',
    'SeniorCitizen',
    'StreamingMovies_No',
    'StreamingTV_No',
    'MonthlyCharges',
]
X = new_df[feature_columns]

# Target vector as a plain NumPy array.
y = new_df['Churn'].to_numpy()

In [None]:
# Logistic regression: mean accuracy over 10-fold cross-validation.
log_regress = linear_model.LogisticRegression(max_iter=2000)
log_regress_fold_scores = cross_val_score(
    log_regress, X, y, cv=10, scoring='accuracy'
)
log_regress_score = log_regress_fold_scores.mean()
print(log_regress_score)


In [None]:
# Start the list of per-model scores with the logistic-regression result.
result = [log_regress_score]

In [None]:
# KNN: search for the neighbour count with the best cross-validated accuracy.
# NOTE(review): this search runs on a random subsample, whereas the other models
# are scored on the full X/y — keep that in mind when comparing scores.

# A fixed-seed generator makes the subsample, and therefore the selected k and
# the reported score, reproducible across kernel restarts.
rng = np.random.default_rng(42)

# Never request more rows than exist; sampling without replacement would raise.
sample_size = min(1000, len(X))

sampled_indices = rng.choice(len(X), size=sample_size, replace=False)
X_sampled = X.iloc[sampled_indices]
y_sampled = y[sampled_indices]

folds = 10

# Candidate values of k: 1, 6, 11, ... bounded by the approximate number of
# samples in one training split (the fraction (folds - 1) / folds of the subsample).
ks = list(range(1, int(len(X_sampled) * ((folds - 1) / folds)), 5))

cv_scores = []
for k in ks:
    knn = KNeighborsClassifier(n_neighbors=k)
    score = cross_val_score(knn, X_sampled, y_sampled, cv=folds, scoring='accuracy').mean()
    cv_scores.append(score)

# Best mean accuracy found during the search.
knn_score = max(cv_scores)

# The k that produced it (the first one, if several tie).
optimal_k = ks[cv_scores.index(knn_score)]

print(f"The optimal number of neighbors is {optimal_k}")
print(knn_score)


In [None]:
# Record the KNN score after the logistic-regression one.
result.extend([knn_score])

In [None]:
# SVM with a linear kernel: mean accuracy over 10-fold cross-validation.
linear_svm = svm.SVC(kernel='linear')
linear_svm_fold_scores = cross_val_score(linear_svm, X, y, cv=10, scoring='accuracy')
linear_svm_score = linear_svm_fold_scores.mean()
print(linear_svm_score)
result.append(linear_svm_score)

# Blank line between the two scores in the output.
print()

# SVM with an RBF kernel, scored the same way.
rbf = svm.SVC(kernel='rbf')
rbf_fold_scores = cross_val_score(rbf, X, y, cv=10, scoring='accuracy')
rbf_score = rbf_fold_scores.mean()
print(rbf_score)
result.append(rbf_score)

In [None]:
# Summary table of mean cross-validation accuracy per algorithm, best first.
# The table is built from the individually named scores rather than from the
# `result` list: that list is appended to across several cells, so re-running
# any of them would make its length disagree with `algorithms`.
algorithms = ["Logistic Regression", "K Nearest Neighbors", "SVM LinearKernel", "SVM RBF Kernel"]
cv_mean = pd.DataFrame(
    {"Accuracy": [log_regress_score, knn_score, linear_svm_score, rbf_score]},
    index=algorithms,
)
cv_mean.sort_values(by="Accuracy", ascending=False)

# Training and predicting

In [None]:
# Logistic regression on a fixed 80/20 train/test split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

logreg = linear_model.LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)

# Predicted labels for the held-out rows.
y_pred = logreg.predict(X_test)

# Confusion matrix followed by the scalar metrics.
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

for metric_label, metric_value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
):
    print(metric_label, metric_value)

In [None]:
# Persist the fitted model. The context manager guarantees the file handle is
# flushed and closed even if pickling fails.
filename = 'churn.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(logreg, model_file)

# New instance for prediction; values are given in the order of the columns of X.
data = [[1, 1, 0, 0, 0, 0, 1, 1, 56.95]]

# The model was fitted on a DataFrame, so the instance is wrapped in a frame
# with the same column names; a bare list makes scikit-learn warn about
# missing feature names.
new_instance = pd.DataFrame(data, columns=X.columns)
prediction = logreg.predict(new_instance)

# Report the predicted class.
if prediction[0] == 0:
    print("Churn: No")
else:
    print("Churn: Yes")

# Analysis

In [None]:
# Read the CSV again into a separate, unmodified frame and preview it.
org_df = pd.read_csv(filepath_or_buffer="WA_Fn-UseC_-Telco-Customer-Churn.csv")
org_df.head()

 ['Contract_Month-to-month', 'TechSupport_No', 'PaymentMethod_Electronic check', 'InternetService_Fiber optic',
  'PaperlessBilling_Yes', 'SeniorCitizen', 'StreamingMovies_No', 'StreamingTV_No', 'MonthlyCharges']

In [None]:
# One-hot encode df2 for the analysis below, list the resulting columns and preview.
new_odf = pd.get_dummies(data=df2)
print(new_odf.columns)
new_odf.head()

In [None]:
# For each selected indicator column, report how many rows have the value 1.
fnamess = ['Contract_Month-to-month', 'TechSupport_No', 'PaymentMethod_Electronic check', 'InternetService_Fiber optic', 'PaperlessBilling_Yes', 'SeniorCitizen', 'StreamingMovies_No', 'StreamingTV_No']
for fnames in fnamess:
    fnames = fnames.strip()
    # Columns that are absent from the encoded frame are skipped silently.
    if fnames not in new_odf.columns:
        continue
    # Count by comparison rather than value_counts()[1]: the bracket lookup is
    # label-or-position dependent and can return the wrong bucket when the
    # column holds booleans instead of integers.
    no_occurrences = int((new_odf[fnames] == 1).sum())
    if no_occurrences > 0:
        print("Number of", fnames, "are:", no_occurrences)
        print()
    else:
        print("try again")



In [None]:
# Churn rate among the rows where each selected indicator column equals 1.
fnamess = ['Contract_Month-to-month', 'TechSupport_No', 'PaymentMethod_Electronic check', 'InternetService_Fiber optic', 'PaperlessBilling_Yes', 'SeniorCitizen', 'StreamingMovies_No', 'StreamingTV_No']
churn_label = "Churn"

churn_rates = []
# Labels are collected alongside the rates so the two stay aligned even when a
# feature is missing from the frame and gets skipped.
plotted_features = []

for fnames in fnamess:
    fnames = fnames.strip()
    if fnames in new_odf.columns:
        churn_rate = new_odf.loc[new_odf[fnames] == 1, churn_label].mean()
        churn_rates.append(churn_rate)
        plotted_features.append(fnames)

# Create the bar chart
plt.bar(plotted_features, churn_rates)
plt.xlabel('Features')
plt.ylabel('Churn Rate')
plt.title('Churn Rate for Different Features')

# Rotate x-axis labels so the long feature names stay readable
plt.xticks(rotation=90)

# Display the chart
plt.show()