# Imports

In [None]:
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

# Loading Data

In [None]:

# Absolute Windows path of the training CSV.
train_csv_path = "C:\\Users\\Kagero\\PycharmProjects\\Customer Churn\\Customer Churn data\\customer_churn_dataset-training-master.csv"
df = pd.read_csv(train_csv_path)

# Preview the first rows as a quick sanity check of the load.
print(df.head())


In [None]:
# Remove the identifier column from the training frame, then preview the result.
df = df.drop(columns='CustomerID')
print(df.head())

In [None]:
# Count the missing entries per column before any rows are removed.
missing_values = df.isnull().sum()
print(missing_values)

# Draw the heatmap while the missing entries are still present; plotting after
# dropna() would always produce an empty map.
plt.figure(figsize=(8, 6))
sns.heatmap(df.isnull(), cbar=False, cmap='viridis', yticklabels=False)
plt.title('Heatmap of Missing Values')
plt.show()

# Discard every row that contains at least one missing value.
df = df.dropna()


In [None]:
# Pairwise correlations between the numeric columns only.
correlation_matrix = df.corr(numeric_only=True)

# Render the matrix as an annotated heatmap (two decimals per cell).
corr_fig, corr_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=corr_ax)
corr_ax.set_title('Feature Correlation Matrix')
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# One histogram (with KDE) per numeric column, annotated with its mean and mode.
numeric_columns = df.select_dtypes(include='number').columns
for col in numeric_columns:
    series = df[col]
    mean_val = series.mean()
    mode_val = series.mode()[0]  # mode() returns a Series; keep its first entry

    fig, ax = plt.subplots(figsize=(8, 5))
    sns.histplot(series, kde=True, color='skyblue', ax=ax)

    # Vertical markers: dashed red for the mean, solid green for the mode.
    ax.axvline(mean_val, color='red', linestyle='--', label=f'Mean: {mean_val:.2f}')
    ax.axvline(mode_val, color='green', linestyle='-', label=f'Mode: {mode_val}')

    ax.set_title(f'Distribution of Feature: {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
    ax.legend()
    plt.show()

# Data preprocessing

In [None]:
# Partition the training columns by dtype: numeric vs. everything else.
numeric_df = df.select_dtypes(include=['number'])
non_numeric_df = df.select_dtypes(exclude=['number'])

# Show the non-numeric part first, then the numeric part.
for part in (non_numeric_df, numeric_df):
    print(part)

In [None]:
# One-hot encode every non-numeric column except the first one.
non_numeric_arr = non_numeric_df.to_numpy()
categorical_part = non_numeric_arr[:, 1:]

encoder_multi = OneHotEncoder(sparse_output=False)
one_hot_encoded_multi = encoder_multi.fit_transform(categorical_part)

print(one_hot_encoded_multi)
print(one_hot_encoded_multi.shape)

# Resolve the generated column names, labelling the two encoded inputs explicitly.
encoded_inputs = ['Subscription Type', 'Contract Length']
feature_names = encoder_multi.get_feature_names_out(encoded_inputs)
print("Feature Names:", feature_names)

In [None]:
# Turn the numeric frame into an array; the last column is used as the target.
data = numeric_df.to_numpy()
Y = data[:, -1]

# Centre each feature column on its own mean.
features = data[:, :-1]
features = features - features.mean(axis=0)

# Only columns 1, 3 and 4 of the centred numeric features are kept; the
# one-hot encoded columns are not appended.
X = features[:, [1, 3, 4]]

print(X.shape)
print(Y.shape)

# NaN counts for the features and the target.
print(np.isnan(X).sum())
print(np.isnan(Y).sum())

In [None]:
# Hold out 33% of the rows for validation; the seed is fixed so the split is reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    Y,
    random_state=42,
    test_size=0.33,
)

In [None]:
# Report the shapes of the four arrays produced by the split.
print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)

# Print one NaN count per array, in the order X_train, X_val, y_train, y_val.
# A count is used for every array so the training arrays are summarised the
# same way as the validation arrays instead of being dumped as boolean masks.
print(np.sum(np.isnan(X_train)))
print(np.sum(np.isnan(X_val)))
print(np.sum(np.isnan(y_train)))
print(np.sum(np.isnan(y_val)))

# Model

In [None]:
# L2-penalised logistic regression, fitted on the training split.
logReg_clf = LogisticRegression(
    penalty="l2",
    solver="newton-cholesky",
    max_iter=300,
    random_state=42,
)
logReg_clf.fit(X_train, y_train)

# Validation

In [None]:
# Score the logistic regression on the validation split.
res = logReg_clf.predict(X_val)

val_confusion = confusion_matrix(y_val, res)
val_report = classification_report(y_val, res)
print(val_confusion)
print(val_report)

In [None]:
# Hyper-parameters for the gradient-boosted classifier, collected in one place.
xgb_params = {
    'device': 'cpu',
    'n_estimators': 200,
    'learning_rate': 0.5,
    'objective': 'binary:logistic',
    'reg_lambda': 0.5,
}
model_XG = XGBClassifier(**xgb_params)

# Fit on the same training split used for the logistic regression.
model_XG.fit(X_train, y_train)

In [None]:
# Score the XGBoost model on the validation split.
XG_res = model_XG.predict(X_val)

xg_val_confusion = confusion_matrix(y_val, XG_res)
xg_val_report = classification_report(y_val, XG_res)
print(xg_val_confusion)
print(xg_val_report)

# Testing

In [None]:
# Absolute Windows path of the testing CSV.
test_csv_path = "C:\\Users\\Kagero\\PycharmProjects\\Customer Churn\\Customer Churn data\\customer_churn_dataset-testing-master.csv"
df_test = pd.read_csv(test_csv_path)

# Preview the first 5 rows as a quick sanity check of the load.
print(df_test.head())

In [None]:
# Remove the identifier column from the test frame, then preview the result.
df_test = df_test.drop(columns='CustomerID')
print(df_test.head())

# Per-column missing-value counts of the test frame (rebinds `missing_values`).
missing_values = df_test.isnull().sum()

In [None]:
# Count the missing entries per test column before any rows are removed.
missing_values_t = df_test.isnull().sum()
print(missing_values_t)

# Draw the heatmap while the missing entries are still present; plotting after
# dropna() would always produce an empty map.
plt.figure(figsize=(8, 6))
sns.heatmap(df_test.isnull(), cbar=False, cmap='viridis', yticklabels=False)
plt.title('Heatmap of Missing Values')
plt.show()

# Discard every test row that contains at least one missing value.
df_test = df_test.dropna()

In [None]:

# Pairwise correlations between the numeric test columns only.
correlation_matrix_t = df_test.corr(numeric_only=True)

# Render the matrix as an annotated heatmap (two decimals per cell).
corr_fig_t, corr_ax_t = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix_t, annot=True, cmap='coolwarm', fmt=".2f", ax=corr_ax_t)
corr_ax_t.set_title('Feature Correlation Matrix')
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# One histogram (with KDE) per numeric test column, annotated with its mean and mode.
for col in df_test.select_dtypes(include='number').columns:
    # Summary statistics of the test column.
    mean_val = df_test[col].mean()
    mode_val = df_test[col].mode()[0]  # mode() returns a Series, take the first value

    plt.figure(figsize=(8, 5))
    # Plot the test column itself, so the histogram and the mean/mode markers
    # describe the same data.
    sns.histplot(df_test[col], kde=True, color='skyblue')

    # Vertical markers: dashed red for the mean, solid green for the mode.
    plt.axvline(mean_val, color='red', linestyle='--', label=f'Mean: {mean_val:.2f}')
    plt.axvline(mode_val, color='green', linestyle='-', label=f'Mode: {mode_val}')

    plt.title(f'Distribution of Feature: {col}')
    plt.xlabel(col)
    plt.ylabel('Frequency')
    plt.legend()
    plt.show()

In [None]:
# Partition the test columns by dtype: numeric vs. everything else.
numeric_df_test = df_test.select_dtypes(include=['number'])
non_numeric_df_test = df_test.select_dtypes(exclude=['number'])

# Show the non-numeric part first, then the numeric part.
for part_t in (non_numeric_df_test, numeric_df_test):
    print(part_t)

In [None]:
# One-hot encode every non-numeric test column except the first one.
non_numeric_arr_t = np.array(non_numeric_df_test)

# Reuse the encoder that was fitted on the training data instead of fitting a
# new one, so the encoded test columns follow the training column layout.
# With the encoder's default settings, transform() raises if the test data
# contains a category that was not present when the encoder was fitted.
one_hot_encoded_multi_t = encoder_multi.transform(non_numeric_arr_t[:, 1:])

print(one_hot_encoded_multi_t)
print(one_hot_encoded_multi_t.shape)

# To understand the columns, get the feature names
feature_names = encoder_multi.get_feature_names_out(['Subscription Type', 'Contract Length'])
print("Feature Names:", feature_names)

In [None]:
# Turn the numeric test frame into an array; the last column is used as the target.
data_test = np.array(numeric_df_test)
X_test = data_test[:, 0:-1]

# Centre the test features with the TRAINING column means (`data` is the numeric
# training array built earlier in this file). The models were fitted on features
# shifted by those means, so the test set must receive the same shift.
# Assumes the test frame has the same numeric columns, in the same order, as
# the training frame -- TODO confirm.
X_test = X_test - data[:, 0:-1].mean(axis=0)

# Keep the same three feature columns that were selected for training.
X_test = X_test[:, np.array([1, 3, 4])]
Y_test = data_test[:, -1]

# Residual column means: how far the test features sit from the training means.
print(X_test.mean(axis=0))
print(X_test.shape)
print(Y_test.shape)

# NaN counts for the test features and the test target.
print(np.sum(np.isnan(X_test)))
print(np.sum(np.isnan(Y_test)))

In [None]:
# Score the logistic regression on the test set.
res_test = logReg_clf.predict(X_test)

test_confusion = confusion_matrix(Y_test, res_test)
test_report = classification_report(Y_test, res_test)
print(test_confusion)
print(test_report)

In [None]:
# Score the XGBoost model on the test set.
XG_res_test = model_XG.predict(X_test)

xg_test_confusion = confusion_matrix(Y_test, XG_res_test)
xg_test_report = classification_report(Y_test, XG_res_test)
print(xg_test_confusion)
print(xg_test_report)