### Customer Churning Project


In [None]:
# Importing relevant libraries for plotting and visualisation
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns 
import plotly.express as px
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.compose import ColumnTransformer




# Data Exploration

In [None]:
data = pd.read_csv("C:\\Users\\USER\\phase3\\phase3\\phase3\\Data Sets\\bigml_59c28831336c6604c800002a.csv")
data


In [None]:
# inspect the data columns
data.head()

In [None]:
# summary of the dataset structure and characteristics
data.info()

In [None]:
# the structure of the data set
data.shape

In [None]:
# inspect the statistical summary of the dataset
data.describe()

In [None]:
# Understand the dataset variables
data.columns

In [None]:
# Check Unique Values for each variable.
unique_values_per_col = data.nunique()
print("Number of Unique Values per Column:")
print(unique_values_per_col)

In [None]:
# check unique values in the dataset
for column in data.columns:
    unique_values = data[column].unique()
    print(f"Unique values in column '{column}': {unique_values}")

In [None]:
# Check for missing values in the entire the dataset
missing_values_total = data.isna().sum()

# Check for missing values in each column
missing_values_per_column = data.isna().sum(axis=0)

# Check for missing values in each row
missing_values_per_row = data.isna().sum(axis=1)

print("Missing values in the entire dataset:")
print(missing_values_total)

print("\nMissing values in each column:")
print(missing_values_per_column)

print("\nMissing values in each row:")
print(missing_values_per_row)


In [None]:
#  distribution of variables in the dataset
# Columns in the dataset
numerical_columns = data.select_dtypes(include=['int64', 'float64']).columns

# Plot histograms for numerical variables
plt.figure(figsize=(12, 8))
for i, column in enumerate(numerical_columns, 1):
    plt.subplot(4, 4, i)
    sns.histplot(data[column], kde=True, color='skyblue', bins=20)
    plt.title(column)
    plt.xlabel('')
plt.tight_layout()
plt.show()

In [None]:
# Drop unnecessary columns
data = data.drop(columns=["state", "phone number"])
data


In [None]:
# Scatter plots for numerical variables vs. churn
plt.figure(figsize=(12, 8))
for i, column in enumerate(numerical_columns, 1):
    plt.subplot(4, 4, i)
    sns.scatterplot(x=column, y='churn', data=data, color='skyblue', alpha=0.5)
    plt.title(column)
plt.tight_layout()
plt.show()

In [None]:
#data = pd.get_dummies(data, columns=[ "international plan", "voice mail plan"], drop_first=True)

## Convert categorical to numerical data

Using data label encoder

In [None]:

# use labe encoding to transform categorical data
label_encoder = LabelEncoder()

data['international plan'] = label_encoder.fit_transform(data['international plan'])

# Label encoding 'voice mail plan'
data['voice mail plan'] = label_encoder.fit_transform(data['voice mail plan'])

# decode international plan
decoded_international_plan = label_encoder.inverse_transform(data['international plan'])

# decode voice mail plan
decoded_voice_mail_plan = label_encoder.inverse_transform(data['voice mail plan'])

## Correlation of variables in the dataset

In [None]:
# correlation of the dataset
data.corr()

## Heatmap Visuals

In [None]:
# Create a heatmap to visualise the correlation of the variables
plt.figure(figsize=(12, 8))
corr_matrix = data.corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Heatmap')
plt.show()

## Scatterplot 

In [None]:
# Scatterplot: Total Day Minutes vs. Total Day Charge with hue of churn
plt.figure(figsize=(8, 6))
sns.scatterplot(x='total day minutes', y='total day charge', hue='churn', data=data)
plt.title('Total Day Minutes vs. Total Day Charge (Colored by Churn)')
plt.xlabel('Total Day Minutes')
plt.ylabel('Total Day Charge')
plt.show()

# Scatterplot: Total Eve Minutes vs. Total Eve Charge with hue of churn
plt.figure(figsize=(8, 6))
sns.scatterplot(x='total eve minutes', y='total eve charge', hue='churn', data=data)
plt.title('Total Eve Minutes vs. Total Eve Charge (Colored by Churn)')
plt.xlabel('Total Eve Minutes')
plt.ylabel('Total Eve Charge')
plt.show()

# Scatterplot: Total Night Minutes vs. Total Night Charge with hue of churn
plt.figure(figsize=(8, 6))
sns.scatterplot(x='total night minutes', y='total night charge', hue='churn', data=data)
plt.title('Total Night Minutes vs. Total Night Charge (Colored by Churn)')
plt.xlabel('Total Night Minutes')
plt.ylabel('Total Night Charge')
plt.show()

# Scatterplot: Total Intl Minutes vs. Total Intl Charge with hue of churn
plt.figure(figsize=(8, 6))
sns.scatterplot(x='total intl minutes', y='total intl charge', hue='churn', data=data)
plt.title('Total Intl Minutes vs. Total Intl Charge (Colored by Churn)')
plt.xlabel('Total Intl Minutes')
plt.ylabel('Total Intl Charge')
plt.show()

# Scatterplot: Customer Service Calls vs. Churn with hue of churn
plt.figure(figsize=(8, 6))
sns.scatterplot(x='customer service calls', y='churn', hue='churn', data=data)
plt.title('Customer Service Calls vs. Churn (Colored by Churn)')
plt.xlabel('Customer Service Calls')
plt.ylabel('Churn')
plt.show()


## Histogram Visualisation 

Check the distriution of variables with high correlation with churn

In [None]:
# distribution of total day charge and churn status
fig = px.histogram(data, x="total day charge", color="churn", 
                   title="Distribution of Total day Charge by Churn Status")
fig.show()

In [None]:
# distribution of total int charge verse vi churn
fig = px.histogram(data, x="total intl charge", color="churn", 
                   title="Distribution of Total Intl Charge by Churn Status")
fig.show()

In [None]:
# distribution of customer service calls and churn
fig = px.histogram(data, x="customer service calls", color="churn",
                   title="Distribution of customer service calls by Churn Status")
fig.show()

In [None]:
# distribution of total eve change and churn
fig = px.histogram(data, x="total eve charge", color="churn",
                   title="Distribution of total eve charge by Churn Status")
fig.show()

In [None]:
# distribution of international plan and churn
fig = px.histogram(data, x="international plan", color="churn",
                   title="Distribution of International Plan with Color Encoding by Churn Status",
                   labels={"international plan": "International Plan", "churn": "Churn Status"})
fig.update_xaxes(type='category')  # Ensure 'international plan' is treated as a categorical variable
fig.show()

In [None]:
data.info()

### Data Processing

In [None]:
data = pd.get_dummies(data, columns=[ "international plan", "voice mail plan", "churn"], drop_first=True)

In [None]:
for var in data.columns:
    print(var,'\n', data[var].value_counts()/len(data))

In [None]:
# Differentiate  continuous and categorical features
continuous_features = ['account length', 'total day minutes', 'total day calls', 
                       'total day charge', 'total eve minutes', 'total eve calls', 
                       'total eve charge', 'total night minutes', 'total night calls', 
                       'total night charge', 'total intl minutes', 'total intl calls', 
                       'total intl charge', 'customer service calls']

categorical_features = ['area code', 'number vmail messages', 'international plan_1', 'voice mail plan_1']
X_continuous = data[continuous_features]
X_categorical = data[categorical_features]

In [None]:
# Split data, train and test sets
X_train_cont, X_test_cont, X_train_cat, X_test_cat, y_train, y_test = train_test_split(X_continuous, X_categorical, y, test_size=0.2, random_state=42)


In [None]:
# Prepocess continous and categorical features
continuous_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(drop='first'))
])

In [None]:
# merge the preprocessors
preprocessor = ColumnTransformer(
    transformers=[
        ('cont', continuous_transformer, continuous_features),
        ('cat', categorical_transformer, categorical_features)
])


In [None]:
# random forest classifier
rf_classifier = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

In [1648]:
rf_classifier = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])


In [1663]:
import pandas as pd

# Get the column names from the original DataFrame
continuous_features = data.columns.tolist()  # Assuming df is your original DataFrame

# Convert numpy array to pandas DataFrame with specified column names
X_train_df = pd.DataFrame(X_train, columns=continuous_features)

# Plot histograms
for feat in continuous_features:    
    fig = X_train_df[feat].hist(bins=20)
    fig.set_ylabel('number of cases')
    fig.set_xlabel(feat)
    
    plt.show()


ValueError: Shape of passed values is (2666, 18), indices imply (2666, 19)

In [1662]:
data.shape

(3333, 19)

In [None]:
# Split features and target variable
X = data.drop(columns=["churn"])
y = data["churn"]

# Scale numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [None]:
# Convert target variable to numerical values
data['churn'] = data['churn'].astype(int)
data['churn']

In [None]:

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
