# End-to-End Data Analysis: MTN Churn Data 2025

In [1]:
#Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import statsmodels.api as sm
from statsmodels.formula.api import ols
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [2]:
# Read the customer churn CSV into a DataFrame
mtndata = pd.read_csv(filepath_or_buffer='mtn_customer_churn.csv')

# Show the (row count, column count) pair to confirm what was loaded
print("MTN Customer Churn:", (len(mtndata.index), len(mtndata.columns)))

MTN Customer Churn: (974, 17)


# Dataset Overview

In [3]:
# MTN customer data overview

# info() reports each column's dtype and non-null count, which is how data
# types and missing values are checked here. It writes the report to stdout
# itself and returns None, so it is called directly: wrapping it in print()
# would add a stray "None" line after the report.
print("\nMTN Customers Info")
mtndata.info()


MTN Customers Info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 974 entries, 0 to 973
Data columns (total 17 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Customer ID                974 non-null    object 
 1   Full Name                  974 non-null    object 
 2   Date of Purchase           974 non-null    object 
 3   Age                        974 non-null    int64  
 4   State                      974 non-null    object 
 5   MTN Device                 974 non-null    object 
 6   Gender                     974 non-null    object 
 7   Satisfaction Rate          974 non-null    float64
 8   Customer Review            974 non-null    object 
 9   Customer Tenure in months  974 non-null    int64  
 10  Subscription Plan          974 non-null    object 
 11  Unit Price                 974 non-null    float64
 12  Number of Times Purchased  974 non-null    int64  
 13  Total Revenue              974

In [4]:
# Summary statistics for every column; include='all' adds the non-numeric
# columns (count / unique / top / freq) alongside the numeric ones.
print(
    "\nMTN Customers Summary:",
    mtndata.describe(include='all'),
    sep="\n",
)


MTN Customers Summary:
       Customer ID      Full Name Date of Purchase         Age State  \
count          974            974              974  974.000000   974   
unique         496            484                3         NaN    35   
top       CUST0003  Halima Walker       01/02/2025         NaN  Osun   
freq             3              5              450         NaN    43   
mean           NaN            NaN              NaN   48.043121   NaN   
std            NaN            NaN              NaN   17.764307   NaN   
min            NaN            NaN              NaN   16.000000   NaN   
25%            NaN            NaN              NaN   32.000000   NaN   
50%            NaN            NaN              NaN   49.000000   NaN   
75%            NaN            NaN              NaN   63.750000   NaN   
max            NaN            NaN              NaN   80.000000   NaN   

             MTN Device  Gender  Satisfaction Rate Customer Review  \
count               974     974         9

# Cleaning & Transformation

In [5]:
# Columns that are expected to hold numeric values
numeric_cols = [
    'Unit Price',
    'Total Revenue',
    'Age',
    'Satisfaction Rate',
    'Customer Tenure in months',
    'Number of Times Purchased',
    'Data Usage',
]

# Convert all listed columns column-by-column; with errors='coerce' any entry
# that cannot be parsed as a number becomes NaN instead of raising.
mtndata[numeric_cols] = mtndata[numeric_cols].apply(pd.to_numeric, errors='coerce')

# Per-column NaN counts after the conversion
print(mtndata[numeric_cols].isna().sum())

Unit Price                   0
Total Revenue                0
Age                          0
Satisfaction Rate            0
Customer Tenure in months    0
Number of Times Purchased    0
Data Usage                   0
dtype: int64


In [6]:
# Check for repeated Customer IDs.
# keep=False flags every row whose Customer ID occurs more than once (all
# occurrences, not just the later ones), so `duplicates` holds ROWS.
duplicates = mtndata[mtndata.duplicated(subset=['Customer ID'], keep=False)]

# len(duplicates) is a row count, not a count of IDs, so it is labelled as
# such; the number of distinct IDs involved is reported separately.
print(f"Rows sharing a Customer ID with another row: {len(duplicates)}")
print(f"Distinct Customer IDs appearing more than once: {duplicates['Customer ID'].nunique()}")

# Preview a few of the affected rows
print(duplicates[['Customer ID', 'MTN Device', 'Subscription Plan']].head())



Number of duplicate Customer IDs: 805
  Customer ID           MTN Device            Subscription Plan
2    CUST0003  5G Broadband Router  150GB FUP Monthly Unlimited
3    CUST0003      Mobile SIM Card       1GB+1.5mins Daily Plan
4    CUST0003       Broadband MiFi  30GB Monthly Broadband Plan
7    CUST0006      Mobile SIM Card             7GB Monthly Plan
8    CUST0006  5G Broadband Router  1.5TB Yearly Broadband Plan


In [11]:
# Churn rate by state: within each state, compute the share of every churn
# status, pivot the statuses into columns, and use 0 where a status does not
# occur in a state.
churn_by_state = (
    mtndata
    .groupby('State')['Customer Churn Status']
    .value_counts(normalize=True)
    .unstack()
    .fillna(0)
)

# The 'Yes' share expressed as a percentage
churn_by_state['Churn Rate'] = 100 * churn_by_state['Yes']

# Print the states ordered from highest to lowest churn rate
print(churn_by_state.sort_values(by='Churn Rate', ascending=False)[['Churn Rate']])



Customer Churn Status  Churn Rate
State                            
Adamawa                 61.111111
Imo                     51.724138
Akwa Ibom               50.000000
Kebbi                   48.275862
Benue                   46.428571
Niger                   42.307692
Kwara                   41.176471
Yobe                    38.235294
Anambra                 37.931034
Abuja (FCT)             35.714286
Edo                     33.333333
Gombe                   32.142857
Jigawa                  32.142857
Enugu                   31.250000
Kano                    30.434783
Oyo                     30.303030
Cross River             29.032258
Lagos                   28.571429
Rivers                  27.272727
Bayelsa                 25.000000
Katsina                 24.137931
Kogi                    23.529412
Nasarawa                23.076923
Abia                    22.857143
Zamfara                 21.428571
Ekiti                   21.212121
Taraba                  20.833333
Sokoto        

In [None]:
# Add a per-customer churn indicator to the original dataframe: 100 where
# 'Customer Churn Status' is 'Yes', 0 for every other row. On this 0/100 scale
# the mean of the column over any group of rows is that group's churn rate in
# percent. A single vectorized assignment replaces the former
# initialise-then-overwrite sequence and yields the same integer values.
mtndata['Churn Rate'] = mtndata['Customer Churn Status'].eq('Yes').astype('int64') * 100

# info() prints its report itself and returns None, so it is called directly;
# wrapping it in print() would also print "None".
mtndata.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 974 entries, 0 to 973
Data columns (total 18 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Customer ID                974 non-null    object 
 1   Full Name                  974 non-null    object 
 2   Date of Purchase           974 non-null    object 
 3   Age                        974 non-null    int64  
 4   State                      974 non-null    object 
 5   MTN Device                 974 non-null    object 
 6   Gender                     974 non-null    object 
 7   Satisfaction Rate          974 non-null    float64
 8   Customer Review            974 non-null    object 
 9   Customer Tenure in months  974 non-null    int64  
 10  Subscription Plan          974 non-null    object 
 11  Unit Price                 974 non-null    float64
 12  Number of Times Purchased  974 non-null    int64  
 13  Total Revenue              974 non-null    float64

In [15]:
# Fill missing data in Reasons for Churn

# Replace missing values in 'Reasons for Churn' with 'Unknown'. The filled
# column is assigned back explicitly: calling fillna(inplace=True) on a column
# selected from the frame is chained assignment, which pandas warns about and
# which does not update mtndata when Copy-on-Write is in effect.
mtndata['Reasons for Churn'] = mtndata['Reasons for Churn'].fillna('Unknown')

# info() prints its report itself and returns None, so it is called directly;
# wrapping it in print() would also print "None".
mtndata.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 974 entries, 0 to 973
Data columns (total 18 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Customer ID                974 non-null    object 
 1   Full Name                  974 non-null    object 
 2   Date of Purchase           974 non-null    object 
 3   Age                        974 non-null    int64  
 4   State                      974 non-null    object 
 5   MTN Device                 974 non-null    object 
 6   Gender                     974 non-null    object 
 7   Satisfaction Rate          974 non-null    float64
 8   Customer Review            974 non-null    object 
 9   Customer Tenure in months  974 non-null    int64  
 10  Subscription Plan          974 non-null    object 
 11  Unit Price                 974 non-null    float64
 12  Number of Times Purchased  974 non-null    int64  
 13  Total Revenue              974 non-null    float64

In [16]:
# Write the updated DataFrame out as CSV.
# index=False keeps the row index out of the written file.
mtndata.to_csv(path_or_buf='modified_mtn_data.csv', index=False)
