In [19]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
import seaborn as sn
import matplotlib.pyplot as plt

In [20]:
# Read the cleaned dataset from disk and preview its first five rows.
data = pd.read_csv(filepath_or_buffer="cleaned_data.csv")
data.head(5)

Unnamed: 0,is_tv_subscriber,is_movie_package_subscriber,subscription_age,bill_avg,remaining_contract,is_contract,service_failure_count,download_avg,upload_avg,download_over_limit,churn
0,1,0,11.95,25,0.14,1,0,8.4,2.3,0,0
1,0,0,8.22,0,0.0,0,0,0.0,0.0,0,1
2,1,0,8.91,16,0.0,1,0,13.7,0.9,0,1
3,0,0,6.87,21,0.0,0,1,0.0,0.0,0,1
4,0,0,6.39,0,0.0,0,0,0.0,0.0,0,1


The data is imbalanced, so we have to balance it. I will use a method that falls between undersampling and oversampling.

In [21]:
# Histogram of the churn column, with one colour per churn value.
fig = px.histogram(
    data,
    x='churn',
    color='churn',
    barmode='overlay',
    title='churn stats',
)
fig.update_layout(bargap=0.2)  # leave a gap between neighbouring bars
fig.show()

In [22]:
# Cross-tabulate customers: rows are is_tv_subscriber, columns are
# is_movie_package_subscriber. Reindexing both axes to [0, 1] guarantees that
# every combination exists, so a combination with no customers shows up as 0
# instead of raising a KeyError when it is looked up below.
counts = (
    data.groupby("is_tv_subscriber")["is_movie_package_subscriber"]
    .value_counts()
    .unstack()
    .reindex(index=[0, 1], columns=[0, 1])
    .fillna(0)
)

# Both lookup tables are keyed by (is_tv_subscriber, is_movie_package_subscriber).
colors = {
    (0, 0): 'blue',    # Not TV Subscriber, No Movie Package
    (0, 1): 'yellow',  # Not TV Subscriber, Movie Package
    (1, 0): 'red',     # TV Subscriber, No Movie Package
    (1, 1): 'green'    # TV Subscriber, Movie Package
}

names = {
    (0, 0): "No tv subscription, No Movie package",
    (0, 1): "No tv subscription, Movie package",
    (1, 0): "Tv subscription, No Movie package",
    (1, 1): "Tv subscription, Movie package"
}

# Order in which the bars are drawn: neither, TV only, both, movie only.
bar_order = [(0, 0), (1, 0), (1, 1), (0, 1)]

fig = go.Figure(data=[
    go.Bar(
        name=names[(tv, movie)],
        # .loc[row, column] == .loc[tv, movie], matching the key order above.
        y=[counts.loc[tv, movie]],
        marker_color=colors[(tv, movie)],
    )
    for tv, movie in bar_order
])
fig.update_layout(barmode='group', title='Count of Customers by TV and Movie Package Subscription')
# Every bar sits at the same implicit x position, so the tick label carries no
# information; the legend identifies the groups. tickmode='array' without
# tickvals is ignored by plotly, so hide the labels explicitly instead.
fig.update_xaxes(title='Subscription Type', showticklabels=False)
fig.update_yaxes(title='Count')
fig.show()

In [23]:


# Box plot of subscription_age for each churn value; boxmean=True also marks the mean.
fig = go.Figure()
fig.add_trace(
    go.Box(
        x=data["churn"],
        y=data["subscription_age"],
        boxmean=True,
    )
)
fig.update_layout(
    title='Subscription Age by Churn Status',
    xaxis_title='Churn Status',
    yaxis_title='Subscription Age',
)
fig.show()

In [24]:
# Box plot of subscription_age for each is_tv_subscriber value;
# boxmean=True also marks the mean inside each box.
fig = go.Figure(data=go.Box(x=data["is_tv_subscriber"], y=data["subscription_age"], boxmean=True))
# Labels fixed: the axis title had a typo ('subsription') and the plot title
# now follows the "<metric> by <group>" wording used by the other box plots.
fig.update_layout(title='Subscription Age by TV Subscription Status',
                  xaxis_title='TV Subscription Status', yaxis_title='Subscription Age')
fig.show()



In [25]:
# Box plot of bill_avg for each churn value; boxmean=True also marks the mean.
fig = go.Figure(data=go.Box(x=data["churn"], y=data["bill_avg"], boxmean=True))
# The title previously read 'Subscription Age by Churn Status' (copied from
# the subscription-age plot) even though the y axis shows bill_avg.
fig.update_layout(title='Average Bill by Churn Status',
                  xaxis_title='Churn Status', yaxis_title='bill_avg')
fig.show()

In [26]:
# Single box plot of the bill_avg column over all rows; boxmean=True also marks the mean.
fig = go.Figure()
fig.add_trace(go.Box(y=data["bill_avg"], boxmean=True))
fig.update_layout(
    title='bill_avg',
    xaxis_title='bill avg',
    yaxis_title='Amount',
)
fig.show()

In [27]:
# Pairwise correlation between the columns of `data`.
# "Unnamed: 0" appears to be the row index that was saved into the CSV rather
# than a real feature, so it is left out of the matrix. errors="ignore" keeps
# this cell working if that column is not present.
corr = data.drop(columns=["Unnamed: 0"], errors="ignore").corr()

fig = go.Figure(data=go.Heatmap(
                   z=corr.values,
                   # In a Heatmap, z[i][j] is drawn at (x[j], y[i]): columns of
                   # the matrix label the x axis and rows label the y axis.
                   x=corr.columns.values,
                   y=corr.index.values,
                   colorscale='Viridis',
                   colorbar=dict(title='Correlation')))
fig.update_layout(title='Correlation Heatmap')
fig.show()