#### Imports

In [145]:
import pandas as pd
import plotly.express as px
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import ExtraTreesClassifier, BaggingClassifier
import warnings


#### Declaring constants

In [146]:
INPUT_PATH = './input/'
OUTPUT_PATH = './output/'

INPUT_FILENAME = 'raw.csv'

LABELS_DICT = {
    'call_failure': 'Call Failure',
    'complains': 'Complains',
    'subscription_length': 'Subscription Length',
    'charge_amount': 'Charge Amount',
    'seconds_of_use': 'Seconds of Use',
    'frequency_of_use': 'Frequency of Use',
    'frequency_of_sms': 'Frequency of SMS',
    'distinct_called_numbers': 'Distinct Called Numbers',
    'age_group': 'Age Group',
    'tariff_plan': 'Tariff Plan',
    'status': 'Status',
    'age': 'Age',
    'customer_value': 'Customer Value',
    'churn': 'Churn'
}

px.defaults.template = 'plotly_dark'
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn")

#### Reading input data

In [147]:
df = pd.read_csv(INPUT_PATH + INPUT_FILENAME)

#### Previewing data attributes

In [148]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3150 entries, 0 to 3149
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Call  Failure            3150 non-null   int64  
 1   Complains                3150 non-null   int64  
 2   Subscription  Length     3150 non-null   int64  
 3   Charge  Amount           3150 non-null   int64  
 4   Seconds of Use           3150 non-null   int64  
 5   Frequency of use         3150 non-null   int64  
 6   Frequency of SMS         3150 non-null   int64  
 7   Distinct Called Numbers  3150 non-null   int64  
 8   Age Group                3150 non-null   int64  
 9   Tariff Plan              3150 non-null   int64  
 10  Status                   3150 non-null   int64  
 11  Age                      3150 non-null   int64  
 12  Customer Value           3150 non-null   float64
 13  Churn                    3150 non-null   int64  
dtypes: float64(1), int64(13)

In [149]:
df.head()

Unnamed: 0,Call Failure,Complains,Subscription Length,Charge Amount,Seconds of Use,Frequency of use,Frequency of SMS,Distinct Called Numbers,Age Group,Tariff Plan,Status,Age,Customer Value,Churn
0,8,0,38,0,4370,71,5,17,3,1,1,30,197.64,0
1,0,0,39,0,318,5,7,4,2,1,2,25,46.035,0
2,10,0,37,0,2453,60,359,24,3,1,1,30,1536.52,0
3,10,0,38,0,4198,66,1,35,1,1,1,15,240.02,0
4,3,0,38,0,2393,58,2,33,1,1,1,15,145.805,0


In [150]:
df.isna().sum()

Call  Failure              0
Complains                  0
Subscription  Length       0
Charge  Amount             0
Seconds of Use             0
Frequency of use           0
Frequency of SMS           0
Distinct Called Numbers    0
Age Group                  0
Tariff Plan                0
Status                     0
Age                        0
Customer Value             0
Churn                      0
dtype: int64

#### Profiling data

In [151]:
# from ydata_profiling import ProfileReport


# profile = ProfileReport(df, title="Profiling Report")

# profile.to_file(f'{OUTPUT_PATH}profiling_report.html')

#### Renaming columns to snake case

In [152]:
df.columns = df.columns.str.lower().str.replace(' ', '_')
df.columns = df.columns.str.replace('__', '_')

df.head()

Unnamed: 0,call_failure,complains,subscription_length,charge_amount,seconds_of_use,frequency_of_use,frequency_of_sms,distinct_called_numbers,age_group,tariff_plan,status,age,customer_value,churn
0,8,0,38,0,4370,71,5,17,3,1,1,30,197.64,0
1,0,0,39,0,318,5,7,4,2,1,2,25,46.035,0
2,10,0,37,0,2453,60,359,24,3,1,1,30,1536.52,0
3,10,0,38,0,4198,66,1,35,1,1,1,15,240.02,0
4,3,0,38,0,2393,58,2,33,1,1,1,15,145.805,0


#### Dropping duplicates

In [153]:
print(df.duplicated().value_counts())

df.drop_duplicates(inplace=True)

False    2850
True      300
dtype: int64


In [154]:
print(df.columns)
df.info()

Index(['call_failure', 'complains', 'subscription_length', 'charge_amount',
       'seconds_of_use', 'frequency_of_use', 'frequency_of_sms',
       'distinct_called_numbers', 'age_group', 'tariff_plan', 'status', 'age',
       'customer_value', 'churn'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2850 entries, 0 to 3131
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   call_failure             2850 non-null   int64  
 1   complains                2850 non-null   int64  
 2   subscription_length      2850 non-null   int64  
 3   charge_amount            2850 non-null   int64  
 4   seconds_of_use           2850 non-null   int64  
 5   frequency_of_use         2850 non-null   int64  
 6   frequency_of_sms         2850 non-null   int64  
 7   distinct_called_numbers  2850 non-null   int64  
 8   age_group                2850 non-null   int64  
 9   tariff_plan      

#### Visualizing data

In [155]:

fig = px.scatter(df, 
                 x='subscription_length', 
                 y='charge_amount', 
                 color='churn', 
                 size='charge_amount',
                 hover_data=['subscription_length'],
                 title='Charge Amount Over Subscription Length with Churn',
                 labels=LABELS_DICT)
fig.show()

fig = px.scatter(df,
                x='seconds_of_use',
                y='frequency_of_use',
                color='churn',
                labels=LABELS_DICT,
                title='Usage vs Frequency')
fig.show()

fig = px.histogram(df,
                    x='customer_value',
                    color='churn',
                    labels=LABELS_DICT,
                    title='Customer value distribution')
fig.show()

fig = px.box(df, 
            x='age_group', 
            y='subscription_length', 
            labels=LABELS_DICT,
            points='all', 
            title='Subscription Length by Age Group')
fig.show()


#### K-Means Clustering

In [156]:
num_clusters = 3

cluster_df = df.copy()
numeric_columns = cluster_df.select_dtypes(include=['float64', 'int64'])

kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
cluster_df['cluster'] = kmeans.fit_predict(numeric_columns)

fig = px.scatter(cluster_df, 
                x='seconds_of_use', 
                y='frequency_of_use', 
                labels=LABELS_DICT,
                color='cluster', 
                title='K-Means Clustering')
fig.show()

In [157]:
correlation_matrix = df.corr()

fig = px.imshow(correlation_matrix,
                labels=LABELS_DICT,
                x=correlation_matrix.index,
                y=correlation_matrix.columns,
                height=600)

fig.show()

### Preprocessing data
##### Feature scaling (MinMaxScaler)

In [158]:
numerical_cols = ['call_failure', 'subscription_length', 'seconds_of_use', 'frequency_of_use', 'frequency_of_sms', 'distinct_called_numbers', 'age', 'customer_value']

df[numerical_cols] = MinMaxScaler().fit_transform(df[numerical_cols])

##### Categorical encoding (One hot)

In [159]:
categorical_cols = ['complains', 'tariff_plan', 'status']

df = pd.get_dummies(df, columns=categorical_cols)

In [160]:
df.head()

Unnamed: 0,call_failure,subscription_length,charge_amount,seconds_of_use,frequency_of_use,frequency_of_sms,distinct_called_numbers,age_group,age,customer_value,churn,complains_0,complains_1,tariff_plan_1,tariff_plan_2,status_1,status_2
0,0.222222,0.795455,0,0.255705,0.278431,0.009579,0.175258,3,0.375,0.091277,0,1,0,1,0,1,0
1,0.0,0.818182,0,0.018607,0.019608,0.01341,0.041237,2,0.25,0.021261,0,1,0,1,0,0,1
2,0.277778,0.772727,0,0.143534,0.235294,0.687739,0.247423,3,0.375,0.709617,0,1,0,1,0,1,0
3,0.277778,0.795455,0,0.245641,0.258824,0.001916,0.360825,1,0.0,0.110849,0,1,0,1,0,1,0
4,0.083333,0.795455,0,0.140023,0.227451,0.003831,0.340206,1,0.0,0.067338,0,1,0,1,0,1,0


##### Handling imbalanced classes (SMOTE)

In [161]:
pipeline = Pipeline([
    ('over', SMOTE(sampling_strategy=0.5)),
    ('under', RandomUnderSampler(sampling_strategy=1.0))
])

X_resampled, y_resampled = pipeline.fit_resample(df.drop('churn', axis=1), df['churn'])
df_resampled = pd.concat([X_resampled, y_resampled], axis=1)


In [162]:
df_resampled.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2404 entries, 46 to 3605
Data columns (total 17 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   call_failure             2404 non-null   float64
 1   subscription_length      2404 non-null   float64
 2   charge_amount            2404 non-null   int64  
 3   seconds_of_use           2404 non-null   float64
 4   frequency_of_use         2404 non-null   float64
 5   frequency_of_sms         2404 non-null   float64
 6   distinct_called_numbers  2404 non-null   float64
 7   age_group                2404 non-null   int64  
 8   age                      2404 non-null   float64
 9   customer_value           2404 non-null   float64
 10  complains_0              2404 non-null   uint8  
 11  complains_1              2404 non-null   uint8  
 12  tariff_plan_1            2404 non-null   uint8  
 13  tariff_plan_2            2404 non-null   uint8  
 14  status_1               

#### Spliting data

In [163]:
X = df_resampled.drop('churn', axis=1)
y = df_resampled['churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#### Defining Classifiers

In [164]:
classifiers = [
    {
        'name': 'SVM',
        'classifier': SVC(),
        'param_grid': {
            'classifier__C': [0.1, 1, 10], 
            'classifier__kernel': ['linear', 'rbf']
        },
    },
    {
        'name': 'RandomForest',
        'classifier': RandomForestClassifier(),
        'param_grid': {
            'classifier__n_estimators': [50, 100, 200], 
            'classifier__max_depth': [None, 10, 20]
        },
    },
    {
        'name': 'LogisticRegression',
        'classifier': LogisticRegression(),
        'param_grid': {
            'classifier__C': [0.1, 1, 10],
            'classifier__penalty': ['l2'],
            'classifier__max_iter': [1000],
        },
    },
    {
        'name': 'GradientBoosting',
        'classifier': GradientBoostingClassifier(),
        'param_grid': {
            'classifier__n_estimators': [50, 100, 200],
            'classifier__learning_rate': [0.01, 0.1, 0.2],
            'classifier__max_depth': [3, 5, 7],
        },
    },
    {
        'name': 'KNeighbors',
        'classifier': KNeighborsClassifier(),
        'param_grid': {
            'classifier__n_neighbors': [3, 5, 7],
            'classifier__weights': ['uniform', 'distance'],
        },
    },
    {
        'name': 'DecisionTree',
        'classifier': DecisionTreeClassifier(),
        'param_grid': {
            'classifier__criterion': ['gini', 'entropy'],
            'classifier__max_depth': [None, 5, 10],
        },
    },
    {
        'name': 'NaiveBayes',
        'classifier': GaussianNB(),
        'param_grid': {},
    },
    {
        'name': 'AdaBoost',
        'classifier': AdaBoostClassifier(),
        'param_grid': {
            'classifier__n_estimators': [50, 100, 200],
            'classifier__learning_rate': [0.01, 0.1, 0.2],
        },
    },
        {
        'name': 'ExtraTrees',
        'classifier': ExtraTreesClassifier(),
        'param_grid': {
            'classifier__n_estimators': [50, 100, 200],
            'classifier__max_depth': [None, 10, 20],
        },
    },
    {
        'name': 'Bagging',
        'classifier': BaggingClassifier(),
        'param_grid': {
            'classifier__n_estimators': [50, 100, 200],
        },
    },
    {
        'name': 'NuSVC',
        'classifier': NuSVC(),
        'param_grid': {
            'classifier__nu': [0.25, 0.5, 0.75],
            'classifier__kernel': ['linear', 'rbf'],
        },
    },
    {
        'name': 'RadiusNeighbors',
        'classifier': RadiusNeighborsClassifier(),
        'param_grid': {
            'classifier__radius': [1.0, 1.5, 2.0],
            'classifier__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
        },
    },
]

#### Model training and evaluation

In [165]:
best_overall_model_name = None
best_overall_accuracy = 0.0
results = []

with open(f'{OUTPUT_PATH}model_eval_results.txt', 'w') as file:
    for classifier in classifiers:
        file.write(f"Training and evaluating {classifier['name']}...\n")

        pipeline = Pipeline([
            ('classifier', classifier['classifier']),
        ])

        grid_search = GridSearchCV(pipeline, classifier['param_grid'], cv=5, scoring='accuracy')
        grid_search.fit(X_train, y_train)

        best_params = grid_search.best_params_
        best_model = grid_search.best_estimator_

        y_pred = best_model.predict(X_test)
        test_accuracy = accuracy_score(y_test, y_pred)

        file.write(f"{classifier['name']} Test Set Accuracy: {test_accuracy:.4f}\n")

        conf_matrix = confusion_matrix(y_test, y_pred)
        class_report = classification_report(y_test, y_pred)
        file.write(f"{classifier['name']} Confusion Matrix:\n{conf_matrix}\n")
        file.write(f"{classifier['name']} Classification Report:\n{class_report}\n")

        result = {
            'name': classifier['name'],
            'test_accuracy': test_accuracy,
            'best_params': best_params,
            'conf_matrix': conf_matrix,
            'class_report': class_report,
        }

        results.append(result)

        if test_accuracy > best_overall_accuracy:
            best_overall_model_name = classifier['name']
            best_overall_accuracy = test_accuracy

        print(f"{classifier['name']} Test Set Accuracy: {test_accuracy*100:.2f}%")

        file.write(f"Best hyperparameters: {best_params}\n")
        file.write("\n" + "="*50 + "\n")

file.close()

df_results = pd.DataFrame(results)
df_results = df_results.sort_values(by='test_accuracy', ascending=False)

fig = px.bar(
    df_results, 
    x='test_accuracy', 
    y='name', 
    orientation='h', 
    title='Test set accuracy of different classifiers',
    color='name',
    labels={'name': 'Classifier'}
)

fig.update_layout(xaxis_title='Test Set Accuracy', yaxis_title='Classifier')
fig.show()


print("\nOverall best model:")
print(f"Model: {best_overall_model_name}")
print(f"Test Set Accuracy: {best_overall_accuracy*100:.2f}%")

SVM Test Set Accuracy: 86.90%
RandomForest Test Set Accuracy: 96.47%
LogisticRegression Test Set Accuracy: 85.86%
GradientBoosting Test Set Accuracy: 95.84%
KNeighbors Test Set Accuracy: 95.43%
DecisionTree Test Set Accuracy: 94.59%
NaiveBayes Test Set Accuracy: 80.04%
AdaBoost Test Set Accuracy: 90.85%
ExtraTrees Test Set Accuracy: 97.51%
Bagging Test Set Accuracy: 95.43%
NuSVC Test Set Accuracy: 89.60%
RadiusNeighbors Test Set Accuracy: 76.09%



Overall best model:
Model: ExtraTrees
Test Set Accuracy: 97.51%


In [166]:

classifier = RandomForestClassifier(random_state=42)
classifier.fit(X_train, y_train)

feature_importances = classifier.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

fig = px.bar(
    feature_importance_df,
    x='Importance',
    y='Feature',
    orientation='h',
    title='Feature Importance',
    labels={'Importance': 'Importance Score', 'Feature': 'Feature'},
)

fig.show()
