## Import dependencies

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

## Load Data

### Read raw data

In [None]:
# Pull the raw customer-churn CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer="../data/WA_Fn-UseC_-Telco-Customer-Churn.csv")

# Preview the top five rows
df.head(n=5)

## Data Cleaning

### Check for missing data

In [None]:
# Count NaN/None entries per column. Blank strings are not counted here,
# e.g. the ' ' placeholder that the TotalCharges conversion below handles.
df.isna().sum(axis=0)

### Convert 'TotalCharges' to float

In [None]:
# A single-space string marks a blank TotalCharges entry: store it as 0.0
# and parse every other value as a float.
df['TotalCharges'] = df['TotalCharges'].map(
    lambda raw_value: float(raw_value) if raw_value != ' ' else 0.0
)

## Feature Engineering

### Binary Encode Features

In [None]:
# Encode binary features as 0/1: the label listed for a column becomes 0,
# every other value in that column becomes 1.
zero_label_by_column = {
    'gender': "Female",
    'Partner': "No",
    'Dependents': "No",
    'PhoneService': "No",
    'PaperlessBilling': "No",
    'Churn': "No",
}

for source_column, zero_label in zero_label_by_column.items():
    # Bind the label as a default argument so the lambda never sees a later loop value
    df[f'{source_column}_encoded'] = df[source_column].apply(
        lambda value, zero=zero_label: 0 if value == zero else 1
    )

### Label Encode Features

In [None]:
from sklearn.preprocessing import LabelEncoder
# Show every column when a DataFrame is displayed
pd.set_option("display.max_columns", None)

# Columns that receive a label-encoded `<name>_encoded` counterpart
features_to_label_encode = [
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'Contract',
    'PaymentMethod',
]

def apply_label_encoding(df, list_features):
    """Add a label-encoded ``<feature>_encoded`` column for each listed feature.

    Parameters
    ----------
    df : DataFrame holding the source columns; it is modified in place.
    list_features : iterable of column names in ``df`` to encode.

    Returns
    -------
    The same ``df`` object, now carrying the new ``*_encoded`` columns.
    """
    encoder = LabelEncoder()

    # One shared encoder; fit_transform is invoked once per column
    for source_name in list_features:
        encoded_name = f'{source_name}_encoded'
        df[encoded_name] = encoder.fit_transform(df[source_name])

    return df

# Adds the *_encoded columns to df in place; the returned frame is the cell output
apply_label_encoding(df=df, list_features=features_to_label_encode)

## Feature Selection 

In [None]:
# Copy the encoded columns plus SeniorCitizen, tenure, MonthlyCharges and
# TotalCharges into their own DataFrame
selected_columns = [
    'Churn_encoded',
    'gender_encoded',
    'SeniorCitizen',
    'Partner_encoded',
    'Dependents_encoded',
    'tenure',
    'PhoneService_encoded',
    'MultipleLines_encoded',
    'InternetService_encoded',
    'OnlineSecurity_encoded',
    'OnlineBackup_encoded',
    'DeviceProtection_encoded',
    'TechSupport_encoded',
    'StreamingTV_encoded',
    'StreamingMovies_encoded',
    'Contract_encoded',
    'PaperlessBilling_encoded',
    'PaymentMethod_encoded',
    'MonthlyCharges',
    'TotalCharges',
]
df_features = df[selected_columns].copy()

In [None]:
# Give the frame a fresh 0..n-1 index
df_features = df_features.reset_index(drop=True)

# Pairwise correlations between all selected columns
corr_matrix = df_features.corr()

plt.figure(figsize=(12, 10))
sns.heatmap(
    corr_matrix,
    cmap="Blues",
    annot=True,
    fmt=".2f",
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": 0.8},
)

# Title, axis labels and tick styling
plt.title("Heat map of correlation matrix", fontsize=13, weight="bold")
plt.xlabel('Feature', fontsize=10)
plt.ylabel('Feature', fontsize=10)
plt.xticks(fontsize=8, rotation=45, ha='right')
plt.yticks(fontsize=8)

plt.tight_layout()
plt.show()

In [None]:
# Features treated as less important; none of them appears in the final selection
less_important_features = ['gender_encoded', 'PhoneService_encoded', 'MultipleLines_encoded',
                           'InternetService_encoded', 'StreamingTV_encoded', 'StreamingMovies_encoded']

# Important features: Churn_encoded plus the features kept for the final frame.
# Selecting several columns needs a list of labels (double brackets);
# df_features['a', 'b'] is looked up as a single tuple key and raises KeyError.
# .copy() makes the result an independent frame, as is done for df_features.
selected_final_df = df_features[[
    'Churn_encoded', 'SeniorCitizen', 'Partner_encoded',
    'Dependents_encoded', 'tenure', 'OnlineSecurity_encoded', 'OnlineBackup_encoded',
    'DeviceProtection_encoded', 'TechSupport_encoded', 'Contract_encoded',
    'PaperlessBilling_encoded', 'PaymentMethod_encoded', 'MonthlyCharges',
    'TotalCharges',
]].copy()