In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import scipy.stats as stats
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import recall_score, accuracy_score, classification_report, confusion_matrix

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from imblearn.over_sampling import RandomOverSampler
from imblearn.combine import SMOTEENN
from sklearn.feature_selection import SelectKBest
from collections import Counter

import pickle
import sqlalchemy as sa
import pyodbc
import warnings
warnings.filterwarnings('ignore')
import matplotlib.ticker as mtick

In [7]:
class Telecom_customer():
    
# Data Gathering & Data Analysis
    def get_data(self):
        """fetch data from database and return dataframe"""
        Engine=sa.create_engine("mysql+pymysql://root:56664444Kj@localhost:3306/project")
        self.churn_df = pd.read_sql_table('customer_churn',Engine)
        
        #Categorical feature
        self.categorical_feature = {feature for feature in self.churn_df.columns if self.churn_df[feature].dtypes == 'O'}
#         print(f'Count of Categorical feature: {len(self.categorical_feature)}')
#         print(f'Categorical feature are:\n {self.categorical_feature}')

# Feature Selection
    def feature_selection(self):

        # Total charges are in object dtype so convert into Numerical feature 
        self.churn_df['Total_Charges'] = pd.to_numeric(self.churn_df['Total_Charges'], errors='coerce')
        
        # replace NaN values with mean value
        self.churn_df.Total_Charges = self.churn_df.Total_Charges.fillna(self.churn_df.Total_Charges.median())
        
        features = ['Gender','Senior_Citizen','Partner','Dependents','Tenure_Months','Phone_Service','Multiple_Lines',
                    'Internet_Service','Online_Security','Online_Backup','Device_Protection','Tech_Support',
                    'Streaming_TV','Streaming_Movies','Contract',
                 'Paperless_Billing','Payment_Method','Monthly_Charges','Total_Charges']
        
        # Categorical feature
        categorical_feature = {feature for feature in  self.churn_df.columns if self.churn_df[feature].dtypes == 'O'}
#         print(f'Count of Categorical feature: {len(categorical_feature)}')
#         print(f'Categorical feature are:\n {categorical_feature}')
        
        encoder = LabelEncoder()
        for feature in categorical_feature:
             self.churn_df[feature] = encoder.fit_transform(self.churn_df[feature])
            
        self.churn_df.drop(['CustomerID','Count', 'Country', 'State', 'City','Zip_Code','Lat_Long','Longitude',
                       'Latitude','Churthn_Label','Churn_Score','CLTV','Churn_Reason'],axis=1,inplace=True)
    
    def preprocess_data(self):
        """preprocess data and return model trainable data"""
        self.X = self.churn_df.drop(['Churn_Value'],axis=1)
        self.Y = self.churn_df['Churn_Value']
        
    def train_model(self):
    # Model Training/Building
    # Using SMOTENN Technique
        st=SMOTEENN()
        self.x_train_st, self.y_train_st = st.fit_resample(self.X,self.Y)
        print("The number of classes before fit {}".format(Counter(self.Y)))
        print("The number of classes after fit {}".format(Counter(self.y_train_st)))
        
#splitting the over sampling dataset 
        self.x_train_sap, self.x_test_sap, self.y_train_sap, self.y_test_sap = train_test_split(self.x_train_st, 
                                                                                    self.y_train_st, test_size=0.2)
# Random forest classifier
        self.Rfc_sampling = RandomForestClassifier(n_estimators=150,criterion='gini', max_depth=15, 
                                                   min_samples_leaf=10, min_samples_split=6)
        self.Rfc_sampling.fit(self.x_train_sap, self.y_train_sap)


    def test_evaluation(self,string):
# Model Evaluatioon Training & Testing Data
        print(string.center(50,'*'))
    
        self.test_pred = self.Rfc_sampling.predict(self.x_test_sap)
        cnf_matrix = confusion_matrix(self.y_test_sap, self.test_pred)
        print('Confusion Matrix :\n', cnf_matrix)
    
        accuracy = accuracy_score(self.y_test_sap, self.test_pred)
        print('Accuracy :', accuracy)
    
        clf_report = classification_report(self.y_test_sap, self.test_pred )
        print('Classification Report :\n', clf_report)
    
    def train_evaluation(self,string):

        print(string.center(50,'*'))
    
        self.train_pred = self.Rfc_sampling.predict(self.x_train_sap)
        cnf_matrix = confusion_matrix(self.y_train_sap, self.train_pred)
        print('Confusion Matrix :\n', cnf_matrix)
    
        accuracy = accuracy_score(self.y_train_sap, self.train_pred)
        print('Accuracy :', accuracy)
    
        clf_report = classification_report(self.y_train_sap, self.train_pred)
        print('Classification Report :\n', clf_report)
        
        return self.churn_df

    def load_file(self):
        with open('RandomForest_model.pkl', 'wb') as f:
            pickle.dump(self.Rfc_sampling, f)
            
        # Load the Model back from file
        with open('RandomForest_model.pkl', 'rb') as file:  
            self.load_model = pickle.load(file)
            
            
    def new_data(self):
        Gender = 'Female'
        Senior_Citizen = 'No'
        Partner = 'Yes'
        Dependents = 'No'
        Tenure_Months = 50
        Phone_Service = 'Yes'
        Multiple_Lines = 'No'
        Internet_Service = 'Yes'
        Online_Security = 'No internet service'
        Online_Backup = 'No internet service'
        Device_Protection = 'No internet service'
        Tech_Support = 'No internet service'
        Streaming_TV = 'Yes'
        Streaming_Movies = 'Yes'
        Contract = 'Two year'
        Paperless_Billing = 'Yes'
        Payment_Method = 'Bank transfer (automatic)'
        Monthly_Charges = 30.21
        Total_Charges = 2499.4
        
        data = [[Gender,Senior_Citizen,Partner,Dependents,Tenure_Months,Phone_Service,Multiple_Lines,Internet_Service,
                 Online_Security,Online_Backup,Device_Protection,Tech_Support,Streaming_TV,Streaming_Movies,Contract,
                 Paperless_Billing,Payment_Method,Monthly_Charges,Total_Charges]]

        df = pd.DataFrame(data, columns=['Gender','Senior_Citizen','Partner','Dependents','Tenure_Months',
                                         'Phone_Service','Multiple_Lines','Internet_Service','Online_Security',
                                         'Online_Backup','Device_Protection','Tech_Support','Streaming_TV',
                                         'Streaming_Movies','Contract','Paperless_Billing','Payment_Method',
                                         'Monthly_Charges','Total_Charges'])
        print('**'*50)
        print(self.load_model.score(self.x_test_sap, self.y_test_sap))

        for feature in df.columns:
            if df[feature].dtypes == 'object':
                categorical_feature = feature
                print(categorical_feature)
    
        encoder = LabelEncoder()
        for feature in df.columns:
            if df[feature].dtypes == 'object':
                df[feature] = encoder.fit_transform(df[feature])
        
        self.single = self.load_model.predict(df)
        self.probability = self.load_model.predict_proba(df)[:,1]
        
        print('**'*50)
        print('Prediccted Class=',self.single)
        print('Probablity=',self.probability)
        
        if self.single == 1:
            print("This Customer is likely to be Churned!")
            print(f"Confidence level is {np.round(self.probability*100, 2)}")
        else:
            print("This Customer is likely to be Continue!")
            print(f"Confidence level is {np.round(self.probability*100, 2)}")
            
            
obj=Telecom_customer()
obj.get_data()
obj.feature_selection()
obj.preprocess_data()
obj.train_model()
obj.test_evaluation('Test Data Evaluation')
print('*#'*30)
obj.train_evaluation('Train Data Evaluation')
obj.load_file()
obj.new_data()



The number of classes before fit Counter({0: 5163, 1: 1869})
The number of classes after fit Counter({1: 3224, 0: 2672})
***************Test Data Evaluation***************
Confusion Matrix :
 [[480  42]
 [ 40 618]]
Accuracy : 0.9305084745762712
Classification Report :
               precision    recall  f1-score   support

           0       0.92      0.92      0.92       522
           1       0.94      0.94      0.94       658

    accuracy                           0.93      1180
   macro avg       0.93      0.93      0.93      1180
weighted avg       0.93      0.93      0.93      1180

*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#
**************Train Data Evaluation***************
Confusion Matrix :
 [[2022  128]
 [  80 2486]]
Accuracy : 0.9558948261238338
Classification Report :
               precision    recall  f1-score   support

           0       0.96      0.94      0.95      2150
           1       0.95      0.97      0.96      2566

    accuracy             