In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score , classification_report
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

df = pd.read_csv('dataset.csv')

In [9]:
def preprocess(df):
    # Drop the 'customerID' column
    if 'customerID' in df.columns:
        df.drop('customerID', axis=1, inplace=True)
    
    df = df.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)  
    # Strip whitespace from object columns
    
    df.replace('', np.nan, inplace=True)  
    # Replace empty strings with NaN
    
    df.dropna(inplace=True)  
    # Drop rows with NaN values
     
    df['TotalCharges'] = df['TotalCharges'].astype(float)
    df['SeniorCitizen'] = df['SeniorCitizen'].astype(float)
    df['tenure'] = df['tenure'].astype(float)
    df['MonthlyCharges'] = df['MonthlyCharges'].astype(float)
    # Convert 'TotalCharges' column to float
    
    new_df = pd.get_dummies(df, drop_first=True)
    
    return new_df

In [10]:
def smotter(new_df):
    X = new_df.iloc[:, :-1]
    y = new_df.iloc[:, -1]

    X_train, X_test, y_train, y_test = train_test_split(X, 
                                                        y,
                                                        test_size=0.2,
                                                        random_state=105)
    
    sm = SMOTE(random_state=102)

    X_train_res, y_train_res = sm.fit_resample(X_train, y_train.ravel())
    
    X_train, X_test, y_train, y_test = train_test_split(X_train_res,
                                                        y_train_res, 
                                                        test_size=0.2,
                                                        random_state=105)
    
    return X_train, X_test, y_train, y_test

In [11]:
new_df = preprocess(df)

X_train, X_test, y_train, y_test = smotter(new_df)

In [12]:
X_train.shape

(6604, 30)

In [13]:
from sklearn.pipeline import make_pipeline

# Create the pipeline
pipeline = Pipeline([
    ('Standardization', StandardScaler()),
    ('Random_Forest', RandomForestClassifier())
     ])

# Fit the pipeline to the training data
pipeline.fit(X_train, y_train)

# Make predictions on the test data
y_pred = pipeline.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)


In [14]:
pipeline

In [15]:
accuracy

0.8504842615012107

In [16]:
from sklearn.model_selection import GridSearchCV

# Define the hyperparameters and their possible values
param_grid = {
    'Random_Forest__n_estimators': [100, 200, 300],
    'Random_Forest__max_depth': [None, 5, 10]
}

# Create the GridSearchCV object with the pipeline and parameter grid
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')

# Fit the GridSearchCV object to the training data
grid_search.fit(X_train, y_train)

# Get the best model and its hyperparameters
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Print the best hyperparameters and score
print("Best Hyperparameters:", best_params)
print("Best Score:", best_score)

# Make predictions on the test data using the best model
y_pred = best_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

print(classification_report(y_test, y_pred))

Best Hyperparameters: {'Random_Forest__max_depth': None, 'Random_Forest__n_estimators': 300}
Best Score: 0.8452460257380772
Accuracy: 0.8535108958837773
              precision    recall  f1-score   support

           0       0.87      0.84      0.86       845
           1       0.84      0.86      0.85       807

    accuracy                           0.85      1652
   macro avg       0.85      0.85      0.85      1652
weighted avg       0.85      0.85      0.85      1652



In [17]:
print(best_model)

Pipeline(steps=[('Standardization', StandardScaler()),
                ('Random_Forest', RandomForestClassifier(n_estimators=300))])


In [18]:
import pickle

filename = 'model.pkl'

pickle.dump(best_model, open(filename, 'wb'))

load_model = pickle.load(open(filename, 'rb'))

model_score_r1 = load_model.score(X_test, y_test)

model_score_r1

0.8535108958837773

In [19]:
y_pred_prob = best_model.predict_proba(X_test)
y_pred_prob

array([[0.01666667, 0.98333333],
       [0.99      , 0.01      ],
       [0.48333333, 0.51666667],
       ...,
       [0.12      , 0.88      ],
       [0.99      , 0.01      ],
       [0.24666667, 0.75333333]])

In [20]:
y_pred_prob[0]

array([0.01666667, 0.98333333])

In [21]:
y_pred[0]

1

In [22]:
load_model.predict_proba(X_test)

array([[0.01666667, 0.98333333],
       [0.99      , 0.01      ],
       [0.48333333, 0.51666667],
       ...,
       [0.12      , 0.88      ],
       [0.99      , 0.01      ],
       [0.24666667, 0.75333333]])

In [23]:
x = ['Female',0,'Yes','No',1,'No','No phone service',
     'DSL','No','Yes','No','No','No','No',
     'Month-to-month','Yes','Electronic check',29.85,29.85]
    
col =['gender','SeniorCitizen','Partner','Dependents',
      'tenure','PhoneService','MultipleLines','InternetService',
      'OnlineSecurity','OnlineBackup','DeviceProtection',
      'TechSupport','StreamingTV','StreamingMovies',
      'Contract','PaperlessBilling','PaymentMethod',
      'MonthlyCharges','TotalCharges']

temp_df = pd.DataFrame(data= [x], columns= col)
temp_df

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85


In [24]:
col_for = ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges',
               'gender_Male', 'Partner_Yes', 'Dependents_Yes', 'PhoneService_Yes',
               'MultipleLines_No phone service', 'MultipleLines_Yes',
               'InternetService_Fiber optic', 'InternetService_No',
               'OnlineSecurity_No internet service', 'OnlineSecurity_Yes',
               'OnlineBackup_No internet service', 'OnlineBackup_Yes',
               'DeviceProtection_No internet service', 'DeviceProtection_Yes',
               'TechSupport_No internet service', 'TechSupport_Yes',
               'StreamingTV_No internet service', 'StreamingTV_Yes',
               'StreamingMovies_No internet service', 'StreamingMovies_Yes',
               'Contract_One year', 'Contract_Two year', 'PaperlessBilling_Yes',
               'PaymentMethod_Credit card (automatic)',
               'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check',
               'Churn_Yes']

In [26]:
def preprocess(df):
    # Drop the 'customerID' column
    if 'customerID' in df.columns:
        df.drop('customerID', axis=1, inplace=True)
    
    df = df.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)  
    # Strip whitespace from object columns
    
    df.replace('', np.nan, inplace=True)  
    # Replace empty strings with NaN
    
    df.dropna(inplace=True)  
    # Drop rows with NaN values
     
    df['TotalCharges'] = df['TotalCharges'].astype(float)
    df['SeniorCitizen'] = df['SeniorCitizen'].astype(float)
    df['tenure'] = df['tenure'].astype(float)
    df['MonthlyCharges'] = df['MonthlyCharges'].astype(float)
    # Convert 'TotalCharges' column to float
    
    col_for = ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges',
               'gender_Male', 'Partner_Yes', 'Dependents_Yes', 'PhoneService_Yes',
               'MultipleLines_No phone service', 'MultipleLines_Yes',
               'InternetService_Fiber optic', 'InternetService_No',
               'OnlineSecurity_No internet service', 'OnlineSecurity_Yes',
               'OnlineBackup_No internet service', 'OnlineBackup_Yes',
               'DeviceProtection_No internet service', 'DeviceProtection_Yes',
               'TechSupport_No internet service', 'TechSupport_Yes',
               'StreamingTV_No internet service', 'StreamingTV_Yes',
               'StreamingMovies_No internet service', 'StreamingMovies_Yes',
               'Contract_One year', 'Contract_Two year', 'PaperlessBilling_Yes',
               'PaymentMethod_Credit card (automatic)',
               'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check',
               'Churn_Yes']

    new_df = pd.get_dummies(df, columns=col_for, drop_first=True)
    
    return new_df

temp = preprocess(temp_df)
temp


KeyError: "['gender_Male', 'Partner_Yes', 'Dependents_Yes', 'PhoneService_Yes', 'MultipleLines_No phone service', 'MultipleLines_Yes', 'InternetService_Fiber optic', 'InternetService_No', 'OnlineSecurity_No internet service', 'OnlineSecurity_Yes', 'OnlineBackup_No internet service', 'OnlineBackup_Yes', 'DeviceProtection_No internet service', 'DeviceProtection_Yes', 'TechSupport_No internet service', 'TechSupport_Yes', 'StreamingTV_No internet service', 'StreamingTV_Yes', 'StreamingMovies_No internet service', 'StreamingMovies_Yes', 'Contract_One year', 'Contract_Two year', 'PaperlessBilling_Yes', 'PaymentMethod_Credit card (automatic)', 'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check', 'Churn_Yes'] not in index"

In [27]:
df.columns

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [None]:
new_df = pd.get_dummies(temp_df, drop_first=True)
new_df

In [None]:
load_model.predict(temp)