In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score , classification_report
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

original_dataframe = pd.read_csv('dataset.csv')
df = original_dataframe.copy()
df.shape

(7043, 21)

In [3]:
def preprocess(df):
    # Drop the 'customerID' column
    if 'customerID' in df.columns:
        df.drop('customerID', axis=1, inplace=True)
    
    df = df.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)  
    # Strip whitespace from object columns
    
    df.replace('', np.nan, inplace=True)
    # Replace empty strings with NaN
    
    df = df.dropna().reset_index(drop=True)
    # Drop rows with NaN values
     
    df['TotalCharges'] = df['TotalCharges'].astype(float)
    df['SeniorCitizen'] = df['SeniorCitizen'].astype(float)
    df['tenure'] = df['tenure'].astype(float)
    df['MonthlyCharges'] = df['MonthlyCharges'].astype(float)
    # Convert 'TotalCharges' column to float
        
    return df

In [4]:
new_df = preprocess(df)
new_df
print(new_df.shape)

X = new_df.iloc[:, :-1]
y = new_df.iloc[:, -1]

print(X.shape, y.shape)
# X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=105)

(7032, 20)
(7032, 19) (7032,)


In [5]:
def X_encoder(X):    
    
    numerical_columns = X.select_dtypes(include='float').columns
    categorical_columns = X.select_dtypes(include='object').columns
    
    X_num = X[numerical_columns].values
    
    ohe = OneHotEncoder(sparse_output = False, drop= 'first', handle_unknown='ignore')
    X_cat = ohe.fit_transform(X[categorical_columns])
    
    X = np.hstack((X_num,X_cat))
    
    return X

In [6]:
X = X_encoder(X)
print(X.shape)
X

(7032, 30)


array([[  0.  ,   1.  ,  29.85, ...,   0.  ,   1.  ,   0.  ],
       [  0.  ,  34.  ,  56.95, ...,   0.  ,   0.  ,   1.  ],
       [  0.  ,   2.  ,  53.85, ...,   0.  ,   0.  ,   1.  ],
       ...,
       [  0.  ,  11.  ,  29.6 , ...,   0.  ,   1.  ,   0.  ],
       [  1.  ,   4.  ,  74.4 , ...,   0.  ,   0.  ,   1.  ],
       [  0.  ,  66.  , 105.65, ...,   0.  ,   0.  ,   0.  ]])

In [7]:
def y_encoder(y):
    
    lab = LabelEncoder()

    y = lab.fit_transform(y)
    
    return y

In [8]:
y = y_encoder(y)
print(y.shape)

y

(7032,)


array([0, 0, 1, ..., 0, 1, 0])

In [9]:
from imblearn.over_sampling import SMOTE
def smoter(X, y):
    sm = SMOTE(random_state=102)
    
    X, y = sm.fit_resample(X, y.ravel())
    
    return X, y

In [10]:
X, y = smoter(X, y)

In [11]:
X.shape, y.shape

((10326, 30), (10326,))

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=105)

In [13]:
from sklearn.pipeline import Pipeline

# Create the pipeline
pipeline = Pipeline([
    ('Standardization', StandardScaler()),
    ('Random_Forest', RandomForestClassifier())
     ])

# Fit the pipeline to the training data
pipeline.fit(X_train, y_train)

# Make predictions on the test data
y_pred = pipeline.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)

In [14]:
pipeline

In [15]:
accuracy

0.8707647628267183

In [16]:
from sklearn.model_selection import GridSearchCV

# Define the hyperparameters and their possible values
param_grid = {
    'Random_Forest__n_estimators': [100, 200, 300],
    'Random_Forest__max_depth': [None, 5, 10]
}

# Create the GridSearchCV object with the pipeline and parameter grid
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')

# Fit the GridSearchCV object to the training data
grid_search.fit(X_train, y_train)

# Get the best model and its hyperparameters
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Print the best hyperparameters and score
print("Best Hyperparameters:", best_params)
print("Best Score:", best_score)

# Make predictions on the test data using the best model
y_pred = best_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

print(classification_report(y_test, y_pred))

Best Hyperparameters: {'Random_Forest__max_depth': None, 'Random_Forest__n_estimators': 300}
Best Score: 0.8571428571428571
Accuracy: 0.8712487899322362
              precision    recall  f1-score   support

           0       0.85      0.90      0.87      1037
           1       0.89      0.85      0.87      1029

    accuracy                           0.87      2066
   macro avg       0.87      0.87      0.87      2066
weighted avg       0.87      0.87      0.87      2066



In [17]:
print(best_model)

Pipeline(steps=[('Standardization', StandardScaler()),
                ('Random_Forest', RandomForestClassifier(n_estimators=300))])


In [18]:
import pickle

filename = 'model.pkl'

pickle.dump(best_model, open(filename, 'wb'))

load_model = pickle.load(open(filename, 'rb'))

model_score_r1 = load_model.score(X_test, y_test)

model_score_r1

0.8712487899322362

In [19]:
y_pred_prob = best_model.predict_proba(X_test)
y_pred_prob

array([[0.97333333, 0.02666667],
       [0.56      , 0.44      ],
       [1.        , 0.        ],
       ...,
       [0.81      , 0.19      ],
       [0.21627778, 0.78372222],
       [0.00333333, 0.99666667]])

In [20]:
y_pred_prob[0]

array([0.97333333, 0.02666667])

In [21]:
y_pred[0]

0

In [22]:
X_test

array([[ 0.        , 33.        , 20.1       , ...,  0.        ,
         0.        ,  1.        ],
       [ 1.        , 42.        , 74.15      , ...,  1.        ,
         0.        ,  0.        ],
       [ 0.        , 72.        , 84.9       , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 1.        ,  7.        , 64.95      , ...,  0.        ,
         1.        ,  0.        ],
       [ 0.        ,  1.        , 20.05      , ...,  0.        ,
         0.        ,  1.        ],
       [ 0.43377282, 15.43377282, 98.21541071, ...,  0.        ,
         1.        ,  0.        ]])

In [23]:
load_model.predict_proba(X_test)

array([[0.97333333, 0.02666667],
       [0.56      , 0.44      ],
       [1.        , 0.        ],
       ...,
       [0.81      , 0.19      ],
       [0.21627778, 0.78372222],
       [0.00333333, 0.99666667]])

In [28]:
x = ['Female',0,'Yes','No',1,'No','No phone service',
 'DSL','No','Yes','No','No','No','No',
 'Month-to-month','Yes','Electronic check',29.85,29.85]

col =['gender', 'SeniorCitizen','Partner','Dependents',
      'tenure','PhoneService','MultipleLines','InternetService',
      'OnlineSecurity','OnlineBackup','DeviceProtection',
      'TechSupport','StreamingTV','StreamingMovies','Contract',
      'PaperlessBilling','PaymentMethod','MonthlyCharges',
      'TotalCharges']

new_df = pd.DataFrame(data = [x], columns=col)

df = pd.concat([original_dataframe.iloc[:,1:len(original_dataframe.columns)-1],
                new_df], 
               ignore_index = True) 

X = preprocess(df)

X = X_encoder(X)

X_test = X[-1].reshape(1, -1)
prediction = load_model.predict(X_test)
prob = load_model.predict_proba(X_test)
probability_No,probability_yes = prob[0][0]*100 , prob[0][1]*100

if prediction[0]==0:
    print('No Churn')
    print(f'{round(probability_No,2)}% Probability')
else:
    print('yes Churn')
    print(f'{round(probability_yes,2)}% Probability')


No Churn
92.67% Probability


In [25]:
prob = load_model.predict_proba(X_test)


In [26]:
probability_No,probability_yes

(92.66666666666666, 7.333333333333333)

In [27]:
Male
1
No
No
1
No
No
phone service
DSL
No
No
Yes
No
No
Yes
Month-to-month
Yes
Electronic check
39.65
39.65


SyntaxError: invalid syntax (3036699800.py, line 8)