In [28]:
import numpy as np
import pandas as pd

In [29]:
df = pd.read_csv("7 churn.csv")
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


## KEEPING ONLY THE IMPORTANT COLUMNS

In [30]:
columns_to_keep = ['gender' , 'SeniorCitizen','Partner','Dependents','tenure','PhoneService','MultipleLines','Contract','TotalCharges','Churn']

In [31]:
df = df[columns_to_keep]

In [32]:
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,Contract,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,Month-to-month,29.85,No
1,Male,0,No,No,34,Yes,No,One year,1889.5,No
2,Male,0,No,No,2,Yes,No,Month-to-month,108.15,Yes
3,Male,0,No,No,45,No,No phone service,One year,1840.75,No
4,Female,0,No,No,2,Yes,No,Month-to-month,151.65,Yes


## ENCODING THE DATA INTO BINARY 

In [33]:
binary_columns = ['gender' , 'Partner','Dependents','PhoneService','MultipleLines','Contract','Churn']
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

for cols in binary_columns:
    df[cols] = label_encoder.fit_transform(df[cols])

In [34]:
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,Contract,TotalCharges,Churn
0,0,0,1,0,1,0,1,0,29.85,0
1,1,0,0,0,34,1,0,1,1889.5,0
2,1,0,0,0,2,1,0,0,108.15,1
3,1,0,0,0,45,0,1,1,1840.75,0
4,0,0,0,0,2,1,0,0,151.65,1


## Train Test Splitting

In [35]:
X = df.drop('Churn' , axis = 1)
y = df['Churn']

In [36]:
from sklearn.model_selection import train_test_split
X_train , X_test , y_train , y_test = train_test_split(X , y , test_size=0.2 , random_state=42)

## Converting the TOTALCHARGES into float

In [37]:
X_train['TotalCharges'] = pd.to_numeric(X_train['TotalCharges'] ,errors = 'coerce')  
X_test['TotalCharges'] =  pd.to_numeric(X_test['TotalCharges'] , errors = 'coerce')

In [38]:
X_train.isnull().sum()

gender            0
SeniorCitizen     0
Partner           0
Dependents        0
tenure            0
PhoneService      0
MultipleLines     0
Contract          0
TotalCharges     10
dtype: int64

## REMOVING THE NULL VALUES 

In [39]:
X_train['TotalCharges'].fillna(X_train['TotalCharges'].mean() , inplace = True)
X_test['TotalCharges'].fillna(X_test['TotalCharges'].mean() , inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train['TotalCharges'].fillna(X_train['TotalCharges'].mean() , inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test['TotalCharges'].fillna(X_test['TotalCharges'].mean() , inplace = True)


In [40]:
X_train.isnull().sum()

gender           0
SeniorCitizen    0
Partner          0
Dependents       0
tenure           0
PhoneService     0
MultipleLines    0
Contract         0
TotalCharges     0
dtype: int64

## STANDARDIZATION OF DATA

In [41]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)


In [42]:
X_train

array([[-1.02516569, -0.4377492 , -0.96957859, ..., -1.00053704,
         0.37290835, -0.42210502],
       [-1.02516569, -0.4377492 , -0.96957859, ...,  1.10833901,
         1.5775905 ,  1.25536015],
       [ 0.97545208, -0.4377492 ,  1.03137591, ...,  0.05390099,
        -0.83177379, -1.00299144],
       ...,
       [ 0.97545208, -0.4377492 ,  1.03137591, ..., -1.00053704,
        -0.83177379, -0.87799925],
       [ 0.97545208,  2.28441306, -0.96957859, ...,  1.10833901,
        -0.83177379, -0.48254445],
       [ 0.97545208, -0.4377492 , -0.96957859, ..., -1.00053704,
         0.37290835, -0.81110232]], shape=(5634, 9))

## APPLYING LOGISTIC REGRESSION

In [43]:
from sklearn.linear_model import LogisticRegression
lg = LogisticRegression()
lg.fit(X_train , y_train)

In [44]:
y_pred = lg.predict(X_test)
y_pred

array([1, 0, 0, ..., 0, 0, 1], shape=(1409,))

## ACCURACY CHECKING

In [45]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.7778566359119943

In [46]:
import pickle
pickle.dump(lg,open('Project2_CustomerChur.pkl' , 'wb'))

## CREATING A CLASSIFICATION SYSTEM

In [47]:
def predictive(gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,Contract,TotalCharges):
    data = {
        'gender' : [gender],
        'SeniorCitizen' : [SeniorCitizen],
        'Partner' : [Partner],
        'Dependents' : [Dependents],
        'tenure' : [tenure],
        'PhoneService' : [PhoneService],
        'MultipleLines' : [MultipleLines],
        'Contract' : [Contract],
        'TotalCharges' : [TotalCharges],
        
    }

    df1 = pd.DataFrame(data)

    #Encoding the categorical columns
    categorical_columns = ['gender','SeniorCitizen','Partner','Dependents','tenure','PhoneService','MultipleLines','Contract','TotalCharges']
    for cols in categorical_columns:
        df1[cols] = label_encoder.fit_transform(df1[cols])

    df1= scaler.fit_transform(df1)
    result = lg.predict(df1).reshape(1,-1)
    return result[0]

In [51]:
gender = 'Female'
SeniorCitizen = 'No'
Partner = 'Yes'
Dependents = 'No'
tenure = 1
PhoneService = 'No'
MultipleLines = 'No phone service'
Contract = 'Month-to-Month'
TotalCharges = 29.85
result = predictive(gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,Contract,TotalCharges)

if result == 0:
    print("Not Churn")
else:
    print("Churn")


Not Churn
