# Training Data Trasformation

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score

In [None]:
train_data = pd.read_csv("/content/churn-bigml-80.csv")
train_data.head()

Unnamed: 0,State,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn
0,KS,128,415,No,Yes,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,OH,107,415,No,Yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,NJ,137,415,No,No,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,Yes,No,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,Yes,No,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


In [None]:
columns = train_data.columns
row = len(train_data.index)
print(columns)
print(len(columns))
row

Index(['State', 'Account length', 'Area code', 'International plan',
       'Voice mail plan', 'Number vmail messages', 'Total day minutes',
       'Total day calls', 'Total day charge', 'Total eve minutes',
       'Total eve calls', 'Total eve charge', 'Total night minutes',
       'Total night calls', 'Total night charge', 'Total intl minutes',
       'Total intl calls', 'Total intl charge', 'Customer service calls',
       'Churn'],
      dtype='object')
20


2666

In [None]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2666 entries, 0 to 2665
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   State                   2666 non-null   object 
 1   Account length          2666 non-null   int64  
 2   Area code               2666 non-null   int64  
 3   International plan      2666 non-null   object 
 4   Voice mail plan         2666 non-null   object 
 5   Number vmail messages   2666 non-null   int64  
 6   Total day minutes       2666 non-null   float64
 7   Total day calls         2666 non-null   int64  
 8   Total day charge        2666 non-null   float64
 9   Total eve minutes       2666 non-null   float64
 10  Total eve calls         2666 non-null   int64  
 11  Total eve charge        2666 non-null   float64
 12  Total night minutes     2666 non-null   float64
 13  Total night calls       2666 non-null   int64  
 14  Total night charge      2666 non-null   

In [None]:
train_data.isnull().sum()

State                     0
Account length            0
Area code                 0
International plan        0
Voice mail plan           0
Number vmail messages     0
Total day minutes         0
Total day calls           0
Total day charge          0
Total eve minutes         0
Total eve calls           0
Total eve charge          0
Total night minutes       0
Total night calls         0
Total night charge        0
Total intl minutes        0
Total intl calls          0
Total intl charge         0
Customer service calls    0
Churn                     0
dtype: int64

In [None]:
unq_states = train_data["State"].unique()
print(unq_states)
print(train_data["International plan"].unique())
print(train_data["Voice mail plan"].unique())
print(train_data["Churn"].unique())
unq_states = list(unq_states)

['KS' 'OH' 'NJ' 'OK' 'AL' 'MA' 'MO' 'WV' 'RI' 'IA' 'MT' 'ID' 'VT' 'VA'
 'TX' 'FL' 'CO' 'AZ' 'NE' 'WY' 'IL' 'NH' 'LA' 'GA' 'AK' 'MD' 'AR' 'WI'
 'OR' 'DE' 'IN' 'UT' 'CA' 'SD' 'NC' 'WA' 'MN' 'NM' 'NV' 'DC' 'NY' 'KY'
 'ME' 'MS' 'MI' 'SC' 'TN' 'PA' 'HI' 'ND' 'CT']
['No' 'Yes']
['Yes' 'No']
[False  True]


## Feature Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
train_data["International plan"] = le.fit_transform(train_data["International plan"])
train_data["Voice mail plan"] = le.fit_transform(train_data["Voice mail plan"])
train_data["Churn"] = le.fit_transform(train_data["Churn"])

In [None]:
print(train_data["International plan"].unique())
print(train_data["Voice mail plan"].unique())
print(train_data["Churn"].unique())

[0 1]
[1 0]
[0 1]


In [None]:
temp = pd.get_dummies(train_data["State"])
train_data = pd.concat([train_data, temp],axis=1)
train_data.drop(["State"], axis = 1, inplace = True)

In [None]:
train_data.head()

Unnamed: 0,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,...,SD,TN,TX,UT,VA,VT,WA,WI,WV,WY
0,128,415,0,1,25,265.1,110,45.07,197.4,99,...,0,0,0,0,0,0,0,0,0,0
1,107,415,0,1,26,161.6,123,27.47,195.5,103,...,0,0,0,0,0,0,0,0,0,0
2,137,415,0,0,0,243.4,114,41.38,121.2,110,...,0,0,0,0,0,0,0,0,0,0
3,84,408,1,0,0,299.4,71,50.9,61.9,88,...,0,0,0,0,0,0,0,0,0,0
4,75,415,1,0,0,166.7,113,28.34,148.3,122,...,0,0,0,0,0,0,0,0,0,0


## Feature Scaling

In [None]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
unq_states.append("International plan")
unq_states.append("Voice mail plan")
scale_col = train_data.drop(unq_states, axis = 1).columns
train_data[scale_col] = scaler.fit_transform(train_data[scale_col])

In [None]:
train_data.head()

Unnamed: 0,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,...,SD,TN,TX,UT,VA,VT,WA,WI,WV,WY
0,0.524793,0.068627,0,1,0.5,0.755701,0.6875,0.755701,0.542755,0.582353,...,0,0,0,0,0,0,0,0,0,0
1,0.438017,0.068627,0,1,0.52,0.460661,0.76875,0.460597,0.537531,0.605882,...,0,0,0,0,0,0,0,0,0,0
2,0.561983,0.068627,0,0,0.0,0.693843,0.7125,0.69383,0.333242,0.647059,...,0,0,0,0,0,0,0,0,0,0
3,0.342975,0.0,1,0,0.0,0.853478,0.44375,0.853454,0.170195,0.517647,...,0,0,0,0,0,0,0,0,0,0
4,0.305785,0.068627,1,0,0.0,0.4752,0.70625,0.475184,0.407754,0.717647,...,0,0,0,0,0,0,0,0,0,0


## Spliting Data

In [None]:
X_train = train_data.iloc[:, :-1].values
y_train = train_data.iloc[:, -1].values

In [None]:
X_train[:1]

array([[0.52479339, 0.06862745, 0.        , 1.        , 0.5       ,
        0.75570125, 0.6875    , 0.75570087, 0.54275502, 0.58235294,
        0.54286639, 0.57216055, 0.43609023, 0.5721519 , 0.5       ,
        0.15      , 0.5       , 0.11111111, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        1.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        ]])

In [None]:
y_train[:5]

array([0, 0, 0, 0, 0], dtype=uint8)

## Train Regression Model

In [None]:
model = LogisticRegression()
model.fit(X_train, y_train)
training_result = model.predict(X_train)

## Train Results

In [None]:
cm = confusion_matrix(y_train, training_result)
acc = accuracy_score(y_train, training_result)

print(f"TRAINING RESULTS : \n {cm} \nAccuracy : {acc*100}%")

TRAINING RESULTS : 
 [[2600    0]
 [  66    0]] 
 97.52438109527381%


# Test Data

In [None]:
test_data = pd.read_csv("/content/churn-bigml-20.csv")
test_data.head()

Unnamed: 0,State,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn
0,LA,117,408,No,No,0,184.5,97,31.37,351.6,80,29.89,215.8,90,9.71,8.7,4,2.35,1,False
1,IN,65,415,No,No,0,129.1,137,21.95,228.5,83,19.42,208.8,111,9.4,12.7,6,3.43,4,True
2,NY,161,415,No,No,0,332.9,67,56.59,317.8,97,27.01,160.6,128,7.23,5.4,9,1.46,4,True
3,SC,111,415,No,No,0,110.4,103,18.77,137.3,102,11.67,189.6,105,8.53,7.7,6,2.08,2,False
4,HI,49,510,No,No,0,119.3,117,20.28,215.1,109,18.28,178.7,90,8.04,11.1,1,3.0,1,False


In [None]:
print(test_data["International plan"].unique())
print(test_data["Voice mail plan"].unique())
print(test_data["Churn"].unique())

['No' 'Yes']
['No' 'Yes']
[False  True]


## Feature Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
test_data["International plan"] = le.fit_transform(test_data["International plan"])
test_data["Voice mail plan"] = le.fit_transform(test_data["Voice mail plan"])
test_data["Churn"] = le.fit_transform(test_data["Churn"])

In [None]:
temp = pd.get_dummies(test_data["State"])
test_data = pd.concat([test_data, temp],axis=1)
test_data.drop(["State"], axis = 1, inplace = True)

## Feature Scaling

In [None]:
scale_col = test_data.drop(unq_states, axis = 1).columns
test_data[scale_col] = scaler.fit_transform(test_data[scale_col])

## Spliting Data

In [None]:
X_test = test_data.iloc[:, :-1].values
y_test = test_data.iloc[:, -1].values

In [None]:
testing_result = model.predict(X_test)

## Test Results

In [None]:
cm = confusion_matrix(y_test, testing_result)
acc = accuracy_score(y_test, testing_result)

print(f"TESTING RESULTS : \n {cm} \n Accuracy : {acc*100}%")

TESTING RESULTS : 
 [[656   0]
 [ 11   0]] 
 98.35082458770614%
