**Importing the necessary libraries**

In [194]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import QuantileTransformer, StandardScaler, OneHotEncoder, LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, f1_score
import seaborn as sns

**Import and see general statistics for csv file**

In [195]:
data = pd.read_csv("Churn_Modelling.csv")

In [196]:
data.head

<bound method NDFrame.head of       RowNumber  CustomerId    Surname  CreditScore Geography  Gender  Age  \
0             1    15634602   Hargrave          619    France  Female   42   
1             2    15647311       Hill          608     Spain  Female   41   
2             3    15619304       Onio          502    France  Female   42   
3             4    15701354       Boni          699    France  Female   39   
4             5    15737888   Mitchell          850     Spain  Female   43   
...         ...         ...        ...          ...       ...     ...  ...   
9995       9996    15606229   Obijiaku          771    France    Male   39   
9996       9997    15569892  Johnstone          516    France    Male   35   
9997       9998    15584532        Liu          709    France  Female   36   
9998       9999    15682355  Sabbatini          772   Germany    Male   42   
9999      10000    15628319     Walker          792    France  Female   28   

      Tenure    Balance  NumOfPro

**Checks that there all values are present**

In [197]:
data.isnull().sum()

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

**Assigns columns to X(features) and y(target)**

In [204]:
X = data[["CreditScore", "Age", "Tenure", "Balance", "NumOfProducts", "EstimatedSalary"]]
# X = data[["Area", "Eccentricity", "Extent"]]
y = data["Exited"] 



**Splits the dataset into training and testing data with 30% being for testing**

In [205]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=9, stratify=y)

In [206]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, stratify=y_train)

**Uses a label encoder to transform the two target strings into binary**

In [201]:
label_encoder = LabelEncoder()
data['Gender'] = label_encoder.fit_transform(data['Gender'])


**Fits the min max scaler to the training data**

In [207]:
X_train

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,EstimatedSalary
3814,646,30,7,0.00,2,153566.97
6873,628,37,9,0.00,2,34689.77
2537,573,33,0,90124.64,1,137476.71
3694,608,33,4,0.00,1,79304.38
6394,466,41,2,152102.18,2,181879.56
...,...,...,...,...,...,...
9123,718,33,7,102874.28,1,117841.06
2563,843,27,5,0.00,2,67494.23
837,686,56,5,111642.08,1,80553.87
6058,668,56,9,110993.79,1,134396.64


In [208]:
X_train_scaled = MinMaxScaler().fit_transform(X_train)
X_train_scaled

array([[0.592     , 0.16216216, 0.7       , 0.        , 0.33333333,
        0.76785028],
       [0.556     , 0.25675676, 0.9       , 0.        , 0.33333333,
        0.17340751],
       [0.446     , 0.2027027 , 0.        , 0.40682301, 0.        ,
        0.6873913 ],
       ...,
       [0.672     , 0.51351351, 0.5       , 0.50395282, 0.        ,
        0.40274991],
       [0.636     , 0.51351351, 0.9       , 0.50102644, 0.        ,
        0.67198947],
       [0.638     , 0.43243243, 0.4       , 0.67580787, 0.66666667,
        0.62436383]])

Converts all the values into a value between 0 and 1 so that columns with significantly larger values do not dominate the decisions

**Fits a logistic regression machine learning model to the training data**

In [214]:
model_accuracy_results = []

In [215]:
log_reg = LogisticRegression(random_state=9, max_iter=10000)
log_reg.fit(X_train, y_train)
y_pred_val = log_reg.predict(X_val)
model_accuracy_results.append(accuracy_score(y_val, y_pred_val))


In [216]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
y_pred_val = knn.predict(X_val)
model_accuracy_results.append(accuracy_score(y_val, y_pred_val))


In [217]:
for score in model_accuracy_results:
    print(score)

0.77625
0.7575


**Tests the prediction of the model on test data**

**Creates a confusion matrix**

In [218]:
cnf_matrix = confusion_matrix(y_val, y_pred_val)

In [117]:
cnf_matrix


array([[80, 16],
       [17, 67]], dtype=int64)

The top left(True positive) and bottom right(True negative) have high values which shows the model is relatively accurate with predictions

In [118]:
accuracy_score(y_test, y_pred)

0.8166666666666667

In [119]:
f1_test_score = f1_score(y_test, y_pred)
f1_test_score

0.8023952095808383

In [120]:
test_record = {
    "Area": 75000,
    # "MajorAxisLength": 600,
    # "MinorAxisLength": 200,
    "Eccentricity": 0.7,
    # "ConvexArea": 200000,
    "Extent": 0.7,
    # "Perimeter": 1000}
}

test_record_df = pd.DataFrame([test_record])
test_prediction = log_reg.predict(test_record_df)

test_prediction

ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- ConvexArea
- MajorAxisLength
- MinorAxisLength
- Perimeter
