#  Data Preprocessing & Importing

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import statistics as st
import tensorflow as tf

## Importing the dataset

In [2]:
dataset = pd.read_csv('train.csv')
X = dataset.iloc[:,3:-1].values
y = dataset.iloc[:, -1].values
data=pd.read_csv("test.csv")
TD = data.iloc[:,3:].values


## Checking and filling for missing data 

In [3]:
#temporarily creating a variable to check for missing data
feature_X = pd.DataFrame(X)
feature_Y = pd.DataFrame(y)

# Check for missing values
missing_values_X = feature_X.isnull().sum().sum()
missing_values_Y = feature_Y.isnull().sum().sum()
print(f"Total missing values: {missing_values_X}")
print(f"Total missing values: {missing_values_Y}")

Total missing values: 0
Total missing values: 0


## Encoding categorical variable

### Label Encoder for Gender variable

In [4]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
X[:,2] = le.fit_transform(X[:,2])
TD[:,2] = le.fit_transform(TD[:,2])

###  One Hot Encoder for Geography

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [1])], remainder='passthrough')
X = ct.fit_transform(X)
TD = ct.transform(TD)

# Removing the features

In [6]:
from scipy.stats import chi2_contingency
for i in range(0,12):
    contingency_table = pd.crosstab(X[:,i], y)
    chi2_statistic, p_value, degrees_of_freedom, expected_counts = chi2_contingency(contingency_table)
    print("the syntax :", i)
    if p_value < 0.05:
        print("Reject null hypothesis & p_value is :" ,p_value)
    else:
        print("Fail to reject null hypothesis  & p_value is :",p_value)

the syntax : 0
Reject null hypothesis & p_value is : 1.7044531471585912e-66
the syntax : 1
Reject null hypothesis & p_value is : 4.273747023659205e-181
the syntax : 2
Reject null hypothesis & p_value is : 2.0059458282549308e-10
the syntax : 3
Reject null hypothesis & p_value is : 4.2867740034911244e-13
the syntax : 4
Reject null hypothesis & p_value is : 9.362308387850991e-109
the syntax : 5
Reject null hypothesis & p_value is : 0.0
the syntax : 6
Reject null hypothesis & p_value is : 4.091048215137019e-09
the syntax : 7
Reject null hypothesis & p_value is : 2.985505693743467e-40
the syntax : 8
Reject null hypothesis & p_value is : 0.0
the syntax : 9
Reject null hypothesis & p_value is : 0.04823086724376921
the syntax : 10
Reject null hypothesis & p_value is : 4.094743931635039e-136
the syntax : 11
Fail to reject null hypothesis  & p_value is : 0.3573849943115016


In [7]:
# only salary data Fail to reject null hypothesis
print(np.mean(X[:,11]))
print(st.stdev(X[:,11]))
# THIS IS SAME IN TEST SET SO WE CAN REMOVE ""Salary"" 

117716.4972340008
45461.85399884175


In [8]:
# romoving the salary
X = X[:,0:11]
TD = TD[:,0:11]
print(X)

[[1.0 0.0 0.0 ... 1.0 1.0 0.0]
 [0.0 1.0 0.0 ... 1.0 1.0 0.0]
 [0.0 0.0 1.0 ... 2.0 1.0 0.0]
 ...
 [1.0 0.0 0.0 ... 1.0 1.0 0.0]
 [1.0 0.0 0.0 ... 2.0 1.0 0.0]
 [1.0 0.0 0.0 ... 2.0 1.0 1.0]]


# Spliting Train and Test data

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.06)

# Deep learning

## SCALING

In [10]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
X_train_ANN=ss.fit_transform(X_train)
X_test_ANN=ss.transform(X_test)


## training

In [11]:
ann = tf.keras.models.Sequential()
ann.add(tf.keras.layers.Dense(units=11, activation='relu'))
ann.add(tf.keras.layers.Dense(units=11, activation='relu'))
ann.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))
ann.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy','AUC'])
ann.fit(X_train_ANN, y_train, batch_size = 32, epochs = 100)

Epoch 1/100
[1m441/441[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - AUC: 0.6674 - accuracy: 0.7965 - loss: 0.4737
Epoch 2/100
[1m441/441[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - AUC: 0.8793 - accuracy: 0.8736 - loss: 0.3231
Epoch 3/100
[1m441/441[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - AUC: 0.8973 - accuracy: 0.8827 - loss: 0.2964
Epoch 4/100
[1m441/441[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - AUC: 0.9018 - accuracy: 0.8826 - loss: 0.2931
Epoch 5/100
[1m441/441[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - AUC: 0.9081 - accuracy: 0.8813 - loss: 0.2848
Epoch 6/100
[1m441/441[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - AUC: 0.9052 - accuracy: 0.8827 - loss: 0.2876
Epoch 7/100
[1m441/441[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - AUC: 0.9160 - accuracy: 0.8858 - loss: 0.2767
Epoch 8/100
[1m441/441[0m [32m━━━━━━━━━━━━━━━━━━━━[

<keras.src.callbacks.history.History at 0x29a717f5c40>

## COMPARING WITH TEST SET

In [12]:
y_pred_ANN = ann.predict(X_test_ANN)
y_pred_ANN = (y_pred_ANN > 0.5)
print(np.concatenate((y_pred_ANN.reshape(len(y_pred_ANN),1), y_test.reshape(len(y_test),1)),1))

[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
[[0. 0.]
 [0. 0.]
 [0. 0.]
 ...
 [1. 1.]
 [0. 0.]
 [1. 0.]]


In [13]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred_ANN)
print(cm)
accuracy_score(y_test, y_pred_ANN)

[[668  33]
 [ 64 135]]


0.8922222222222222

# SCAILING ,PREDICTION AND SUBMISSION

### SCAILING

In [14]:
TD=ss.transform(TD)

In [15]:
PRED=ann.predict(TD)

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step


In [16]:
submission = data[['id']]
submission["Exited"]=PRED

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submission["Exited"]=PRED


In [18]:
submission.to_csv("submission.csv",index=False)