In [83]:
import  numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [84]:
columns = ["age", "workclass", "fnlwgt", "education","education_num","marital status","occupation","relationship","race","sex","capital_gain","capital_loss","hrs_per_week","native_country","income"]
df = pd.read_csv("adult.data", header=None, names=columns)
df.to_csv("adult.csv", index=False)

In [85]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital status,occupation,relationship,race,sex,capital_gain,capital_loss,hrs_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [86]:
df = df.replace(" ?", pd.NA)

In [87]:
cols = df.select_dtypes(include="object").columns
print(cols)

Index(['workclass', 'education', 'marital status', 'occupation',
       'relationship', 'race', 'sex', 'native_country', 'income'],
      dtype='object')


In [88]:
df = pd.get_dummies(df,columns=cols,drop_first=True)
df = df.astype(int)
df

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hrs_per_week,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,workclass_ Self-emp-inc,...,native_country_ Puerto-Rico,native_country_ Scotland,native_country_ South,native_country_ Taiwan,native_country_ Thailand,native_country_ Trinadad&Tobago,native_country_ United-States,native_country_ Vietnam,native_country_ Yugoslavia,income_ >50K
0,39,77516,13,2174,0,40,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,50,83311,13,0,0,13,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,38,215646,9,0,0,40,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
3,53,234721,7,0,0,40,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
4,28,338409,13,0,0,40,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,257302,12,0,0,38,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
32557,40,154374,9,0,0,40,0,0,1,0,...,0,0,0,0,0,0,1,0,0,1
32558,58,151910,9,0,0,40,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
32559,22,201490,9,0,0,20,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0


In [89]:
df.shape

(32561, 98)

In [90]:
X_train = df.drop(columns=["income_ >50K"])
y_train=df["income_ >50K"]

In [91]:
X_train.shape


(32561, 97)

In [92]:


df2 = pd.read_csv("adult.test", header=None, names=columns,skiprows=1)
df2.to_csv("adult_test.csv", index=False)
df2 = df2.replace(" ?", pd.NA)

cols2= df2.select_dtypes(include="object").columns
print(cols2)

df2 = pd.get_dummies(df2,columns=cols2,drop_first=True)
df2.astype(int)
df2.head()

Index(['workclass', 'education', 'marital status', 'occupation',
       'relationship', 'race', 'sex', 'native_country', 'income'],
      dtype='object')


Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hrs_per_week,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,workclass_ Self-emp-inc,...,native_country_ Puerto-Rico,native_country_ Scotland,native_country_ South,native_country_ Taiwan,native_country_ Thailand,native_country_ Trinadad&Tobago,native_country_ United-States,native_country_ Vietnam,native_country_ Yugoslavia,income_ >50K.
0,25,226802,7,0,0,40,False,False,True,False,...,False,False,False,False,False,False,True,False,False,False
1,38,89814,9,0,0,50,False,False,True,False,...,False,False,False,False,False,False,True,False,False,False
2,28,336951,12,0,0,40,True,False,False,False,...,False,False,False,False,False,False,True,False,False,True
3,44,160323,10,7688,0,40,False,False,True,False,...,False,False,False,False,False,False,True,False,False,True
4,18,103497,10,0,0,30,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False


In [93]:
X_test=df2.drop(columns=["income_ >50K."])
y_test=df2["income_ >50K."]
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)
print(X_train.shape)
print(X_test.shape)

(32561, 97)
(16281, 97)


In [94]:
n_input = X_train.shape[1]
n_hidden1 = 64
n_hidden2 = 32
n_output = 1
np.random.seed(42)

w1 = np.random.randn(n_hidden1, n_input) * np.sqrt(2 / n_input)
b1 = np.zeros((n_hidden1, 1))

w2 = np.random.randn(n_hidden2, n_hidden1) * np.sqrt(2 / n_hidden1)
b2 = np.zeros((n_hidden2, 1))
w3 = np.random.randn(n_output, n_hidden2) * np.sqrt(2 / n_hidden2)
b3 = np.zeros((n_output, 1))

In [98]:
def solution(X, y, w1, b1, w2, b2, w3, b3):
    epochs = 100000
    lr = 0.01
    eps = 1e-8

    X = X.values.T
    m = X.shape[1]
    y = y.values.reshape(1, m)

    for i in range(epochs):


        idx = np.random.randint(0, m)
        X_i = X[:, idx:idx+1]
        y_i = y[:, idx:idx+1]


        z1 = np.dot(w1, X_i) + b1
        a1 = np.maximum(0, z1)

        z2 = np.dot(w2, a1) + b2
        a2 = np.maximum(0, z2)

        z3 = np.dot(w3, a2) + b3
        y_hat = 1 / (1 + np.exp(-np.clip(z3, -500, 500)))


        dz3 = y_hat - y_i
        dw3 = np.dot(dz3, a2.T)
        db3 = dz3

        da2 = np.dot(w3.T, dz3)
        dz2 = da2.copy()
        dz2[z2 <= 0] = 0
        dw2 = np.dot(dz2, a1.T)
        db2 = dz2

        da1 = np.dot(w2.T, dz2)
        dz1 = da1.copy()
        dz1[z1 <= 0] = 0
        dw1 = np.dot(dz1, X_i.T)
        db1 = dz1


        w3 -= lr * dw3
        b3 -= lr * db3
        w2 -= lr * dw2
        b2 -= lr * db2
        w1 -= lr * dw1
        b1 -= lr * db1


        if i % 100 == 0:
            Z1_all = np.dot(w1, X) + b1
            A1_all = np.maximum(0, Z1_all)

            Z2_all = np.dot(w2, A1_all) + b2
            A2_all = np.maximum(0, Z2_all)

            Z3_all = np.dot(w3, A2_all) + b3
            Y_hat_all = 1 / (1 + np.exp(-np.clip(Z3_all, -500, 500)))


            loss = -np.mean(y * np.log(Y_hat_all + eps) + (1 - y) * np.log(1 - Y_hat_all + eps))
            print(f"Iteration {i}, Dataset Loss: {loss:.4f}")
    return w1, b1, w2, b2, w3, b3


In [99]:
w1, b1, w2, b2, w3, b3 = solution(X_train,y_train,w1,b1,w2,b2,w3,b3)

Iteration 0, Dataset Loss: 0.5523
Iteration 100, Dataset Loss: 0.5527
Iteration 200, Dataset Loss: 0.5520
Iteration 300, Dataset Loss: 0.5520
Iteration 400, Dataset Loss: 0.5522
Iteration 500, Dataset Loss: 0.5521
Iteration 600, Dataset Loss: 0.5529
Iteration 700, Dataset Loss: 0.5530
Iteration 800, Dataset Loss: 0.5522
Iteration 900, Dataset Loss: 0.5520
Iteration 1000, Dataset Loss: 0.5521
Iteration 1100, Dataset Loss: 0.5522
Iteration 1200, Dataset Loss: 0.5521
Iteration 1300, Dataset Loss: 0.5522
Iteration 1400, Dataset Loss: 0.5522
Iteration 1500, Dataset Loss: 0.5521
Iteration 1600, Dataset Loss: 0.5521
Iteration 1700, Dataset Loss: 0.5520
Iteration 1800, Dataset Loss: 0.5520
Iteration 1900, Dataset Loss: 0.5524
Iteration 2000, Dataset Loss: 0.5525
Iteration 2100, Dataset Loss: 0.5521
Iteration 2200, Dataset Loss: 0.5521
Iteration 2300, Dataset Loss: 0.5524
Iteration 2400, Dataset Loss: 0.5524
Iteration 2500, Dataset Loss: 0.5522
Iteration 2600, Dataset Loss: 0.5521
Iteration 270

In [100]:

X_arr = np.array(X_test)
y_arr = np.array(y_test).reshape(1, -1)
z1_test = np.dot(w1, X_arr.T) + b1
a1_test = np.maximum(0, z1_test)
z2_test = np.dot(w2, a1_test) + b2
a2_test = np.maximum(0, z2_test)
z3_test = np.dot(w3, a2_test) + b3
z3_test = z3_test.astype(np.float64)
y_hat_test = 1 / (1 + np.exp(-np.clip(z3_test, -500, 500)))

y_pred = (y_hat_test >= 0.5).astype(int)
accuracy = np.mean(y_pred == y_arr) * 100
print("Case1: Raw data Accuracy-", accuracy)

Case1: Raw data Accuracy- 76.3773723972729


Case_2


In [101]:
X_min = X_train.min(axis=0)
X_max = X_train.max(axis=0)
X_testing = pd.DataFrame(X_test, columns=X_train.columns)
X_train_scaled = (X_train - X_min) / (X_max - X_min + 1e-8)
X_test_scaled  = (X_testing - X_min) / (X_max - X_min + 1e-8)

In [102]:
n_input = X_train_scaled.shape[1]
n_hidden1 = 64
n_hidden2 = 32
n_output = 1
np.random.seed(42)

w1s = np.random.randn(n_hidden1, n_input) * np.sqrt(2 / n_input)
b1s = np.zeros((n_hidden1, 1))

w2s = np.random.randn(n_hidden2, n_hidden1) * np.sqrt(2 / n_hidden1)
b2s = np.zeros((n_hidden2, 1))
w3s = np.random.randn(n_output, n_hidden2) * np.sqrt(2 / n_hidden2)
b3s = np.zeros((n_output, 1))

In [103]:

w1s,b1s,w2s,b2s,w3s,b3s=solution(X_train_scaled,y_train,w1s,b1s,w2s,b2s,w3s,b3s)


Iteration 0, Dataset Loss: 0.6049
Iteration 100, Dataset Loss: 0.5487
Iteration 200, Dataset Loss: 0.5058
Iteration 300, Dataset Loss: 0.4688
Iteration 400, Dataset Loss: 0.4539
Iteration 500, Dataset Loss: 0.4294
Iteration 600, Dataset Loss: 0.4373
Iteration 700, Dataset Loss: 0.4099
Iteration 800, Dataset Loss: 0.4074
Iteration 900, Dataset Loss: 0.4376
Iteration 1000, Dataset Loss: 0.4199
Iteration 1100, Dataset Loss: 0.4042
Iteration 1200, Dataset Loss: 0.3938
Iteration 1300, Dataset Loss: 0.3904
Iteration 1400, Dataset Loss: 0.3937
Iteration 1500, Dataset Loss: 0.4004
Iteration 1600, Dataset Loss: 0.4086
Iteration 1700, Dataset Loss: 0.3958
Iteration 1800, Dataset Loss: 0.3879
Iteration 1900, Dataset Loss: 0.4041
Iteration 2000, Dataset Loss: 0.4079
Iteration 2100, Dataset Loss: 0.4137
Iteration 2200, Dataset Loss: 0.3842
Iteration 2300, Dataset Loss: 0.3853
Iteration 2400, Dataset Loss: 0.3826
Iteration 2500, Dataset Loss: 0.3901
Iteration 2600, Dataset Loss: 0.3824
Iteration 270

In [104]:

X_arr = np.array(X_test_scaled)
y_arr = np.array(y_test).reshape(1, -1)
z1_test = np.dot(w1s, X_arr.T) + b1s
a1_test = np.maximum(0, z1_test)
z2_test = np.dot(w2s, a1_test) + b2s
a2_test = np.maximum(0, z2_test)
z3_test = np.dot(w3s, a2_test) + b3s
z3_test = z3_test.astype(np.float64)
y_hat_test = 1 / (1 + np.exp(-np.clip(z3_test, -500, 500)))

y_pred = (y_hat_test >= 0.5).astype(int)
accuracy_sc = np.mean(y_pred == y_arr) * 100
print("Case1: Scaled data Accuracy-", accuracy_sc)

Case1: Scaled data Accuracy- 83.8523432221608
