In [1]:
import  numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
columns = ["age", "workclass", "fnlwgt", "education","education_num","marital status","occupation","relationship","race","sex","capital_gain","capital_loss","hrs_per_week","native_country","income"]
df = pd.read_csv("adult.data", header=None, names=columns)
df.to_csv("adult.csv", index=False)

In [3]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital status,occupation,relationship,race,sex,capital_gain,capital_loss,hrs_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
df = df.replace(" ?", pd.NA)

In [5]:
cols = df.select_dtypes(include="object").columns
print(cols)

Index(['workclass', 'education', 'marital status', 'occupation',
       'relationship', 'race', 'sex', 'native_country', 'income'],
      dtype='object')


In [6]:
df = pd.get_dummies(df,columns=cols,drop_first=True)
df = df.astype(int)
df

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hrs_per_week,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,workclass_ Self-emp-inc,...,native_country_ Puerto-Rico,native_country_ Scotland,native_country_ South,native_country_ Taiwan,native_country_ Thailand,native_country_ Trinadad&Tobago,native_country_ United-States,native_country_ Vietnam,native_country_ Yugoslavia,income_ >50K
0,39,77516,13,2174,0,40,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,50,83311,13,0,0,13,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,38,215646,9,0,0,40,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
3,53,234721,7,0,0,40,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
4,28,338409,13,0,0,40,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,257302,12,0,0,38,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
32557,40,154374,9,0,0,40,0,0,1,0,...,0,0,0,0,0,0,1,0,0,1
32558,58,151910,9,0,0,40,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
32559,22,201490,9,0,0,20,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0


In [7]:
df.shape

(32561, 98)

In [8]:
X_train = df.drop(columns=["income_ >50K"])
y_train=df["income_ >50K"]

In [9]:
X_train.shape


(32561, 97)

In [10]:


df2 = pd.read_csv("adult.test", header=None, names=columns,skiprows=1)
df2.to_csv("adult_test.csv", index=False)
df2 = df2.replace(" ?", pd.NA)

cols2= df2.select_dtypes(include="object").columns
print(cols2)

df2 = pd.get_dummies(df2,columns=cols2,drop_first=True)
df2.astype(int)
df2.head()

Index(['workclass', 'education', 'marital status', 'occupation',
       'relationship', 'race', 'sex', 'native_country', 'income'],
      dtype='object')


Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hrs_per_week,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,workclass_ Self-emp-inc,...,native_country_ Puerto-Rico,native_country_ Scotland,native_country_ South,native_country_ Taiwan,native_country_ Thailand,native_country_ Trinadad&Tobago,native_country_ United-States,native_country_ Vietnam,native_country_ Yugoslavia,income_ >50K.
0,25,226802,7,0,0,40,False,False,True,False,...,False,False,False,False,False,False,True,False,False,False
1,38,89814,9,0,0,50,False,False,True,False,...,False,False,False,False,False,False,True,False,False,False
2,28,336951,12,0,0,40,True,False,False,False,...,False,False,False,False,False,False,True,False,False,True
3,44,160323,10,7688,0,40,False,False,True,False,...,False,False,False,False,False,False,True,False,False,True
4,18,103497,10,0,0,30,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False


In [11]:
X_test=df2.drop(columns=["income_ >50K."])
y_test=df2["income_ >50K."]
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)
print(X_train.shape)
print(X_test.shape)

(32561, 97)
(16281, 97)


In [12]:
n_input = X_train.shape[1]
n_hidden1 = 64
n_hidden2 = 32
n_output = 1
np.random.seed(42)
# He initialization
w1 = np.random.randn(n_hidden1, n_input) * np.sqrt(2 / n_input)
b1 = np.zeros((n_hidden1, 1))

w2 = np.random.randn(n_hidden2, n_hidden1) * np.sqrt(2 / n_hidden1)
b2 = np.zeros((n_hidden2, 1))
w3 = np.random.randn(n_output, n_hidden2) * np.sqrt(2 / n_hidden2)
b3 = np.zeros((n_output, 1))

In [13]:
def solution(X, y, w1, b1, w2, b2, w3, b3, batch_size=64):
    epochs = 1000
    eps = 1e-8
    lr = 0.01
    X = X.values.T
    m = X.shape[1]
    y = y.values.reshape(1, m)

    for i in range(epochs):
        # 1. RESET epoch_loss at the start of every epoch
        epoch_loss = 0
        num_batches = 0

        perm = np.random.permutation(m)
        X_shuf, y_shuf = X[:, perm], y[:, perm]

        for j in range(0, m, batch_size):
            X_batch = X_shuf[:, j:j+batch_size]
            y_batch = y_shuf[:, j:j+batch_size]
            curr_m = X_batch.shape[1]

            # --- FORWARD PASS ---
            z1 = np.dot(w1, X_batch) + b1
            a1 = np.maximum(0, z1)
            z2 = np.dot(w2, a1) + b2
            a2 = np.maximum(0, z2)
            z3 = np.dot(w3, a2) + b3
            y_hat = 1 / (1 + np.exp(-np.clip(z3, -500, 500)))

            # --- BACKPROPAGATION ---
            dz3 = y_hat - y_batch
            dw3 = (1/curr_m) * np.dot(dz3, a2.T)
            db3 = (1/curr_m) * np.sum(dz3, axis=1, keepdims=True)

            da2 = np.dot(w3.T, dz3)
            dz2 = np.array(da2, copy=True)
            dz2[z2 <= 0] = 0
            dw2 = (1/curr_m) * np.dot(dz2, a1.T)
            db2 = (1/curr_m) * np.sum(dz2, axis=1, keepdims=True)

            da1 = np.dot(w2.T, dz2)
            dz1 = np.array(da1, copy=True)
            dz1[z1 <= 0] = 0
            dw1 = (1/curr_m) * np.dot(dz1, X_batch.T)
            db1 = (1/curr_m) * np.sum(dz1, axis=1, keepdims=True)

            # --- OPTIMIZER ---
            w3 -= lr * dw3
            b3 -= lr * db3
            w2 -= lr * dw2
            b2 -= lr * db2
            w1 -= lr * dw1
            b1 -= lr * db1

            # 2. Accumulate loss INSIDE the batch loop
            batch_loss = -np.mean(y_batch * np.log(y_hat + eps) + (1 - y_batch) * np.log(1 - y_hat + eps))
            epoch_loss += batch_loss
            num_batches += 1

        # 3. Print the average loss for THIS epoch
        if i % 100 == 0:
            avg_loss = epoch_loss / num_batches
            print(f"Epoch {i}, Loss: {avg_loss:.4f}")

    return w1, b1, w2, b2, w3, b3

In [14]:
w1, b1, w2, b2, w3, b3 = solution(X_train,y_train,w1,b1,w2,b2,w3,b3)

Epoch 0, Loss: 0.7183
Epoch 100, Loss: 0.5520
Epoch 200, Loss: 0.5520
Epoch 300, Loss: 0.5521
Epoch 400, Loss: 0.5519
Epoch 500, Loss: 0.5520
Epoch 600, Loss: 0.5521
Epoch 700, Loss: 0.5520
Epoch 800, Loss: 0.5520
Epoch 900, Loss: 0.5520


In [15]:

X_arr = np.array(X_test)
y_arr = np.array(y_test).reshape(1, -1)
z1_test = np.dot(w1, X_arr.T) + b1
a1_test = np.maximum(0, z1_test)
z2_test = np.dot(w2, a1_test) + b2
a2_test = np.maximum(0, z2_test)
z3_test = np.dot(w3, a2_test) + b3
z3_test = z3_test.astype(np.float64)
y_hat_test = 1 / (1 + np.exp(-np.clip(z3_test, -500, 500)))

y_pred = (y_hat_test >= 0.5).astype(int)
accuracy = np.mean(y_pred == y_arr) * 100
print("Case1: Raw data Accuracy-", accuracy)

Case1: Raw data Accuracy- 76.3773723972729


Case_2


In [16]:
X_min = X_train.min(axis=0)
X_max = X_train.max(axis=0)
X_testing = pd.DataFrame(X_test, columns=X_train.columns)
X_train_scaled = (X_train - X_min) / (X_max - X_min + 1e-8)
X_test_scaled  = (X_testing - X_min) / (X_max - X_min + 1e-8)

In [17]:
n_input = X_train_scaled.shape[1]
n_hidden1 = 64
n_hidden2 = 32
n_output = 1
np.random.seed(42)
# He initialization
w1s = np.random.randn(n_hidden1, n_input) * np.sqrt(2 / n_input)
b1s = np.zeros((n_hidden1, 1))

w2s = np.random.randn(n_hidden2, n_hidden1) * np.sqrt(2 / n_hidden1)
b2s = np.zeros((n_hidden2, 1))
w3s = np.random.randn(n_output, n_hidden2) * np.sqrt(2 / n_hidden2)
b3s = np.zeros((n_output, 1))

In [18]:

w1s,b1s,w2s,b2s,w3s,b3s=solution(X_train_scaled,y_train,w1s,b1s,w2s,b2s,w3s,b3s)


Epoch 0, Loss: 0.4705
Epoch 100, Loss: 0.3050
Epoch 200, Loss: 0.2918
Epoch 300, Loss: 0.2839
Epoch 400, Loss: 0.2769
Epoch 500, Loss: 0.2723
Epoch 600, Loss: 0.2680
Epoch 700, Loss: 0.2632
Epoch 800, Loss: 0.2603
Epoch 900, Loss: 0.2575


In [19]:

X_arr = np.array(X_test_scaled)
y_arr = np.array(y_test).reshape(1, -1)
z1_test = np.dot(w1s, X_arr.T) + b1s
a1_test = np.maximum(0, z1_test)
z2_test = np.dot(w2s, a1_test) + b2s
a2_test = np.maximum(0, z2_test)
z3_test = np.dot(w3s, a2_test) + b3s
z3_test = z3_test.astype(np.float64)
y_hat_test = 1 / (1 + np.exp(-np.clip(z3_test, -500, 500)))

y_pred = (y_hat_test >= 0.5).astype(int)
accuracy_sc = np.mean(y_pred == y_arr) * 100
print("Case1: Scaled data Accuracy-", accuracy_sc)

Case1: Scaled data Accuracy- 83.90762238191756
