In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report


In [None]:
# Mount Google Drive at /content/drive in the Colab runtime; the dataset
# path used when loading the CSV lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Header-less file: these names are assigned to the columns in file order.
columns = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education-num",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
    "native-country",
    "income",
]

# Read the raw file. The regex separator swallows the whitespace that follows
# each comma, which is why the (slower) python parsing engine is requested.
# NOTE(review): no `na_values` is passed, so missing-value markers are NOT
# converted to NaN here -- presumably the file marks them with '?'; verify.
df = pd.read_csv(
    "/content/drive/MyDrive/Colab Notebooks/adult/adult.data",
    names=columns,
    sep=r',\s*',
    engine='python',
)

# --- Data Preprocessing and Feature Selection ---
print("\n--- Original Income Distribution ---")
print(df['income'].value_counts())

# 'fnlwgt' is removed before modelling (a sampling weight rather than a
# predictive feature, per the original author's note).
df = df.drop(columns='fnlwgt')

# Encode the target as 1 for '>50K' and 0 for every other label.
# This is label encoding only; class imbalance is dealt with later by
# oversampling the training split.
df['income'] = df['income'].eq('>50K').astype(int)

# Every remaining object-dtype column is treated as categorical
# ('income' is numeric at this point, so it is not selected).
categorical_cols = df.select_dtypes(include=['object']).columns

# One-hot encode the categoricals; drop_first=True omits one level per
# feature so the dummy columns are not perfectly collinear.
df_processed = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

print("\n--- Processed DataFrame Head (after one-hot encoding) ---")
print(df_processed.head())
print("\n--- Processed DataFrame Info ---")
df_processed.info()
print("\n--- Income Distribution After Conversion ---")
print(df_processed['income'].value_counts())

# --- Data Splitting, Scaling, and Oversampling ---

# Ensure imblearn is installed: pip install imbalanced-learn
# If you encounter a ModuleNotFoundError for imblearn, run the above command in a new cell.

def scale_dataset(dataframe, oversample=False, scaler=None):
  """Standardize the features of `dataframe` and optionally balance its classes.

  The frame must carry the target in an 'income' column; every other column
  is treated as a feature.

  Args:
    dataframe: pandas DataFrame holding the feature columns plus 'income'.
    oversample: when True, RandomOverSampler(random_state=42) is applied to
      the scaled features and the target. Intended for the training split only.
    scaler: optional, already-fitted transformer exposing `transform()`.
      When None (the default) a new StandardScaler is fitted on this frame's
      features. Pass the scaler fitted on the training split to put
      validation/test data on the training scale.

  Returns:
    Tuple `(X, y)`: the scaled (and, if requested, oversampled) features and
    the matching target values.
  """
  # Use the frame that was passed in. The previous body read the notebook-level
  # globals `X` and `y`, so every call silently processed the full dataset.
  features = dataframe.drop(columns='income')
  target = dataframe['income']

  if scaler is None:
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(features)
  else:
    # A supplied scaler is assumed to be fitted already; only apply it.
    X_scaled = scaler.transform(features)

  # Apply RandomOverSampler only if oversample is True
  if oversample:
    ros = RandomOverSampler(random_state=42)  # fixed seed for reproducibility
    X_resampled, y_resampled = ros.fit_resample(X_scaled, target)
    return X_resampled, y_resampled

  return X_scaled, target

# Separate the feature matrix from the binary target.
X = df_processed.drop(columns='income')
y = df_processed['income']

# Hold out 20% of the rows as the test set; stratify=y keeps the class
# proportions of `income` the same in both parts.
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Carve a validation set out of the remaining 80%
# (0.25 of 0.8 = 0.2 of the full data), again stratified.
X_train_raw, X_val_raw, y_train_raw, y_val_raw = train_test_split(
    X_train_raw,
    y_train_raw,
    test_size=0.25,
    random_state=42,
    stratify=y_train_raw,
)

# scale_dataset expects features and target in a single frame, so the target
# is re-attached as an 'income' column. Only the training split is oversampled.
# NOTE(review): called this way, each split is standardized by its own
# freshly fitted scaler rather than by the training-split statistics.
X_train, y_train = scale_dataset(
    X_train_raw.assign(income=y_train_raw), oversample=True
)
X_val, y_val = scale_dataset(
    X_val_raw.assign(income=y_val_raw), oversample=False
)
X_test, y_test = scale_dataset(
    X_test_raw.assign(income=y_test_raw), oversample=False
)

print("\n--- Income Distribution in Oversampled Training Set ---")
unique_train, counts_train = np.unique(y_train, return_counts=True)
print(dict(zip(unique_train, counts_train)))



--- Original Income Distribution ---
income
<=50K    24720
>50K      7841
Name: count, dtype: int64

--- Processed DataFrame Head (after one-hot encoding) ---
   age  education-num  capital-gain  capital-loss  hours-per-week  income  \
0   39             13          2174             0              40       0   
1   50             13             0             0              13       0   
2   38              9             0             0              40       0   
3   53              7             0             0              40       0   
4   28             13             0             0              40       0   

   workclass_Federal-gov  workclass_Local-gov  workclass_Never-worked  \
0                  False                False                   False   
1                  False                False                   False   
2                  False                False                   False   
3                  False                False                   False   
4           

# **KNN**

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

In [None]:
# Validation accuracy for each candidate k, stored in order (index 0 -> k=1).
accuracies = []

# Fit one classifier per k in 1..20 on the training split and score it on
# the validation split (the test split is not touched during tuning).
for k in range(1, 21):
    knn_tuned = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    y_pred_k = knn_tuned.predict(X_val)
    current_accuracy = accuracy_score(y_val, y_pred_k)
    accuracies.append(current_accuracy)

# Pick the k with the highest validation accuracy; max() returns the first
# maximum, so ties resolve to the smallest k.
best_k, best_accuracy = max(
    enumerate(accuracies, start=1), key=lambda scored: scored[1]
)

print(f"Best K found using Validation Set: {best_k} with Validation Accuracy: {best_accuracy:.4f}")

# Refit with the selected k on the training split and report test metrics.
knn_tuned_best = KNeighborsClassifier(n_neighbors=best_k)
knn_tuned_best.fit(X_train, y_train)
y_pred_knn_tuned_test = knn_tuned_best.predict(X_test)
print(f"Tuned KNN Test Accuracy (k={best_k}): {accuracy_score(y_test, y_pred_knn_tuned_test):.4f}")
print(f"Tuned KNN Test Classification Report (k={best_k}):\n", classification_report(y_test, y_pred_knn_tuned_test))

Best K found using Validation Set: 2 with Validation Accuracy: 0.7964
Tuned KNN Test Accuracy (k=2): 0.7967
Tuned KNN Test Classification Report (k=2):
               precision    recall  f1-score   support

           0       0.86      0.88      0.87      4945
           1       0.58      0.54      0.56      1568

    accuracy                           0.80      6513
   macro avg       0.72      0.71      0.71      6513
weighted avg       0.79      0.80      0.79      6513



# **Logistic Regression**

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression: fit on the training split, report on the test split.
print("\n--- Improved Model: Logistic Regression ---")

# 'liblinear' supports binary problems with L1/L2 regularization and suits
# small datasets; random_state is fixed for reproducibility.
log_reg = LogisticRegression(
    solver='liblinear',
    max_iter=1000,
    random_state=42,
)
log_reg.fit(X_train, y_train)

y_pred_log_reg = log_reg.predict(X_test)
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_log_reg))
print("Logistic Regression Classification Report:\n", classification_report(y_test, y_pred_log_reg))


--- Improved Model: Logistic Regression ---
Logistic Regression Accuracy: 0.8086903116843237
Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.80      0.86      4945
           1       0.57      0.85      0.68      1568

    accuracy                           0.81      6513
   macro avg       0.76      0.82      0.77      6513
weighted avg       0.85      0.81      0.82      6513



In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
# Gaussian Naive Bayes baseline with default settings: fit on the training
# split, then report accuracy and per-class metrics on the test split.
print("\n--- Baseline Gaussian Naive Bayes Classifier ---")
gnb = GaussianNB().fit(X_train, y_train)

y_pred_gnb = gnb.predict(X_test)
print("Gaussian Naive Bayes Accuracy:", accuracy_score(y_test, y_pred_gnb))
print("Gaussian Naive Bayes Classification Report:\n", classification_report(y_test, y_pred_gnb))


--- Baseline Gaussian Naive Bayes Classifier ---
Gaussian Naive Bayes Accuracy: 0.7592507293106096
Gaussian Naive Bayes Classification Report:
               precision    recall  f1-score   support

           0       0.76      1.00      0.86      4945
           1       0.00      0.00      0.00      1568

    accuracy                           0.76      6513
   macro avg       0.38      0.50      0.43      6513
weighted avg       0.58      0.76      0.66      6513



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
from sklearn.svm import SVC

# Support vector classifier with scikit-learn's default settings: fit on the
# training split, then evaluate on the test split.
sk_model = SVC()
sk_model.fit(X_train, y_train)
y_pred_sk = sk_model.predict(X_test)

# Report accuracy and the per-class report separately, like the other model
# cells; previously the report was printed under the "SVC Accuracy:" label.
print("SVC Accuracy:", accuracy_score(y_test, y_pred_sk))
print("SVC Classification Report:\n", classification_report(y_test, y_pred_sk))

SVC Accuracy:               precision    recall  f1-score   support

           0       0.94      0.79      0.86      4945
           1       0.57      0.85      0.68      1568

    accuracy                           0.81      6513
   macro avg       0.76      0.82      0.77      6513
weighted avg       0.85      0.81      0.82      6513



# **Neural Network**

In [None]:
import tensorflow as tf

In [None]:
def plot_history(history):
  """Plot training vs. validation loss and accuracy curves side by side.

  Args:
    history: object whose `.history` mapping holds per-epoch sequences under
      the keys 'loss', 'val_loss', 'accuracy' and 'val_accuracy' (as
      returned by the `fit` call in this notebook).
  """
  fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))

  # Left panel: loss per epoch.
  ax1.plot(history.history['loss'], label='loss')
  ax1.plot(history.history['val_loss'], label='val_loss')
  ax1.set_xlabel('Epoch')
  ax1.set_ylabel('Binary crossentropy')
  ax1.grid(True)
  ax1.legend()  # without a legend the label= values above are never shown

  # Right panel: accuracy per epoch.
  ax2.plot(history.history['accuracy'], label='accuracy')
  ax2.plot(history.history['val_accuracy'], label='val_accuracy')
  ax2.set_xlabel('Epoch')
  ax2.set_ylabel('Accuracy')
  ax2.grid(True)
  ax2.legend()

  plt.show()
  # Release the figure: this function is called once per grid-search
  # configuration, and unclosed figures accumulate in memory.
  plt.close(fig)

In [None]:
def train_model(X_train, y_train, num_nodes, dropout_prob, lr, batch_size, epochs, validation_data=None):
  """Build, compile and fit a small binary-classification network.

  Architecture: two ReLU hidden layers of `num_nodes` units, each followed by
  dropout, then a single sigmoid output unit.

  Args:
    X_train: training features.
    y_train: binary training labels.
    num_nodes: number of units in each hidden layer.
    dropout_prob: dropout rate applied after each hidden layer (0 disables it).
    lr: learning rate for the Adam optimizer.
    batch_size: mini-batch size passed to `fit`.
    epochs: number of training epochs.
    validation_data: optional `(X_val, y_val)` tuple used for the per-epoch
      validation metrics. When None (default), `validation_split=0.2` is
      used, which holds out the last 20% of the rows as given.
      NOTE(review): if X_train was oversampled upstream, those trailing rows
      may be resampled duplicates -- confirm, or pass validation_data.

  Returns:
    Tuple `(nn_model, history)`: the fitted model and the object returned by
    `fit`.
  """
  # The hyperparameters are now actually used. Previously the layer sizes
  # (128/64), the absence of dropout and the learning rate (0.0001) were
  # hard-coded, so every grid-search configuration trained the same network.
  nn_model = tf.keras.Sequential([
      tf.keras.layers.Dense(num_nodes, activation='relu'),
      tf.keras.layers.Dropout(dropout_prob),
      tf.keras.layers.Dense(num_nodes, activation='relu'),
      tf.keras.layers.Dropout(dropout_prob),
      tf.keras.layers.Dense(1, activation='sigmoid')
  ])
  nn_model.compile(
      optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
      loss='binary_crossentropy',
      metrics=['accuracy'],
  )

  fit_kwargs = dict(epochs=epochs, batch_size=batch_size, verbose=0)
  if validation_data is not None:
    fit_kwargs['validation_data'] = validation_data
  else:
    fit_kwargs['validation_split'] = 0.2

  history = nn_model.fit(X_train, y_train, **fit_kwargs)

  return nn_model, history

Hyperparameter Tuning

In [None]:
# Exhaustive grid search (3 * 2 * 3 * 3 = 54 runs of `epochs` epochs each).
# The model with the lowest loss on the validation split is kept.
least_val_loss = float('inf')
least_loss_model = None
epochs=100
for num_nodes in (16, 32, 64):
  for dropout_prob in (0, 0.2):
    for lr in (0.01, 0.005, 0.001):
      for batch_size in (32, 64, 128):
        print(f"{num_nodes} nodes, dropout {dropout_prob}, lr {lr}, batch size {batch_size}")
        model, history = train_model(X_train, y_train, num_nodes, dropout_prob, lr, batch_size, epochs)
        plot_history(history)

        # The first entry returned by evaluate() is used as the loss.
        val_loss = model.evaluate(X_val, y_val)[0]
        if not (val_loss < least_val_loss):
          continue  # not an improvement; keep the current best

        least_val_loss, least_loss_model = val_loss, model

Output hidden; open in https://colab.research.google.com to view.

In [None]:
# Predict with the selected model, threshold the outputs at 0.5 to obtain
# 0/1 labels, and flatten the result to a 1-D array.
y_pred = (least_loss_model.predict(X_test) > 0.5).astype(int).reshape(-1)

[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step


In [None]:
# Per-class precision / recall / F1 for the thresholded predictions on the test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.86      0.89      4945
           1       0.63      0.76      0.69      1568

    accuracy                           0.83      6513
   macro avg       0.77      0.81      0.79      6513
weighted avg       0.85      0.83      0.84      6513

