In [1]:
import pandas as pd

df = pd.read_csv('HW2_dataset.csv')

print(df.head())
print(df.info())
print(df.describe())
print(df.isnull().sum())


   age  workclass  fnlwgt     education  educational-num      marital-status  \
0   25    Private  226802          11th                7       Never-married   
1   38    Private   89814       HS-grad                9  Married-civ-spouse   
2   28  Local-gov  336951    Assoc-acdm               12  Married-civ-spouse   
3   44    Private  160323  Some-college               10  Married-civ-spouse   
4   18          ?  103497  Some-college               10       Never-married   

          occupation relationship   race  gender  capital-gain  capital-loss  \
0  Machine-op-inspct    Own-child  Black    Male             0             0   
1    Farming-fishing      Husband  White    Male             0             0   
2    Protective-serv      Husband  White    Male             0             0   
3  Machine-op-inspct      Husband  Black    Male          7688             0   
4                  ?    Own-child  White  Female             0             0   

   hours-per-week native-country incom

In [2]:
df.replace('?', pd.NA, inplace=True)

for column in ['workclass', 'occupation']:
    mode_value = df[column].mode()[0]
    df[column].fillna(mode_value, inplace=True)

print(df['workclass'].value_counts())
print(df['occupation'].value_counts())


workclass
Private             36705
Self-emp-not-inc     3862
Local-gov            3136
State-gov            1981
Self-emp-inc         1695
Federal-gov          1432
Without-pay            21
Never-worked           10
Name: count, dtype: int64
occupation
Prof-specialty       8981
Craft-repair         6112
Exec-managerial      6086
Adm-clerical         5611
Sales                5504
Other-service        4923
Machine-op-inspct    3022
Transport-moving     2355
Handlers-cleaners    2072
Farming-fishing      1490
Tech-support         1446
Protective-serv       983
Priv-house-serv       242
Armed-Forces           15
Name: count, dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(mode_value, inplace=True)


In [4]:
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)

val_df, test_df = train_test_split(test_df, test_size=0.5, random_state=42)
df.replace('?', pd.NA, inplace=True)

for column in ['workclass', 'occupation']:
    mode_value = df[column].mode()[0]
    df[column] = df[column].fillna(mode_value)


In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA

df = pd.read_csv('HW2_dataset.csv')

categorical_cols = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country']  # update with your actual columns
numerical_cols = [col for col in df.columns if col not in categorical_cols + ['income']]

df_encoded = pd.get_dummies(df, columns=categorical_cols)

train_df, temp_df = train_test_split(df_encoded, test_size=0.3, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

scaler = StandardScaler()
train_scaled = scaler.fit_transform(train_df.drop('income', axis=1))
val_scaled = scaler.transform(val_df.drop('income', axis=1))
test_scaled = scaler.transform(test_df.drop('income', axis=1))

pca = PCA(n_components=0.95) 
train_pca = pca.fit_transform(train_scaled)
val_pca = pca.transform(val_scaled)
test_pca = pca.transform(test_scaled)

print("Original feature count:", train_scaled.shape[1])
print("Reduced feature count:", train_pca.shape[1])


Original feature count: 108
Reduced feature count: 88


In [7]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score
knn = KNeighborsClassifier(n_neighbors=5)

knn.fit(train_pca, train_df['income'])

val_predictions = knn.predict(val_pca)
print("Validation Accuracy:", accuracy_score(val_df['income'], val_predictions))

test_predictions = knn.predict(test_pca)
print("Test Accuracy:", accuracy_score(test_df['income'], test_predictions))
print(classification_report(test_df['income'], test_predictions))


Validation Accuracy: 0.8281463281463282
Test Accuracy: 0.829261635048451
              precision    recall  f1-score   support

       <=50K       0.87      0.91      0.89      5591
        >50K       0.66      0.58      0.62      1736

    accuracy                           0.83      7327
   macro avg       0.77      0.74      0.75      7327
weighted avg       0.82      0.83      0.83      7327



In [8]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'n_neighbors': range(1, 16), 
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}
knn = KNeighborsClassifier()

grid_search = GridSearchCV(knn, param_grid, cv=5, verbose=1, scoring='accuracy')  

grid_search.fit(train_pca, train_df['income'])
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation accuracy: {:.2f}".format(grid_search.best_score_))

best_knn = grid_search.best_estimator_
val_predictions = best_knn.predict(val_pca)
print("Validation Accuracy with Best Parameters:", accuracy_score(val_df['income'], val_predictions))

test_predictions = best_knn.predict(test_pca)
print("Test Accuracy with Best Parameters:", accuracy_score(test_df['income'], test_predictions))
print(classification_report(test_df['income'], test_predictions))


Fitting 5 folds for each of 60 candidates, totalling 300 fits


Traceback (most recent call last):
  File "C:\Users\HemnSheikh\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\sklearn\model_selection\_validation.py", line 971, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\HemnSheikh\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\sklearn\metrics\_scorer.py", line 279, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\HemnSheikh\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\sklearn\metrics\_scorer.py", line 371, in _score
    y_pred = method_caller(
             ^^^^^^^^^^^

Best parameters: {'metric': 'euclidean', 'n_neighbors': 13, 'weights': 'uniform'}
Best cross-validation accuracy: 0.83
Validation Accuracy with Best Parameters: 0.8353808353808354
Test Accuracy with Best Parameters: 0.835812747372731
              precision    recall  f1-score   support

       <=50K       0.87      0.92      0.90      5591
        >50K       0.69      0.57      0.62      1736

    accuracy                           0.84      7327
   macro avg       0.78      0.74      0.76      7327
weighted avg       0.83      0.84      0.83      7327



In [9]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
svm_model = SVC(kernel='rbf', C=1.0, random_state=42) 

svm_model.fit(train_pca, train_df['income'])
val_predictions = svm_model.predict(val_pca)
print("Validation Accuracy:", accuracy_score(val_df['income'], val_predictions))
test_predictions = svm_model.predict(test_pca)
print("Test Accuracy:", accuracy_score(test_df['income'], test_predictions))
print(classification_report(test_df['income'], test_predictions))


Validation Accuracy: 0.8549003549003549
Test Accuracy: 0.8538283062645011
              precision    recall  f1-score   support

       <=50K       0.87      0.94      0.91      5591
        >50K       0.76      0.56      0.65      1736

    accuracy                           0.85      7327
   macro avg       0.82      0.75      0.78      7327
weighted avg       0.85      0.85      0.85      7327



In [10]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'C': [0.1, 1, 10],
    'gamma': ['scale', 'auto'],  
    'kernel': ['rbf']  
}

grid_search = GridSearchCV(SVC(random_state=42), param_grid, cv=3, scoring='accuracy', verbose=1, n_jobs=-1)
grid_search.fit(train_pca, train_df['income'])

print("Best parameters:", grid_search.best_params_)
print("Best cross-validation accuracy:", grid_search.best_score_)

best_svm = grid_search.best_estimator_
test_predictions = best_svm.predict(test_pca)
print("Test Accuracy with Best Parameters:", accuracy_score(test_df['income'], test_predictions))
print(classification_report(test_df['income'], test_predictions))


Fitting 3 folds for each of 6 candidates, totalling 18 fits
Best parameters: {'C': 1, 'gamma': 'auto', 'kernel': 'rbf'}
Best cross-validation accuracy: 0.8462078554866131
Test Accuracy with Best Parameters: 0.8543742322915245
              precision    recall  f1-score   support

       <=50K       0.88      0.94      0.91      5591
        >50K       0.76      0.57      0.65      1736

    accuracy                           0.85      7327
   macro avg       0.82      0.76      0.78      7327
weighted avg       0.85      0.85      0.85      7327



In [11]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, accuracy_score

nb_model = GaussianNB()

nb_model.fit(train_pca, train_df['income'])

val_predictions = nb_model.predict(val_pca)
print("Validation Accuracy:", accuracy_score(val_df['income'], val_predictions))

test_predictions = nb_model.predict(test_pca)
print("Test Accuracy:", accuracy_score(test_df['income'], test_predictions))
print(classification_report(test_df['income'], test_predictions))


Validation Accuracy: 0.6143871143871144
Test Accuracy: 0.6144397434147673
              precision    recall  f1-score   support

       <=50K       0.93      0.54      0.68      5591
        >50K       0.37      0.87      0.52      1736

    accuracy                           0.61      7327
   macro avg       0.65      0.70      0.60      7327
weighted avg       0.80      0.61      0.64      7327



In [15]:
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
import numpy as np

nb_model = GaussianNB()
param_grid = {
    'var_smoothing': np.logspace(0,-9, num=100)  
}

grid_search = GridSearchCV(nb_model, param_grid, cv=3, scoring='accuracy', verbose=1, n_jobs=-1)
grid_search.fit(train_pca, train_df['income'])

print("Best parameters:", grid_search.best_params_)
print("Best cross-validation accuracy:", grid_search.best_score_)

best_nb = grid_search.best_estimator_
test_predictions = best_nb.predict(test_pca)
print("Test Accuracy with Best Parameters:", accuracy_score(test_df['income'], test_predictions))
print(classification_report(test_df['income'], test_predictions))


Fitting 3 folds for each of 100 candidates, totalling 300 fits
Best parameters: {'var_smoothing': 1.0}
Best cross-validation accuracy: 0.7900213390473985
Test Accuracy with Best Parameters: 0.797051999454074
              precision    recall  f1-score   support

       <=50K       0.93      0.80      0.86      5591
        >50K       0.55      0.79      0.65      1736

    accuracy                           0.80      7327
   macro avg       0.74      0.80      0.75      7327
weighted avg       0.84      0.80      0.81      7327



In [16]:
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB

param_grid = {
    'var_smoothing': np.logspace(-2, 1, num=50)  
}

refined_grid_search = GridSearchCV(GaussianNB(), param_grid, cv=3, scoring='accuracy', verbose=1, n_jobs=-1)
refined_grid_search.fit(train_pca, train_df['income'])

print("Best parameters:", refined_grid_search.best_params_)
print("Best cross-validation accuracy:", refined_grid_search.best_score_)

best_refined_nb = refined_grid_search.best_estimator_
test_predictions = best_refined_nb.predict(test_pca)
print("Test Accuracy with Best Parameters:", accuracy_score(test_df['income'], test_predictions))
print(classification_report(test_df['income'], test_predictions))


Fitting 3 folds for each of 50 candidates, totalling 150 fits
Best parameters: {'var_smoothing': 1.5998587196060574}
Best cross-validation accuracy: 0.8227207980432514
Test Accuracy with Best Parameters: 0.8389518220281151
              precision    recall  f1-score   support

       <=50K       0.85      0.95      0.90      5591
        >50K       0.76      0.47      0.58      1736

    accuracy                           0.84      7327
   macro avg       0.81      0.71      0.74      7327
weighted avg       0.83      0.84      0.82      7327



In [17]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score

mlp = MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='adam', max_iter=300, random_state=42)

mlp.fit(train_pca, train_df['income'])

val_predictions = mlp.predict(val_pca)
print("Validation Accuracy:", accuracy_score(val_df['income'], val_predictions))

test_predictions = mlp.predict(test_pca)
print("Test Accuracy:", accuracy_score(test_df['income'], test_predictions))
print(classification_report(test_df['income'], test_predictions))


Validation Accuracy: 0.843024843024843
Test Accuracy: 0.8400436740821619
              precision    recall  f1-score   support

       <=50K       0.88      0.92      0.90      5591
        >50K       0.69      0.59      0.64      1736

    accuracy                           0.84      7327
   macro avg       0.78      0.75      0.77      7327
weighted avg       0.83      0.84      0.84      7327



In [19]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

param_grid = {
    'hidden_layer_sizes': [(100,), (150,)],
    'activation': ['relu'],
    'alpha': [0.0001, 0.001],
    'max_iter': [300]
}

mlp = MLPClassifier(random_state=42)
grid_search = GridSearchCV(mlp, param_grid, cv=2, scoring='accuracy', verbose=1, n_jobs=1) 
grid_search.fit(train_pca, train_df['income'])


print("Best parameters:", grid_search.best_params_)
print("Best cross-validation accuracy:", grid_search.best_score_)

best_mlp = grid_search.best_estimator_
test_predictions = best_mlp.predict(test_pca)
print("Test Accuracy with Best Parameters:", accuracy_score(test_df['income'], test_predictions))
print(classification_report(test_df['income'], test_predictions))


Fitting 2 folds for each of 4 candidates, totalling 8 fits




Best parameters: {'activation': 'relu', 'alpha': 0.001, 'hidden_layer_sizes': (100,), 'max_iter': 300}
Best cross-validation accuracy: 0.8271081160815001
Test Accuracy with Best Parameters: 0.845639415859151
              precision    recall  f1-score   support

       <=50K       0.88      0.92      0.90      5591
        >50K       0.71      0.60      0.65      1736

    accuracy                           0.85      7327
   macro avg       0.79      0.76      0.77      7327
weighted avg       0.84      0.85      0.84      7327



In [20]:
mlp = MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='adam',
                    alpha=0.001, max_iter=500, random_state=42, learning_rate_init=0.001)

mlp.fit(train_pca, train_df['income'])

test_predictions = mlp.predict(test_pca)
print("Test Accuracy:", accuracy_score(test_df['income'], test_predictions))
print(classification_report(test_df['income'], test_predictions))


Test Accuracy: 0.8370410809335335
              precision    recall  f1-score   support

       <=50K       0.89      0.90      0.89      5591
        >50K       0.67      0.62      0.64      1736

    accuracy                           0.84      7327
   macro avg       0.78      0.76      0.77      7327
weighted avg       0.83      0.84      0.84      7327

