In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import LeaveOneOut
import random
from sklearn.linear_model import Lasso, LassoCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [2]:
header=["age","workclass","fnlwgt","education","education_num","martial_status","occupation","relationship","race","sex","capital_gain","capital_loss","hours_per_week","native_country","50k"]
df=pd.read_csv("adult.csv",names=header)
df=df.replace(" ?",np.nan)
df

Unnamed: 0,age,workclass,fnlwgt,education,education_num,martial_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,50k
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


this data has 15 variables, with 6 continous variables. All the particpant are over 16 years old. The weights on the CPS files are controlled to independent estimates of the civilian noninstitutional population of the US.  These are prepared monthly
by Population Division at the Census Bureau. The goal of this model is to find the relationship between if a person makes above or equals to 50k in a year with their backgrounds(all the variables).
 

In [3]:
df2=pd.get_dummies(df,columns=["workclass","education","martial_status","occupation","relationship","race","sex","native_country","50k"])
columns=["50k_ <=50K","50k_ >50K"]
X=df2.drop(columns=columns)
y=df2["50k_ >50K"]


since below 50k will be represendted as 0 in the "50k_ >50K" column, so i will just drop the column "50k_ <=50K"

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y,random_state=42)

scaler = StandardScaler()
X_train_norm = scaler.fit_transform(X_train)
X_test_norm = scaler.transform(X_test)



Models I will be using are gonna be Random Forest and SVM.

In [5]:
model = RandomForestClassifier()
model.fit(X_train_norm, y_train)
y_pred = model.predict(X_test_norm)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
X_train_norm_df = pd.DataFrame(X_train_norm, columns=X.columns)
X_test_norm_df = pd.DataFrame(X_test_norm, columns=X.columns)
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:\n", report)
print("Confusion Matrix:\n", confusion_matrix)

Accuracy: 0.86
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.93      0.91      6214
           1       0.74      0.62      0.67      1927

    accuracy                           0.86      8141
   macro avg       0.81      0.78      0.79      8141
weighted avg       0.85      0.86      0.85      8141

Confusion Matrix:
 <function confusion_matrix at 0x000002392CCDF310>


In [None]:
from itertools import combinations

# Function to perform best subset selection
def best_subset_selection(X, y, model):
    best_score = float('-inf')
    best_subset = None

    for k in range(1, X.shape[1] + 1):
        for subset in combinations(X.columns, k):
            subset_X = X[list(subset)]
            score = np.mean(cross_val_score(model, subset_X, y, cv=5, scoring='neg_mean_squared_error'))
            
            if score > best_score:
                best_score = score 
                best_subset = subset
                
    return best_subset
# Perform best subset selection
best_subset = best_subset_selection(X_train_norm_df, y_train, model)

# Print the best subset of features
print("Best Subset:", best_subset)


In [None]:
svm_model = SVC(kernel='linear', C=1.0)  # You can adjust the kernel and C parameter
svm_model.fit(X_train_norm, y_train)

# Make predictions on the test set
y_pred = svm_model.predict(X_test_norm)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_report_result = classification_report(y_test, y_pred)
confusion_matrix_result = confusion_matrix(y_test, y_pred)

# Print the results
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:\n", classification_report_result)
print("Confusion Matrix:\n", confusion_matrix_result)


In [None]:
param_grid = {'C': [0.1, 1, 10, 100], 'gamma': [0.001, 0.01, 0.1, 1]}

# Create the SVM model
svm = SVC(kernel='linear')
svm.fit(X_train_norm,y_train)
# Create the grid search with cross-validation
grid_search = GridSearchCV(svm, param_grid, cv=5, scoring='accuracy')

# Fit the grid search to the data
grid_search.fit(X_train_norm, y_train)

# Print the best parameters found
print("Best Parameters:", grid_search.best_params_)


final conclusion:

The model has a decent accuracy of 86%, but it's important to consider precision, recall, and F1-score for each class to understand its performance on individual classes.Class 0 (presumably the majority class) has higher precision, recall, and F1-score compared to Class 1. Class 1 has a lower recall, suggesting that the model is not capturing all instances of this class effectively.

Therefore this model is not good enough although it is already better than the SVM model.