In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report


In [2]:
columns = ["age", "workclass", "fnlwgt", "education", "education_num", "marital_status",
           "occupation", "relationship", "race", "sex", "capital_gain", "capital_loss",
           "hours_per_week", "native_country", "income"]
data = pd.read_csv("adult major.csv", names=columns)

In [3]:
data.head()


Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
data.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64

In [5]:
data = data.dropna()

In [6]:
X = data.drop("income", axis=1)
y = data["income"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
models = {
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Logistic Regression": LogisticRegression(),
    "KNN": KNeighborsClassifier(),
    "SVC": SVC(kernel="linear")
}

# One-hot encoding categorical columns
X_train_encoded = pd.get_dummies(X_train)
X_test_encoded = pd.get_dummies(X_test)

X_train_encoded, X_test_encoded = X_train_encoded.align(X_test_encoded, join='outer', axis=1, fill_value=0)

predictions = {}
for model_name, model in models.items():
    model.fit(X_train_encoded, y_train)
    y_pred = model.predict(X_test_encoded)
    predictions[model_name] = y_pred

In [8]:
from sklearn.metrics import accuracy_score
best_model = max(predictions, key=lambda x: accuracy_score(y_test, predictions[x]))

In [9]:
for model_name, y_pred in predictions.items():
    print(f"Model: {model_name}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

Model: Decision Tree
Confusion Matrix:
[[4328  614]
 [ 557 1014]]
Classification Report:
              precision    recall  f1-score   support

       <=50K       0.89      0.88      0.88      4942
        >50K       0.62      0.65      0.63      1571

    accuracy                           0.82      6513
   macro avg       0.75      0.76      0.76      6513
weighted avg       0.82      0.82      0.82      6513

Model: Random Forest
Confusion Matrix:
[[4590  352]
 [ 588  983]]
Classification Report:
              precision    recall  f1-score   support

       <=50K       0.89      0.93      0.91      4942
        >50K       0.74      0.63      0.68      1571

    accuracy                           0.86      6513
   macro avg       0.81      0.78      0.79      6513
weighted avg       0.85      0.86      0.85      6513

Model: Logistic Regression
Confusion Matrix:
[[4784  158]
 [1147  424]]
Classification Report:
              precision    recall  f1-score   support

       <=50K      

In [10]:
for model_name, y_pred in predictions.items():
    report = classification_report(y_test, y_pred, target_names=['<=50K', '>50K'], output_dict=True)
    precision = report['>50K']['precision']
    recall = report['>50K']['recall']
    f1_score = report['>50K']['f1-score']
    accuracy = report['accuracy']
    print(f"Model: {model_name}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1-score: {f1_score}")
    print(f"Accuracy: {accuracy}\n")

Model: Decision Tree
Precision: 0.6228501228501229
Recall: 0.645448758752387
F1-score: 0.6339481087839951
Accuracy: 0.820205742361431

Model: Random Forest
Precision: 0.7363295880149813
Recall: 0.6257161043921069
F1-score: 0.6765313145216793
Accuracy: 0.8556732688469215

Model: Logistic Regression
Precision: 0.7285223367697594
Recall: 0.2698917886696372
F1-score: 0.393869019972132
Accuracy: 0.7996315062183326

Model: KNN
Precision: 0.5719063545150501
Recall: 0.32654360280076383
F1-score: 0.4157212317666126
Accuracy: 0.7785966528481498

Model: SVC
Precision: 0.6781292984869326
Recall: 0.31381285805219605
F1-score: 0.4290687554395126
Accuracy: 0.7985567326884692



In [11]:
print(f"Best Model: {best_model}")


Best Model: Random Forest
