In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


ModuleNotFoundError: No module named 'matplotlib'

In [None]:
df = pd.read_csv("clean_adult.csv")

df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,Private,77053,HS-grad,9,Widowed,Prof-specialty,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,Private,186061,Some-college,10,Widowed,Prof-specialty,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [None]:
X = df.drop(["income", "fnlwgt" ], axis = 1)
y = df["income"]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.2, random_state = 42
)

In [None]:
numeric_features = ["age", "education.num", "capital.gain", "capital.loss", "hours.per.week"]
categorical_features = ["workclass", "marital.status", "occupation", "relationship", "race", "sex", "native.country"]

numeric_transformer = StandardScaler()
Categorical_transformer = OneHotEncoder(handle_unknown = "ignore")

preprocessor = ColumnTransformer(
    transformers = [
        ("num", numeric_transformer, numeric_features),
        ("cat", Categorical_transformer, categorical_features)
    ]
)

In [None]:
# clf is LOgistic regression model
clf = Pipeline(steps =[
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(max_iter = 1000))
])

clf.fit(X_train, y_train)

In [None]:
y_pred = clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))



Accuracy: 0.849437148217636

Classification Report:
               precision    recall  f1-score   support

       <=50K       0.88      0.93      0.90      4853
        >50K       0.73      0.59      0.65      1543

    accuracy                           0.85      6396
   macro avg       0.81      0.76      0.78      6396
weighted avg       0.84      0.85      0.84      6396



In [None]:
# clf1 is DecisionTreeClassifier

clf1 = Pipeline(steps =[
    ("preprocessor", preprocessor),
    ("classifier", DecisionTreeClassifier(max_depth = 10, random_state = 42))
])

clf1.fit(X_train, y_train)

In [None]:
y_pred = clf1.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))



Accuracy: 0.8572545340838024

Classification Report:
               precision    recall  f1-score   support

       <=50K       0.87      0.95      0.91      4853
        >50K       0.79      0.56      0.65      1543

    accuracy                           0.86      6396
   macro avg       0.83      0.75      0.78      6396
weighted avg       0.85      0.86      0.85      6396



In [None]:
# Clf 2 is RandomForestClassifier

clf2 = Pipeline(steps =[
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(n_estimators = 200, random_state = 42))

])

clf2.fit(X_train, y_train)

In [None]:
y_pred = clf2.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))



Accuracy: 0.8492808005003127

Classification Report:
               precision    recall  f1-score   support

       <=50K       0.88      0.92      0.90      4853
        >50K       0.72      0.61      0.66      1543

    accuracy                           0.85      6396
   macro avg       0.80      0.77      0.78      6396
weighted avg       0.84      0.85      0.84      6396



In [None]:
# packageing model pipline
joblib.dump(clf1, "model_pipeline.joblib")

pipeline = joblib.load("model_pipeline.joblib")