In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from category_encoders import TargetEncoder
os.chdir("D:/EMPPRED/")

In [2]:
df = pd.read_csv("artifacts/raw.csv")
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


In [3]:
cat_columns = df.select_dtypes(include="object")

In [4]:
cat_columns.head()

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba


In [5]:
for column in cat_columns.columns:
    df[column] = df[column].str.replace(" ?","other").replace(" ","")

In [6]:
for column in cat_columns.columns:
    df[column] = df[column].str.replace("-","_")

In [7]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,country,salary
0,39,State_gov,77516,Bachelors,13,Never_married,Adm_clerical,Not_in_family,White,Male,2174,0,40,United_States,0
1,50,Self_emp_not_inc,83311,Bachelors,13,Married_civ_spouse,Exec_managerial,Husband,White,Male,0,0,13,United_States,0
2,38,Private,215646,HS_grad,9,Divorced,Handlers_cleaners,Not_in_family,White,Male,0,0,40,United_States,0
3,53,Private,234721,11th,7,Married_civ_spouse,Handlers_cleaners,Husband,Black,Male,0,0,40,United_States,0
4,28,Private,338409,Bachelors,13,Married_civ_spouse,Prof_specialty,Wife,Black,Female,0,0,40,Cuba,0


In [8]:
df.rename(columns=lambda x: x.replace('-', '_'), inplace=True)

In [9]:
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education_num',
       'marital_status', 'occupation', 'relationship', 'race', 'sex',
       'capital_gain', 'capital_loss', 'hours_per_week', 'country', 'salary'],
      dtype='object')

In [10]:
X = df.drop('salary',axis=1)
y = df['salary']

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

In [12]:
num_feature = X.select_dtypes(exclude='object')
cat_feature = X.select_dtypes(include="object")


n_features = list(num_feature.columns)
cat_features = list(cat_feature.columns)

In [13]:
cat_features

['workclass',
 'education',
 'marital_status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'country']

In [14]:
cat_feature.head()

Unnamed: 0,workclass,education,marital_status,occupation,relationship,race,sex,country
0,State_gov,Bachelors,Never_married,Adm_clerical,Not_in_family,White,Male,United_States
1,Self_emp_not_inc,Bachelors,Married_civ_spouse,Exec_managerial,Husband,White,Male,United_States
2,Private,HS_grad,Divorced,Handlers_cleaners,Not_in_family,White,Male,United_States
3,Private,11th,Married_civ_spouse,Handlers_cleaners,Husband,Black,Male,United_States
4,Private,Bachelors,Married_civ_spouse,Prof_specialty,Wife,Black,Female,Cuba


In [15]:
num_feature.head()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
0,39,77516,13,2174,0,40
1,50,83311,13,0,0,13
2,38,215646,9,0,0,40
3,53,234721,7,0,0,40
4,28,338409,13,0,0,40


In [16]:
from sklearn.preprocessing import OneHotEncoder

In [17]:
num_pipeline = Pipeline(
    steps=[
        ("SimpleImputer",SimpleImputer(strategy='median')),
        ("scaler",StandardScaler())
    ]
)
cat_pipeline = Pipeline(
    steps=[
        ("imputer",SimpleImputer(strategy="most_frequent")),
        ("encoder",OneHotEncoder(sparse_output=False)),
        

    ]
)

In [19]:
from sklearn.tree import DecisionTreeClassifier

In [20]:
logistic_step_name ="decision_tree_classifier"
final_pipeline = Pipeline(
    steps=[
        (
            "feature_processing",
            ColumnTransformer(
                transformers=[
                    ("num_pipeline",num_pipeline,n_features),
                    ("cat_pipeline",cat_pipeline,cat_features),
                ],
                remainder='passthrough',
            ),
        ),
        (
            logistic_step_name,
            DecisionTreeClassifier(
                criterion="gini",
                splitter="best",
                max_depth=5,
            ),
        ),


    ]
)
final_pipeline.set_output(transform="pandas")

In [21]:
preprocessor = ColumnTransformer([
    ('num_pipeline',num_pipeline),
    ('cat_pipeline',cat_pipeline)
])

In [58]:
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier

In [59]:
pipe = make_pipeline(preprocessor,DecisionTreeClassifier(criterion='gini',splitter='best',max_depth=5))

In [22]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=42)

In [23]:
final_pipeline.fit(X_train,y_train)

In [18]:
X_train = preprocessor.fit_transform(X_train)

In [19]:
X_test = preprocessor.transform(X_test)

In [25]:
## Model Training

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

In [26]:
import numpy as np
def evaluate_model(true, predicted):
    acc = accuracy_score(true, predicted)
    cr = classification_report(true, predicted)
    cf = confusion_matrix(true, predicted)
    return acc, cr, cf

In [28]:
## Train multiple models
## Model Ecaluation
models={
    'LogisticRegression':LogisticRegression(),
    'DecisionTreeClassifier':DecisionTreeClassifier(),
 
    
}
trained_model_list=[]
model_list=[]
r2_list=[]

for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train)

    #Make Predictions
    y_pred=model.predict(X_test)

    acc, cr, cf=evaluate_model(y_test,y_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model Training Performance')
    print("acc:",acc*100)
    print("cr:",cr)
    print("cf",cf)

 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression
Model Training Performance
acc: 85.53587880028662
cr:               precision    recall  f1-score   support

           0       0.88      0.93      0.91      7455
           1       0.74      0.61      0.66      2314

    accuracy                           0.86      9769
   macro avg       0.81      0.77      0.79      9769
weighted avg       0.85      0.86      0.85      9769

cf [[6956  499]
 [ 914 1400]]
DecisionTreeClassifier
Model Training Performance
acc: 81.06254478452247
cr:               precision    recall  f1-score   support

           0       0.88      0.87      0.88      7455
           1       0.60      0.62      0.61      2314

    accuracy                           0.81      9769
   macro avg       0.74      0.75      0.74      9769
weighted avg       0.81      0.81      0.81      9769

cf [[6479  976]
 [ 874 1440]]


In [19]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

In [20]:
parameters = {
    "criterion": ['gini','entropy','log_loss'],
    "splitter":['random','best'],
    "max_depth":[1,2,3,4,5]
}

clf = GridSearchCV(DecisionTreeClassifier(),param_grid=parameters,cv=5,verbose=1)

In [21]:
clf.fit(X_train,y_train)

NameError: name 'X_train' is not defined

In [41]:
clf.best_params_['criterion']

'gini'

In [42]:
tree = DecisionTreeClassifier(
    criterion=clf.best_params_['criterion'],
    splitter= clf.best_params_['splitter'],
    max_depth=clf.best_params_['max_depth']

)
tree.fit(X_train,y_train)

In [25]:
y_pred = pipe.predict(X_test)

In [26]:
y_pred

array([0, 0, 1, ..., 0, 0, 1], dtype=int64)

In [27]:
30,"Private",190040,"Bachelors",13,"Never-married","Machine-op-inspct","Not-in-family","White","Female",0,0,40,"United-States"

(30,
 'Private',
 190040,
 'Bachelors',
 13,
 'Never-married',
 'Machine-op-inspct',
 'Not-in-family',
 'White',
 'Female',
 0,
 0,
 40,
 'United-States')

In [24]:
import pickle
pickle.dump(final_pipeline, open('pipe.pkl', 'wb'))

In [25]:
model = pickle.load(open("pipe.pkl","rb"))

In [27]:
X_test.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education_num',
       'marital_status', 'occupation', 'relationship', 'race', 'sex',
       'capital_gain', 'capital_loss', 'hours_per_week', 'country'],
      dtype='object')

In [34]:
test_data = [30,
 'Private',
 190040,
 'Bachelors',
 13,
 'Never_married',
 'Machine_op_inspct',
 'Not_in_family',
 'White',
 'Female',
 0,
 0,
 40,
 'United_States']

test_data = pd.DataFrame([test_data],columns=['age', 'workclass', 'fnlwgt', 'education', 'education_num',
       'marital_status', 'occupation', 'relationship', 'race', 'sex',
       'capital_gain', 'capital_loss', 'hours_per_week', 'country'])

In [35]:
prediction = model.predict(test_data)

In [36]:
prediction

array([0], dtype=int64)