In [1]:
from sklearn.preprocessing import MinMaxScaler,OneHotEncoder,LabelEncoder,FunctionTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the Adult census CSV from the local Data directory and preview one row.
df = pd.read_csv(filepath_or_buffer='./Data/adult.csv')
df.head(n=1)

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K


1. age
2. workclass
3. education
4. marital-status
5. occupation
6. relationship
7. race
8. gender
9. hours per week 

In [3]:
# Single hand-written sample used to smoke-test the fitted pipeline.
# Field order follows the integer column labels assigned in the cleaning cell:
# 0 age, 1 workclass, 2 educational-num, 3 marital-status, 4 occupation,
# 5 relationship, 6 race, 7 gender, 8 hours-per-week.
# The record is passed as one row (a list inside a list) rather than as a
# transposed column, so the numeric fields keep an integer dtype instead of
# every column being coerced to `object`.
test = pd.DataFrame(
    [[25, 'Private', 7, 'Never-married', 'Machine-op-inspct', 'Own-child', 'Black', 'Male', 40]]
)
test

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,25,Private,7,Never-married,Machine-op-inspct,Own-child,Black,Male,40


**1. Cleaning Data**

In [4]:
# Cleaning steps, applied in this order:
#   1. discard exact duplicate rows,
#   2. drop the columns that are not used as model inputs,
#   3. turn the '?' placeholder into the explicit category 'other',
#   4. relabel the nine feature columns as integers 0-8 ('income' keeps its name).
df = (
    df.drop_duplicates()
    .drop(columns=['fnlwgt', 'capital-gain', 'capital-loss', 'native-country', 'education'])
    .replace('?', 'other')
    .rename(
        columns={
            label: position
            for position, label in enumerate(
                ['age', 'workclass', 'educational-num', 'marital-status', 'occupation',
                 'relationship', 'race', 'gender', 'hours-per-week']
            )
        }
    )
)

**2. EDA**

1. Thorough analysis of fnlwgt (future work)
2. Merge categories within the categorical features (future work)

In [None]:
# Preview the first three rows of the cleaned frame.
df.head(n=3)

In [None]:
# for i in df.columns[[1,2,3,4,5,6,7,8]]:
#     ax = df[i].value_counts().plot(kind='barh')  
#     ax.bar_label(ax.containers[0])
#     print(i)
#     plt.show()

In [None]:
# for i in df.columns[[1,2,3,4,5,6,7,8]]:
#     ax = df.groupby(i)['income'].value_counts().plot(kind='barh')
#     ax.bar_label(ax.containers[0])
#     print(i)
#     plt.show() 

In [None]:
# df.describe()

In [None]:
# df.corr()

**3. Preprocessing**

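1. ? replace with 'other'
2. age robust scaling [centered on the median, scaled by the IQR] (planned; the pipeline below currently uses min-max scaling)
3. workclass one-hot encoding
4. marital status one-hot encoding
5. occupation one-hot encoding
6. relationship one-hot encoding
7. race one-hot encoding
8. gender ordinal encoding (1. male, 2. female) (planned; the pipeline below currently one-hot encodes it)
9. hours per week standard scaling (planned; the pipeline below currently uses min-max scaling)
10. drop capital gain/loss & native country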

In [47]:
# x_train,x_test,y_train,y_test = train_test_split(df.iloc[:,:9],df['income'],train_size=0.8)

In [5]:
# Convert the income labels to integer class codes. `le` stays in scope as the
# fitted encoder, so the label-to-code mapping remains available afterwards.
le = LabelEncoder().fit(df['income'])
df['income'] = le.transform(df['income'])

In [6]:
# Step 1: min-max scale the columns at positions 0 and 8 (age and
# hours-per-week after the renaming done in the cleaning cell); every other
# column is passed through untouched.
trf1 = ColumnTransformer(
    transformers=[('normalization', MinMaxScaler(), [0, 8])],
    remainder='passthrough',
)

# Step 2: one-hot encode six columns as a dense array and pass the rest through.
# NOTE(review): these positions index trf1's OUTPUT, not the original frame.
# Per the ColumnTransformer documentation, transformed columns come first and
# the remainder follows, which would put the scaled age/hours at 0-1,
# educational-num at 3 and the categorical fields at 2 and 4-8 -- confirm
# against the installed scikit-learn version.
trf2 = ColumnTransformer(
    transformers=[('encoding', OneHotEncoder(sparse_output=False), [2, 4, 5, 6, 7, 8])],
    remainder='passthrough',
)

In [7]:
def clean(obj):
    """Keep only the hand-picked feature columns of an encoded feature matrix.

    Parameters
    ----------
    obj : array-like
        Two-dimensional data accepted by ``pd.DataFrame``.

    Returns
    -------
    pandas.DataFrame
        The selected columns, in the order listed below, labelled with the
        column labels they had in the frame built from ``obj``.

    Raises
    ------
    KeyError
        If any of the selected labels is missing from the frame.
    """
    # Column labels to keep. The order is significant: it fixes the feature
    # order seen by whatever consumes the result.
    selected_columns = [11, 46, 44, 45, 43, 19, 25, 4, 21, 16, 33, 39, 9, 35, 23, 34]
    frame = pd.DataFrame(obj)
    return frame[selected_columns]
# Wrap the column-selection function so it can be used as a pipeline step.
ft = FunctionTransformer(func=clean)

In [8]:
# End-to-end model: scale -> one-hot encode -> select feature columns ->
# multinomial naive Bayes classifier.
pipe = Pipeline(
    steps=[
        ('Scaling Data', trf1),
        ('Encoding Data', trf2),
        ('data', ft),
        ('classifier', MultinomialNB()),
    ]
)

In [9]:
# Fit on the first nine columns (the features) against the encoded income
# column; copies are passed so the pipeline never works on `df` itself.
pipe.fit(X=df.iloc[:, :9].copy(), y=df['income'].copy())

**11,  46, 44, 45, 43, 19, 25,  4,21, 16, 33, 39,  9, 35, 23,  34**

In [99]:
# Predict the income class code for the single hand-written sample row.
pipe.predict(X=test)

array([0])

In [63]:
import pickle as pkl

# Persist the fitted pipeline. The context manager closes the file handle
# even if serialization fails part-way; a bare open() left that to garbage
# collection.
# NOTE(review): the pipeline references the notebook-level `clean` function
# through FunctionTransformer. pickle stores functions by qualified name, so
# the process that loads pipe.pkl presumably needs a compatible `clean`
# importable under the same module path -- confirm in the consumer.
with open('pipe.pkl', 'wb') as model_file:
    pkl.dump(pipe, model_file)

In [None]:
# from sklearn.metrics import accuracy_score,precision_score,confusion_matrix
# ac = accuracy_score(y_test,y_pred)
# print(ac)
# pc = precision_score(y_test,y_pred)
# print(pc)
# cm = confusion_matrix(y_test,y_pred)
# print(cm)