# Census Bureal Classification

In [1]:
import pandas as pd
import os
import pickle
import joblib
from src.utils.project_paths import DATA_RAW, DATA_PROCESSED, MODELS_PATH
from src.etl.etl import Etl
etl = Etl()
from src.model.model_training import ModelTraining
model_training = ModelTraining()
from src.model.model_slice_evaluation import ModelSliceEvaluation
model_slice_evalutation = ModelSliceEvaluation()
from src.utils import config
from sklearn.model_selection import train_test_split
import great_expectations as ge

## Importing Data

In [3]:
census = pd.read_csv(os.path.join(DATA_RAW, 'census.csv'))
census

Unnamed: 0,age,workclass,fnlgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


## Cleaning Data

In [31]:
x_df, y_df = etl.get_clean_data(os.path.join(DATA_RAW, 'census.csv'))
census_cleaned = pd.concat([x_df, y_df], axis=1)

In [32]:
census_cleaned.columns.tolist()

['age',
 'workclass',
 'fnlgt',
 'education',
 'education_num',
 'marital_status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'capital_gain',
 'capital_loss',
 'hours_per_week',
 'native_country',
 'salary']

In [4]:

x_df, y_df = etl.get_clean_data(os.path.join(DATA_RAW, 'census.csv'))
census_cleaned = pd.concat([x_df, y_df], axis=1)
census_cleaned.to_csv(os.path.join(DATA_PROCESSED, 'census_cleaned.csv'), index=False)
census_cleaned

Unnamed: 0,age,workclass,fnlgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,salary
0,39,state-gov,77516,bachelors,13,never-married,adm-clerical,not-in-family,white,male,2174,0,40,united-states,0
1,50,self-emp-not-inc,83311,bachelors,13,married-civ-spouse,exec-managerial,husband,white,male,0,0,13,united-states,0
2,38,private,215646,hs-grad,9,divorced,handlers-cleaners,not-in-family,white,male,0,0,40,united-states,0
3,53,private,234721,11th,7,married-civ-spouse,handlers-cleaners,husband,black,male,0,0,40,united-states,0
4,28,private,338409,bachelors,13,married-civ-spouse,prof-specialty,wife,black,female,0,0,40,cuba,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,private,257302,assoc-acdm,12,married-civ-spouse,tech-support,wife,white,female,0,0,38,united-states,0
32557,40,private,154374,hs-grad,9,married-civ-spouse,machine-op-inspct,husband,white,male,0,0,40,united-states,1
32558,58,private,151910,hs-grad,9,widowed,adm-clerical,unmarried,white,female,0,0,40,united-states,0
32559,22,private,201490,hs-grad,9,never-married,adm-clerical,own-child,white,male,0,0,20,united-states,0


In [5]:
census_cleaned.salary.value_counts()

0    24698
1     7839
Name: salary, dtype: int64

## Training the Model on Data

In [15]:
model_training.execute()

## Slice Evaluation on the Model

In [5]:

model_slice_evalutation.evaluate_slices()

In [11]:


X_df, y_df = etl.get_clean_data(os.path.join(DATA_RAW, 'census.csv'))
df = pd.concat([X_df, y_df], axis=1).sample(100, random_state=config.RANDOM_STATE)
y_df = df['salary']
X_df = df.drop(columns=['salary'])

# X_train, X_test, y_train, y_test = train_test_split(
#     X_df, y_df, test_size=0.3, random_state=config.RANDOM_STATE, stratify=y_df)

In [2]:


X_df, y_df = etl.get_clean_data(os.path.join(DATA_RAW, 'census.csv'))
X_df['salary'] = y_df
X_df['salary'] = X_df['salary'].map({1: '>50k', 0: '<=50k'})

df = ge.from_pandas(X_df)

In [3]:
df.columns.tolist()

['age',
 'workclass',
 'fnlgt',
 'education',
 'education_num',
 'marital_status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'capital_gain',
 'capital_loss',
 'hours_per_week',
 'native_country',
 'salary']

In [4]:
df.salary.value_counts()

<=50k    24698
>50k      7839
Name: salary, dtype: int64

In [5]:
df.marital_status.value_counts().index.tolist()

['married-civ-spouse',
 'never-married',
 'divorced',
 'separated',
 'widowed',
 'married-spouse-absent',
 'married-af-spouse']

In [6]:
categs = [
        'never-married',
        'married-civ-spouse',
        'divorced',
        'married-spouse-absent',
        'separated',
        'married-af-spouse',
        'widowed'
    ]

df.expect_column_distinct_values_to_equal_set('marital_status', categs)['success'], "marital_status column includes unknown category"

(True, 'marital_status column includes unknown category')

In [22]:
df.expect_column_distinct_values_to_equal_set('salary', ['<=50k', '>50k'])['success'], "salary column includes more than two classes"

(True, 'salary column includes more than two classes')

In [13]:
X_df

Unnamed: 0,age,workclass,fnlgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country
1837,70,private,30713,hs-grad,9,married-civ-spouse,farming-fishing,husband,white,male,0,0,30,united-states
17726,28,private,158737,12th,8,married-civ-spouse,machine-op-inspct,other-relative,other,male,0,0,40,ecuador
13129,49,private,200949,10th,6,never-married,other-service,unmarried,white,female,0,0,38,peru
6561,26,private,167761,bachelors,13,married-civ-spouse,prof-specialty,husband,white,male,0,0,50,united-states
5734,49,private,200471,1st-4th,2,married-civ-spouse,craft-repair,husband,white,male,0,0,40,portugal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20131,26,private,117217,bachelors,13,divorced,other-service,not-in-family,white,female,0,0,45,united-states
7703,20,private,184756,some-college,10,never-married,adm-clerical,own-child,white,female,0,0,12,united-states
3708,36,private,160120,hs-grad,9,married-civ-spouse,adm-clerical,husband,asian-pac-islander,male,0,0,40,vietnam
2844,37,self-emp-inc,186359,hs-grad,9,married-civ-spouse,sales,husband,white,male,7688,0,60,united-states


In [14]:
y_df

1837     0
17726    0
13129    0
6561     1
5734     0
        ..
20131    0
7703     0
3708     0
2844     1
854      1
Name: salary, Length: 100, dtype: int64

In [16]:
X, y = etl.get_clean_data(os.path.join(DATA_RAW, 'census.csv'))

cat_features = [
        "workclass",
        "education",
        "marital_status",
        "occupation",
        "relationship",
        "race",
        "sex",
        "native_country",
    ]

data = pd.concat([X, y], axis=1)

train, test = train_test_split(data, test_size=0.3, random_state=12)

X_train, y_train, encoder, lb = etl.process_data(
        train, categorical_features=cat_features, label="salary", training=True
    )

X_test, y_test, encoder, lb = etl.process_data(
        test, categorical_features=cat_features, label="salary", training=False, encoder=encoder, lb=lb
    )

model = joblib.load(os.path.join(MODELS_PATH, 'gbclassifier.pkl'))

y_train_pred = model_training.inference(model, X_train)
y_test_pred = model_training.inference(model, X_test)

pre_train, rec_train, f1_train = model_training.compute_model_metrics(
    y_train_pred, y_train)
pre_test, rec_test, f1_test = model_training.compute_model_metrics(
    y_test_pred, y_test)

In [20]:
X_train.shape

(22775, 108)

In [17]:
pre_train, rec_train, f1_train

(0.45530981539023946, 0.8172572178477691, 0.5329283281547041)

In [18]:
pre_test, rec_test, f1_test

(0.46621621621621623, 0.8282070517629407, 0.544477801646382)

In [27]:
X, y = etl.get_clean_data(os.path.join(DATA_RAW, 'census.csv'))

cat_features = [
        "workclass",
        "education",
        "marital_status",
        "occupation",
        "relationship",
        "race",
        "sex",
        "native_country",
    ]

data = pd.concat([X, y], axis=1)

train, test = train_test_split(data, test_size=0.3, random_state=12)

X_train, y_train, encoder, lb = etl.process_data(
        train, categorical_features=cat_features, label="salary", training=True
    )

X_test, y_test, encoder, lb = etl.process_data(
        test, categorical_features=cat_features, label="salary", training=False, encoder=encoder, lb=lb
    )

In [34]:
data.columns

Index(['age', 'workclass', 'fnlgt', 'education', 'education_num',
       'marital_status', 'occupation', 'relationship', 'race', 'sex',
       'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
       'salary'],
      dtype='object')

In [28]:
X_test.shape

(9762, 108)

In [14]:
X_train.shape

(70, 66)

In [36]:
from fastapi import FastAPI, Body
import yaml
from src.utils.project_paths import MODELS_PATH, EXAMPLES_API_PATH
from src.api.model import Person, FeatureInfo
with open(EXAMPLES_API_PATH) as fp:
    examples = yaml.safe_load(fp)

Person = Body(..., examples=examples['post_examples'])