In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.model_selection import KFold
from sklearn import preprocessing
import pickle

In [2]:
from ml.data import process_data
from ml.model import train_model, compute_model_metrics

In [13]:
# Read data/census_clean.csv into the DataFrame every later cell works from.
data=pd.read_csv('data/census_clean.csv')

In [19]:
# Remove the "Unnamed: 0" column from the loaded frame.
# errors="ignore" keeps this cell idempotent: running it again after the
# column is already gone is a no-op instead of raising KeyError.
data = data.drop(columns="Unnamed: 0", errors="ignore")

In [20]:
# Display the DataFrame; the rendered output elides the middle rows.
data

Unnamed: 0,age,workclass,fnlgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30157,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
30158,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
30159,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
30160,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [4]:
#X=data.loc[:, data.columns != ' salary']

In [5]:
#y=data[' salary']

In [6]:
#le=preprocessing.LabelEncoder()
#le.fit(y)
#list(le.classes_)
#y=le.transform(y)

In [21]:
# Categorical columns passed to process_data as `categorical_features`.
# Each name is prefixed with the single leading space this notebook uses in
# its column labels (compare the " salary" label and data[" sex"]).
cat_features = [
    f" {column}"
    for column in (
        "workclass",
        "education",
        "marital-status",
        "occupation",
        "relationship",
        "race",
        "sex",
        "native-country",
    )
]

In [22]:
# Run process_data in training mode on the whole frame. It hands back the
# feature matrix, the labels, and the encoder / label binarizer objects that
# later cells pickle and reuse with training=False.
X, y, encoder, lb = process_data(
    data,
    categorical_features=cat_features,
    label=" salary",
    training=True,
)

In [23]:
# Show the label binarizer returned by process_data.
lb

LabelBinarizer()

In [8]:
# Persist the label binarizer and the encoder returned by process_data.
# Each file is opened in a `with` block so the handle is flushed and closed
# deterministically; a bare open(...) passed straight to pickle.dump is never
# closed explicitly.
filename = 'ml/LabelBinarizer.sav'
with open(filename, 'wb') as lb_file:
    pickle.dump(lb, lb_file)

filename = 'ml/OneHotEncoder.sav'
with open(filename, 'wb') as encoder_file:
    pickle.dump(encoder, encoder_file)

In [24]:
# K-fold splitter with 5 folds; the cross-validation loop below iterates over kf.split(X).
kf = KFold(n_splits=5)

In [25]:
# Ask the splitter how many splits it will produce for X.
kf.get_n_splits(X)

5

In [26]:
# NOTE(review): this constructs a separate KFold only to display its repr;
# the result is not assigned, so `kf` is unaffected by this cell.
KFold(n_splits=5, random_state=None, shuffle=False)

KFold(n_splits=5, random_state=None, shuffle=False)

In [27]:
# K-fold cross-validation: train one model per fold, print its metrics on the
# held-out fold, and keep the model with the highest precision.
#
# The running best starts at -inf / None so the first fold's model is always
# retained. With the previous 0 / "" start, a run where no fold had
# precision > 0 left best_model as an empty string, which later cells would
# pickle and pass to slice_data as if it were a model.
best_precision = float("-inf")
best_model = None
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    model = train_model(X_train, y_train)

    preds = model.predict(X_test)

    precision, recall, fbeta = compute_model_metrics(y_test, preds)

    print("Precision: ",precision,", Recall: ", recall, ", fbeta: ", fbeta)

    # Strict '>' keeps the earliest fold on ties.
    if precision > best_precision:
        best_precision = precision
        best_model = model
    
    

Precision:  0.731764705882353 , Recall:  0.6187002652519894 , fbeta:  0.670499461013295
Precision:  0.7142857142857143 , Recall:  0.6096730245231607 , fbeta:  0.6578463800073502
Precision:  0.7173091458805745 , Recall:  0.6407832545577312 , fbeta:  0.6768901569186876
Precision:  0.7594226142742582 , Recall:  0.6234364713627386 , fbeta:  0.6847433116413593
Precision:  0.7464150943396226 , Recall:  0.6455613577023499 , fbeta:  0.6923346167308366


In [16]:
# Persist the model kept by the cross-validation loop. The `with` block
# closes the file handle deterministically instead of leaving it open.
filename = 'ml/finalized_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(best_model, model_file)

In [27]:
# Number of columns in the DataFrame.
data.shape[1]

16

In [28]:
# Length of the first row of the processed feature matrix.
len(X[0])

105

In [29]:
# Check that the processed matrix has more columns than the DataFrame.
data.shape[1] < len(X[0])

True

In [80]:
def slice_data(df, cat_features, encoder, lb, model,
               label=" salary", output_path="ml/slice_output.txt"):
    """Score ``model`` on every single-value slice of each categorical feature.

    For each column in ``cat_features`` and each distinct value in that
    column, the matching rows are passed through ``process_data`` with
    ``training=False`` (reusing ``encoder`` and ``lb``), predicted with
    ``model.predict`` and scored with ``compute_model_metrics``. The
    precision, recall and F-beta of every slice are appended to
    ``output_path``; the feature and value being processed are echoed to
    stdout.

    Args:
        df: DataFrame holding the ``cat_features`` columns and ``label``.
        cat_features: Column names to slice on; also forwarded to
            ``process_data`` as ``categorical_features``.
        encoder: Encoder object forwarded to ``process_data``.
        lb: Label binarizer forwarded to ``process_data``.
        model: Object exposing ``predict(X)``.
        label: Name of the target column. Defaults to ``" salary"``.
        output_path: Report file. It is opened in append mode, so repeated
            calls accumulate results rather than overwrite them.

    Returns:
        None. Results are written to ``output_path``.
    """
    # Open the report once for the whole run instead of once per slice.
    with open(output_path, 'a') as report:
        for cat in cat_features:
            print("Categoria: ", cat)
            for cls in df[cat].unique():
                print(cls)
                df_slice = df[df[cat] == cls]
                if df_slice.empty:
                    # Happens when cls is NaN (NaN == NaN is False): there
                    # are no rows to score, so skip instead of predicting on
                    # an empty matrix.
                    continue

                # The encoder / binarizer returned here are not needed.
                X_slice, y_slice, _, _ = process_data(
                    df_slice,
                    categorical_features=cat_features,
                    label=label,
                    training=False,
                    encoder=encoder,
                    lb=lb,
                )

                preds = model.predict(X_slice)
                precision, recall, fbeta = compute_model_metrics(y_slice, preds)

                # !s applies str(), matching the original layout exactly while
                # also accepting non-string category values.
                report.write(f"\nCategory: {cat!s}, {cls!s}\n")
                report.write(f" -Precision: {precision!s}\n")
                report.write(f" -Recall: {recall!s}\n")
                report.write(f" -Fbeta: {fbeta!s}\n")

In [46]:
# NOTE: despite its name, df_temp is a boolean mask (one True/False per row),
# not a DataFrame. It is True where the " sex" column equals " Male".
df_temp = data[" sex"] == " Male"
df_temp

0         True
1         True
2         True
3         True
4        False
         ...  
30157    False
30158     True
30159    False
30160     True
30161    False
Name:  sex, Length: 30162, dtype: bool

In [47]:
# Keep only the rows where the mask is True, then display the result.
df_temp2=data[df_temp]
df_temp2

Unnamed: 0.1,Unnamed: 0,age,workclass,fnlgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
7,7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30154,32553,32,Private,116138,Masters,14,Never-married,Tech-support,Not-in-family,Asian-Pac-Islander,Male,0,0,11,Taiwan,<=50K
30155,32554,53,Private,321865,Masters,14,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,>50K
30156,32555,22,Private,310152,Some-college,10,Never-married,Protective-serv,Not-in-family,White,Male,0,0,40,United-States,<=50K
30158,32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K


In [67]:
# Process one slice by hand with training=False, reusing the encoder and
# label binarizer obtained earlier (the same call slice_data makes per slice).
X_temp, y_temp, encoder1, lb1 = process_data(
    df_temp2,
    categorical_features=cat_features,
    label=" salary",
    training=False,
    encoder=encoder,
    lb=lb,
)

In [68]:
# Length of the first row of the slice's feature matrix, for comparison with len(X[0]).
len(X_temp[0])

105

In [62]:
# Index the processed matrix X directly with the boolean mask and count the
# selected rows.
x_temporal=X[df_temp]
len(x_temporal)

20380

In [66]:
# Length of the first row of the full processed matrix.
len(X[0])

105

In [81]:
# Evaluate the model kept by the cross-validation loop on every categorical
# slice; slice_data prints each feature/value and writes the metrics to a file.
slice_data(data,cat_features,encoder,lb,best_model)

Categoria:   workclass
 State-gov
 Self-emp-not-inc
 Private
 Federal-gov
 Local-gov
 Self-emp-inc
 Without-pay
Categoria:   education
 Bachelors
 HS-grad
 11th
 Masters
 9th
 Some-college
 Assoc-acdm
 7th-8th
 Doctorate
 Assoc-voc
 Prof-school
 5th-6th
 10th
 Preschool
 12th
 1st-4th
Categoria:   marital-status
 Never-married
 Married-civ-spouse
 Divorced
 Married-spouse-absent
 Separated
 Married-AF-spouse
 Widowed
Categoria:   occupation
 Adm-clerical
 Exec-managerial
 Handlers-cleaners
 Prof-specialty
 Other-service
 Sales
 Transport-moving
 Farming-fishing
 Machine-op-inspct
 Tech-support
 Craft-repair
 Protective-serv
 Armed-Forces
 Priv-house-serv
Categoria:   relationship
 Not-in-family
 Husband
 Wife
 Own-child
 Unmarried
 Other-relative
Categoria:   race
 White
 Black
 Asian-Pac-Islander
 Amer-Indian-Eskimo
 Other
Categoria:   sex
 Male
 Female
Categoria:   native-country
 United-States
 Cuba
 Jamaica
 India
 Mexico
 Puerto-Rico
 Honduras
 England
 Canada
 Germany
 Iran
 Phil

In [22]:
## TEST

# Hand-written sample record: every field maps to a one-element list so the
# dict can be fed straight to pd.DataFrame as a single row.
obj = {
    "age": [25],
    "workclass": ["loquito"],
    "fnlgt": [33],
    "education": ["nel"],
    "education_num": [5],
    "marital_status": ["casado"],
    "occupation": ["nada"],
    "relationship": ["soltero"],
    "race": ["negro sorongo"],
    "sex": ["Man"],
    "capital_gain": [3],
    "capital_loss": [5],
    "hours_per_week": [45],
    "native_country": ["Gringolandia"],
}

In [23]:
# Build a DataFrame from the sample record (one column per key).
data_frame=pd.DataFrame(obj)

In [19]:
# Alternative construction with orient='index'. This rebinds data_frame,
# replacing the frame built by pd.DataFrame(obj).
# NOTE(review): the execution counts show this cell was run before the
# pd.DataFrame(obj) cell, so the displayed data_frame below appears to come
# from that cell rather than this one. Confirm which version is intended.
data_frame=pd.DataFrame.from_dict(obj, orient ='index')

In [21]:
# Show the sample record.
obj

{'age': 25,
 'workclass': 'loquito',
 'fnlgt': 33,
 'education': 'nel',
 'education_num': 5,
 'marital_status': 'casado',
 'occupation': 'nada',
 'relationship': 'soltero',
 'race': 'negro sorongo',
 'sex': 'Man',
 'capital_gain': 3,
 'capital_loss': 5,
 'hours_per_week': 45,
 'native_country': 'Gringolandia'}

In [24]:
# Show the DataFrame built from the sample record.
data_frame

Unnamed: 0,age,workclass,fnlgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country
0,25,loquito,33,nel,5,casado,nada,soltero,negro sorongo,Man,3,5,45,Gringolandia


In [31]:
def algo():
    """Return the integers 1 through 4 as a 4-tuple."""
    return tuple(range(1, 5))

In [33]:
# Unpack the four values returned by algo() into separate names.
uno,dos,tres,cuatro=algo()

# Print only the first two.
print(uno,dos)

1 2
