# Importing everything

In [None]:
import pandas as pd
results = pd.DataFrame()

In [None]:
FROM_DRIVE = False
DROP_CAPITAL_GAIN_AND_LOSS = False


import itertools
from time import time
import math

# Sklearn

from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier



# Metrics
from sklearn.metrics import recall_score as rec
from sklearn.metrics import precision_score as prec
from sklearn.metrics import  accuracy_score as acc
from sklearn.metrics import  f1_score as f1_scr

# Drive
from google.colab import drive

def round_up(n, decimals=0):
    multiplier = 10 ** decimals
    return math.ceil(n * multiplier) / multiplier

if FROM_DRIVE:
  sheet_id = "1i6irlQ9MJlOwtvBEvwKmhUZQaHaTsWhTEOMZHzXp2Tg"
  sheet_name = "adult"
  url = f"https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv&sheet={sheet_name}"
  DataSet = pd.read_csv(url)

  sheet_test_id = "1rNIlF4VfmwBlDjo53u2qSYaZBKtjbj9sOWaemQfqrvk"
  sheet_test_name = "adult_test"
  url_test = f"https://docs.google.com/spreadsheets/d/{sheet_test_id}/gviz/tq?tqx=out:csv&sheet={sheet_test_name}"
  DataSet_test = pd.read_csv(url_test)

  DataSet = DataSet.append(DataSet_test, ignore_index=True)
else:
  DataSet = pd.read_csv('adult.csv')
  DataSet_test = pd.read_csv('adult_test.csv')
  DataSet = DataSet.append(DataSet_test, ignore_index=True)

DataSet


# Data processing and splitting

In [None]:
from sklearn import preprocessing as preprocessing
label_encoder=preprocessing.LabelEncoder()

quantitative_features = ["workclass", "education", "marital-status", "occupation", "relationship", "race", "sex", "native-country"]

mode = DataSet.mode()

for feature in quantitative_features:
    DataSet = DataSet.replace("?",mode[feature][0])
    label_encoder.fit(DataSet[feature])
    DataSet[feature]=label_encoder.transform(DataSet[feature])

label_encoder.fit(DataSet['outcome'])
DataSet['outcome']=label_encoder.transform(DataSet['outcome'])

print("Outcome: 0 stands for <=50k, 1 stands for >50k")

print("Capital-gain has "+str((DataSet['capital-gain']==0).sum()/(DataSet['capital-gain'].count())*100) + " % of it's values as 0")
print("Capital-loss has "+str((DataSet['capital-loss']==0).sum()/(DataSet['capital-loss'].count())*100) + " % of it's values as 0")

DataSet.drop(columns=['education'], inplace=True)

if DROP_CAPITAL_GAIN_AND_LOSS:
  DataSet.drop(columns=['capital-gain', 'capital-loss'], inplace=True)
  print("Capital-loss and Capital-gainhas been dropped")

X = DataSet.drop(columns=['outcome'])
y = DataSet['outcome']






In [4]:
DataSet

Unnamed: 0,age,workclass,fnlwgt,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,outcome
0,39,6,77516,13,4,0,1,4,1,2174,0,40,39,0
1,50,5,83311,13,2,3,0,4,1,0,0,13,39,0
2,38,3,215646,9,0,5,1,4,1,0,0,40,39,0
3,53,3,234721,7,2,5,0,2,1,0,0,40,39,0
4,28,3,338409,13,2,10,5,2,0,0,0,40,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,39,3,215419,13,0,10,1,4,0,0,0,36,39,0
48838,64,3,321403,9,6,9,2,2,1,0,0,40,39,0
48839,38,3,374983,13,2,10,0,4,1,0,0,50,39,0
48840,44,3,83891,13,0,0,3,1,1,5455,0,40,39,0


In [5]:
# for i in DataSet.index: 
#     if DataSet["outcome"][i] == 0:
#         DataSet["outcome"][i] = 1
#     else: 
#         DataSet["outcome"][i] = 0
# DataSet

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
clfRandomForest = RandomForestClassifier(criterion='entropy', max_depth=None, max_features='log2',min_samples_leaf = 2, min_samples_split = 5, n_estimators = 800, random_state = 128 )
clfRandomForest.fit(X_train, y_train)
y_hat = clfRandomForest.predict(X_test)
results = results.append({'Model': 'Random Forest Classifier','Accuracy': acc(y_test,y_hat), 'Precision': prec(y_hat,y_test), 'Recall': rec(y_test,y_hat), 'F1_Score': f1_scr(y_test,y_hat)}, ignore_index=True)
print('Scale: Accuracy: %.4f Precision: %.4f Recal: %.4f F1_Score: %.4f'%(acc(y_test,y_hat),rec(y_test,y_hat),prec(y_hat,y_test),f1_scr(y_test,y_hat)))

Scale: Accuracy: 0.8664 Precision: 0.6257 Recal: 0.6257 F1_Score: 0.6945


In [7]:
results

Unnamed: 0,Model,Accuracy,Precision,Recall,F1_Score
0,Random Forest Classifier,0.866432,0.625717,0.625717,0.694512


# Doing hyper tunning by hand, maybe test set especialization

In [8]:

# max_depth =  [40, None]
# max_features =  ['log2']
# min_samples_leaf =  [2,3,4,5,6,7,8,9,10]
# min_samples_split =  [2,3,4,5,6,7,8,9,10] 
# n_estimators=  [800, 1000, 1400]


# combinate = [max_depth,max_features,min_samples_leaf,min_samples_split,n_estimators]

# all_permutations = list(itertools.product(*combinate))
# print(len(all_permutations))
# #     0               1                 2                   3                   4
# # max_depth     max_features    min_samples_leaf    min_samples_split     n_estimators
# print(all_permutations[:10])
# print(all_permutations[-10:-1])

In [9]:
# run_number = 1
# for i in all_permutations:
#     t = time()
#     clfRandomForest = RandomForestClassifier(criterion='entropy', max_depth=i[0], max_features=i[1],min_samples_leaf = i[2], min_samples_split = i[3], n_estimators = i[4] )
#     clfRandomForest.fit(X_train, y_train)
#     y_hat = clfRandomForest.predict(X_test)
#     final_time = str(round_up(time()-t))
#     print(f'---------------Run number {run_number}------------------------------')
#     results = results.append({'Model': 'Random Forest Classifier','Accuracy': acc(y_test,y_hat), 'Precision': prec(y_hat,y_test), 'Recall': rec(y_test,y_hat), 'F1_Score': f1_scr(y_test,y_hat), 'Configuration': str(i), 'Duration': final_time}, ignore_index=True)
#     print('Scale: Accuracy: %.4f Precision: %.4f Recal: %.4f F1_Score: %.4f'%(acc(y_test,y_hat),rec(y_test,y_hat),prec(y_hat,y_test),f1_scr(y_test,y_hat)))
#     print(f'Duração: {final_time}')
#     print('Configuration:')
#     print(i)
#     run_number = run_number + 1

In [None]:
run_number = 1

for i in range(1,40,1):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=run_number)
    t = time()
    clfRandomForest = RandomForestClassifier(criterion='entropy', max_depth=None, max_features='log2',min_samples_leaf = 2, min_samples_split = 5, n_estimators = 800, random_state = 128 )
    clfRandomForest.fit(X_train, y_train)
    y_hat = clfRandomForest.predict(X_test)
    final_time = str(round_up(time()-t))
    print(f'---------------Run number {run_number}------------------------------')
    results = results.append({'Model': 'Random Forest Classifier','Accuracy': acc(y_test,y_hat), 'Precision': prec(y_hat,y_test), 'Recall': rec(y_test,y_hat), 'F1_Score': f1_scr(y_test,y_hat), 'Configuration': str(i), 'Duration': final_time}, ignore_index=True)
    print('Scale: Accuracy: %.4f Precision: %.4f Recal: %.4f F1_Score: %.4f'%(acc(y_test,y_hat),rec(y_test,y_hat),prec(y_hat,y_test),f1_scr(y_test,y_hat)))
    print(f'Duração: {final_time}')
    print('Configuration:')
    print(i)
    run_number = run_number + 1
    

---------------Run number 1------------------------------
Scale: Accuracy: 0.8645 Precision: 0.6283 Recal: 0.6283 F1_Score: 0.6892
Duração: 37.0
Configuration:
1


In [None]:
results

Unnamed: 0,Model,Accuracy,Precision,Recall,F1_Score,Configuration,Duration
0,Random Forest Classifier,0.866432,0.943555,0.943555,0.914531,,
1,Random Forest Classifier,0.864548,0.938765,0.938765,0.913403,1,40.0
2,Random Forest Classifier,0.860945,0.940187,0.940187,0.910726,2,40.0
3,Random Forest Classifier,0.863402,0.938780,0.938780,0.912615,3,40.0
4,Random Forest Classifier,0.864385,0.940183,0.940183,0.913452,4,40.0
...,...,...,...,...,...,...,...
195,Random Forest Classifier,0.869298,0.943735,0.943735,0.916762,195,40.0
196,Random Forest Classifier,0.867742,0.943978,0.943978,0.915767,196,40.0
197,Random Forest Classifier,0.868561,0.943579,0.943579,0.916254,197,40.0
198,Random Forest Classifier,0.863238,0.944149,0.944149,0.912327,198,40.0
