## **PREVISÃO DE SALÁRIO COM RANDOM FOREST**

In [None]:
! pip install ydata-profiling

In [None]:
! pip install ipywidgets

In [13]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from ydata_profiling import ProfileReport

In [14]:
data = pd.read_csv('adults.txt', sep=',')
data.head()

Unnamed: 0,age,workclass,final_weight,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


### **Análise Exploratória**

In [None]:
profile = ProfileReport(data, title='Relatório Base de Dados', html={'style':{'full_width':True}})
profile.to_notebook_iframe()
profile.to_file(output_file='Relatório Base de Dados.html')

### **Pré-Processamento dos Dados**

In [20]:
# Convertedr Variáveis Categórias para variáveis Numéricas
for variavel in ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country']:
    data[variavel] = LabelEncoder().fit_transform(data[variavel])

In [21]:
data.head()

Unnamed: 0,age,workclass,final_weight,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,salary
0,39,7,77516,9,13,4,1,1,4,1,2174,0,40,39,<=50K
1,50,6,83311,9,13,2,4,0,4,1,0,0,13,39,<=50K
2,38,4,215646,11,9,0,6,1,4,1,0,0,40,39,<=50K
3,53,4,234721,1,7,2,6,0,2,1,0,0,40,39,<=50K
4,28,4,338409,9,13,2,10,5,2,0,0,0,40,5,<=50K


In [27]:
X = data.drop('salary', axis=1)
y = data['salary']

In [28]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.20, random_state=42)

### **Criando a Máquina Preditiva**

In [30]:
modelo = RandomForestClassifier(n_estimators=1000)

modelo.fit(Xtrain, ytrain)

### **Acurácia do Modelo**

In [33]:
acuracia = modelo.score(Xtest, ytest)
print('Acurácia: %.2f%%' % acuracia)


Acurácia: 0.86%


### **Importância das Variáveis**

In [38]:
importancia = modelo.feature_importances_

print(X.columns)
print(importancia)

Index(['age', 'workclass', 'final_weight', 'education', 'education_num',
       'marital_status', 'occupation', 'relationship', 'race', 'sex',
       'capital_gain', 'capital_loss', 'hours_per_week', 'native_country'],
      dtype='object')
[0.14902446 0.04087836 0.16871272 0.03255472 0.09236347 0.06908727
 0.0681186  0.10393168 0.01401207 0.01266051 0.11217075 0.03485046
 0.08405823 0.0175767 ]


### **Previsões**

In [39]:
ypred = modelo.predict(Xtest)

In [40]:
print(ypred)

['<=50K' '<=50K' '>50K' ... '>50K' '<=50K' '<=50K']
