In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import Imputer
from sklearn.model_selection import StratifiedKFold
from sklearn import linear_model
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

In [2]:
dados = pd.read_csv("datasets/adult_data.txt")

In [3]:
dados.head()

Unnamed: 0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K


### Nomeando as colunas

In [4]:
dados.columns = ['Age', 'Workclass', 'fnlwgt', 'Education', 'Education-num', 'Marital-status', 'Occupation', 'Relationship', 'Race', 'Sex', 'Capital-gain', 'Capital-loss', 'Hours-per-week', 'Native-country', 'Income']

In [5]:
dados.head()

Unnamed: 0,Age,Workclass,fnlwgt,Education,Education-num,Marital-status,Occupation,Relationship,Race,Sex,Capital-gain,Capital-loss,Hours-per-week,Native-country,Income
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K


### Transformando a coluna income e a coluna sex para zero ou um

In [6]:
label_income = {label: idx for idx, label in enumerate(np.unique(dados['Income']))}
label_income

{' <=50K': 0, ' >50K': 1}

In [7]:
feature_sex = {feature: idx for idx, feature in enumerate(np.unique(dados['Sex']))}
feature_sex

{' Female': 0, ' Male': 1}

In [8]:
# to convert class labels from strings to integers
dados['Income'] = dados['Income'].map(label_income)
dados['Sex'] = dados['Sex'].map(feature_sex)
dados.head()

Unnamed: 0,Age,Workclass,fnlwgt,Education,Education-num,Marital-status,Occupation,Relationship,Race,Sex,Capital-gain,Capital-loss,Hours-per-week,Native-country,Income
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,1,0,0,13,United-States,0
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,1,0,0,40,United-States,0
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,1,0,0,40,United-States,0
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,0,0,0,40,Cuba,0
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,0,0,0,40,United-States,0


### Transformando ? para NaN

In [9]:
map_replace = {'Workclass': {' ?': np.nan},
 'Occupation': {' ?': np.nan},
 'Native-country': {' ?': np.nan}}

dados = dados.replace(map_replace)
dados.head()

Unnamed: 0,Age,Workclass,fnlwgt,Education,Education-num,Marital-status,Occupation,Relationship,Race,Sex,Capital-gain,Capital-loss,Hours-per-week,Native-country,Income
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,1,0,0,13,United-States,0
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,1,0,0,40,United-States,0
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,1,0,0,40,United-States,0
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,0,0,0,40,Cuba,0
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,0,0,0,40,United-States,0


### Excluindo todas as linhas que tinha um valor NaN

In [10]:
dados = dados.dropna(axis=0)
dados.isnull().sum()

Age               0
Workclass         0
fnlwgt            0
Education         0
Education-num     0
Marital-status    0
Occupation        0
Relationship      0
Race              0
Sex               0
Capital-gain      0
Capital-loss      0
Hours-per-week    0
Native-country    0
Income            0
dtype: int64

### Excluindo a coluna Education

In [11]:
dados = dados.drop('Education', axis=1)

### Transformando as features categoricas em binario

In [12]:
dados = pd.get_dummies(dados[['Age', 'Workclass', 'fnlwgt', 'Education-num', 'Marital-status', 'Occupation', 'Relationship', 'Race', 'Sex', 'Capital-gain', 'Capital-loss', 'Hours-per-week', 'Native-country', 'Income']])

In [13]:
dados.head()

Unnamed: 0,Age,fnlwgt,Education-num,Sex,Capital-gain,Capital-loss,Hours-per-week,Income,Workclass_ Federal-gov,Workclass_ Local-gov,...,Native-country_ Portugal,Native-country_ Puerto-Rico,Native-country_ Scotland,Native-country_ South,Native-country_ Taiwan,Native-country_ Thailand,Native-country_ Trinadad&Tobago,Native-country_ United-States,Native-country_ Vietnam,Native-country_ Yugoslavia
0,50,83311,13,1,0,0,13,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,38,215646,9,1,0,0,40,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,53,234721,7,1,0,0,40,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,28,338409,13,0,0,0,40,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,37,284582,14,0,0,0,40,0,0,0,...,0,0,0,0,0,0,0,1,0,0


## Use 5-Fold Cross Validation Estratificada para obter a Accuracy através do uso dos seguintes algoritmos: Logistic Regression, kNN, Naive Bayes, SVM.

In [14]:
X = np.hstack((dados.iloc[:, : 7].values, dados.iloc[:, 8 : 87].values))
y = dados.iloc[:, 7].values

In [15]:
stratified = StratifiedKFold(n_splits=5, shuffle=True)
stratified.get_n_splits();

In [16]:
vet_acc_lr = []
vet_acc_knn = []
vet_acc_naive = []
vet_acc_svm = []

for index_train, index_test in stratified.split(X, y):
    X_train, X_test = X[index_train], X[index_test]
    y_train, y_test = y[index_train], y[index_test]
    
    std = preprocessing.StandardScaler().fit(X_train)
    X_train_std = std.transform(X_train)
    X_test_std = std.transform(X_test)
    
    #LogisticRegression
    model_lr = linear_model.LogisticRegression()
    model_lr.fit(X_train_std, y_train)
    y_pred_lr = model_lr.predict(X_test_std)
    vet_acc_lr.append(metrics.accuracy_score(y_test, y_pred_lr))
    
    #KNN
    model_knn = KNeighborsClassifier(n_neighbors=3).fit(X_train_std, y_train)
    y_pred_knn = model_knn.predict(X_test_std)
    vet_acc_knn.append(metrics.accuracy_score(y_test, y_pred_knn))
    
    #Naive Bayes
    model_naive = GaussianNB().fit(X_train_std, y_train)
    y_pred_naive = model_naive.predict(X_test_std)
    vet_acc_naive.append(metrics.accuracy_score(y_test, y_pred_naive))
    
    #SVM
    model_svm = SVC().fit(X_train_std, y_train)
    y_pred_svm = model_svm.predict(X_test_std)
    vet_acc_svm.append(metrics.accuracy_score(y_test, y_pred_svm))
    
    
print("Accuracy LogisticRegression: ", np.mean(vet_acc_lr))
print("Accuracy KNN: ", np.mean(vet_acc_knn))
print("Accuracy Naive Bayes: ", np.mean(vet_acc_naive))
print("Accuracy SVM: ", np.mean(vet_acc_svm))  



Accuracy LogisticRegression:  0.8475510871602309
Accuracy KNN:  0.8149264437733074
Accuracy Naive Bayes:  0.4975665344501775
Accuracy SVM:  0.8447661830528457


## Escolha a melhor técnica e crie um modelo final usando todo o dataset para treinar o modelo.

In [17]:
model_final = linear_model.LogisticRegression()
model_final.fit(X, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)