In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import seaborn.apionly as sns
%matplotlib inline

In [196]:
# Load the survey data and list the distinct party-affiliation values
gssdata = pd.read_csv('gssdata4.csv')
gssdata['partyid'].unique()

array(['rep', 'other', 'dem'], dtype=object)

In [242]:
# Preprocess: one-hot encode the categorical columns (dropping the first level of
# each) and derive the binary target from the `health` column

dummies_df = pd.get_dummies(
    gssdata,
    columns=['sex','sexornt','race','married','partyid'],
    drop_first=True,
)
poorhealth = np.where(gssdata['health'] == 'poor', 1, 0)  # Target variable: 1 = poor health
dummies_df['poorhealth'] = poorhealth
print("Parcela dos dados que são da classe poorhealth: {:.2f}% ".format(100*dummies_df['poorhealth'].mean()))

Parcela dos dados que são da classe poorhealth: 6.38% 


In [243]:
# Handle missing data: fill missing `income` values with the column mean.
# Rows are kept rather than dropped, so the positive-class share printed below
# is computed over the same rows as before.
dummies_df['income'] = dummies_df['income'].fillna(dummies_df['income'].mean())
print("Parcela dos dados que são da classe poorhealth: {:.2f}% ".format(100*dummies_df['poorhealth'].mean()))
dummies_df.info()



Parcela dos dados que são da classe poorhealth: 6.38% 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1551 entries, 0 to 1550
Data columns (total 14 columns):
health                              1551 non-null object
age                                 1551 non-null int64
educ                                1551 non-null int64
income                              1551 non-null float64
sex_male                            1551 non-null uint8
sexornt_dont know                   1551 non-null uint8
sexornt_heterosexual or straight    1551 non-null uint8
sexornt_homosexual or gay           1551 non-null uint8
race_other                          1551 non-null uint8
race_white                          1551 non-null uint8
married_1                           1551 non-null uint8
partyid_other                       1551 non-null uint8
partyid_rep                         1551 non-null uint8
poorhealth                          1551 non-null int32
dtypes: float64(1), int32(1), int64(2), object(1), 

In [133]:
# Balance the classes by downsampling the majority class (poorhealth == 0)
# NOTE(review): this cell overwrites dummies_df with the balanced subset. The
# recorded outputs of later cells (e.g. 909 training rows) are larger than the
# 198 rows produced here, so they appear to come from a run that skipped this
# cell — confirm whether it should stay in the pipeline.

dowsampled_df = dummies_df[dummies_df.poorhealth == 0]
counts = dummies_df.poorhealth.value_counts()
downsize = counts[1]  # number of minority-class (poorhealth == 1) rows

# Keep as many majority rows as there are minority rows. Draw WITHOUT
# replacement: np.random.choice defaults to replace=True, which would duplicate
# some majority rows instead of selecting distinct ones.
selected_rows = np.random.choice(len(dowsampled_df),size=downsize,replace=False)
dowsampled_df = dowsampled_df.iloc[selected_rows,:]

new_df = pd.concat([dowsampled_df,dummies_df[dummies_df.poorhealth==1]],axis=0)
# A single random permutation is already a uniform shuffle, so one pass suffices
new_df = new_df.iloc[np.random.permutation(new_df.shape[0]),:]
dummies_df =  new_df
dummies_df.shape

(198, 14)

In [225]:
# Inspect the column names of the encoded frame
dummies_df.columns

Index(['health', 'age', 'educ', 'income', 'sex_male', 'sexornt_dont know',
       'sexornt_heterosexual or straight', 'sexornt_homosexual or gay',
       'race_other', 'race_white', 'married_1', 'partyid_other', 'partyid_rep',
       'poorhealth'],
      dtype='object')

In [244]:
# Remove the 'sexornt_dont know' indicator column, then show the remaining columns
dummies_df = dummies_df.drop(labels=['sexornt_dont know'], axis='columns')
dummies_df.columns

Index(['health', 'age', 'educ', 'income', 'sex_male',
       'sexornt_heterosexual or straight', 'sexornt_homosexual or gay',
       'race_other', 'race_white', 'married_1', 'partyid_other', 'partyid_rep',
       'poorhealth'],
      dtype='object')

#### Dataset desbalanceado!

In [247]:
# Helper that min-max scales the income column

def norm(xtrain,y):
    """Return a copy of ``y`` with its ``income`` column min-max scaled.

    The scaling bounds are the minimum and maximum of ``y.income`` itself.

    Parameters
    ----------
    xtrain : accepted for call-site symmetry but never read by this function.
    y : DataFrame with an ``income`` column; it is not modified.

    Returns
    -------
    DataFrame
        Copy of ``y`` where ``income`` is ``(income - min) / (max - min)``.

    NOTE(review): because ``xtrain`` is ignored, ``norm(xtrain, xtest)`` scales
    the test data with the test set's own bounds, not the training bounds.
    """
    lowest, highest = y.income.min(), y.income.max()
    return y.assign(income=(y.income - lowest) / (highest - lowest))

# Split the rows into train and test with a random boolean mask:
# rows whose uniform draw is <= 0.6 go to train, the rest to test

msk = np.random.rand(len(dummies_df)) <= 0.6
train, test = dummies_df[msk], dummies_df[~msk]



In [248]:
# Check that the positive-class share is similar in the train and test splits
print('Train Positive ratio {:.2f}%'.format(100*(train.poorhealth == 1).mean()))
print('Test Positive ratio {:.2f}%'.format(100*(test.poorhealth == 1).mean()))


Train Positive ratio 6.53%
Test Positive ratio 6.18%


In [231]:
# Number of rows and columns in the training split
train.shape

(909, 13)

In [249]:
# Balance the TRAINING set only by upsampling the minority class (poorhealth == 1);
# the test split is left with its original class ratio.

upsampled_df = train[train.poorhealth == 1]
counts = train.poorhealth.value_counts()
# Extra positive rows needed to match the negatives; clamped at 0 so an already
# balanced (or positive-heavy) training set does not crash np.random.choice
upsize = max(counts[0]-counts[1], 0)

# Sample minority rows with replacement so that any number of copies can be drawn
selected_rows = np.random.choice(len(upsampled_df),size=upsize,replace=True)
upsampled_df = upsampled_df.iloc[selected_rows,:]

train = pd.concat([train,upsampled_df],axis=0)
# A single random permutation is already a uniform shuffle, so one pass suffices
train = train.iloc[np.random.permutation(train.shape[0]),:]

# Separate the target from the features; the raw `health` text column is dropped too
ytrain = train.poorhealth.values
xtrain =  train.drop(['health','poorhealth'],axis=1)
ytest = test.poorhealth.values
xtest = test.drop(['health','poorhealth'],axis=1)

# Normalize income. The bounds are taken from the training data BEFORE it is
# rescaled and then applied to both splits, so the test set is transformed with
# training statistics only (no information from the test set is used).
income_min = xtrain.income.min()
income_max = xtrain.income.max()
xtrain = norm(xtrain,xtrain)
xtest = xtest.copy()
xtest.income = (xtest.income-income_min)/(income_max-income_min)

In [233]:
# Feature columns that remain after removing `health` and `poorhealth`
xtrain.columns


Index(['age', 'educ', 'income', 'sex_male', 'sexornt_heterosexual or straight',
       'sexornt_homosexual or gay', 'race_other', 'race_white', 'married_1',
       'partyid_other', 'partyid_rep'],
      dtype='object')

In [252]:
# Train an RBF-kernel SVM, tuning C and gamma with a 5-fold grid search

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix,make_scorer,mean_squared_error,mean_absolute_error,average_precision_score

# NOTE(review): this scorer is built but never handed to GridSearchCV below (no
# `scoring` argument is passed), and its name does not match the metric it wraps.
mse = make_scorer(average_precision_score)
Cvals = [10**exponent for exponent in range(-3,4)]  # 1e-3 ... 1e3, one value per decade
grid_param = dict(C=Cvals, gamma=Cvals)
grid_svc = GridSearchCV(estimator=SVC(kernel='rbf'), param_grid=grid_param, cv=5)
grid_svc.fit(xtrain,ytrain)

# Evaluate the tuned model on the held-out test split
yhat = grid_svc.predict(xtest)
print(confusion_matrix(ytest,yhat))


[[621   1]
 [ 41   0]]


In [253]:
# Recompute the SVM predictions on the test split and print the confusion matrix
yhat = grid_svc.predict(X=xtest)
print(confusion_matrix(y_true=ytest, y_pred=yhat))

[[621   1]
 [ 41   0]]


In [238]:
# Report the tuned SVM's score on both splits and the selected hyper-parameters
print("Train score {:.3f} ".format(grid_svc.score(X=xtrain, y=ytrain)))
print("Test score {:.3f}".format(grid_svc.score(X=xtest, y=ytest)))
print("Best Parameters ",grid_svc.best_params_)

Train score 0.999 
Test score 0.927
Best Parameters  {'C': 1, 'gamma': 100}


In [254]:
# Fit a logistic regression, tuning its C parameter with a 5-fold grid search

from sklearn.linear_model import LinearRegression  # NOTE(review): not used in this cell

Cvals = [10**exponent for exponent in range(-4,4)]  # 1e-4 ... 1e3, one value per decade
grid_param = dict(C=Cvals)
grid = GridSearchCV(estimator=LogisticRegression(), param_grid=grid_param, cv=5)

grid.fit(xtrain,ytrain)
yhat = grid.predict(xtest)
confusion_matrix(y_true=ytest, y_pred=yhat)

array([[419, 203],
       [ 16,  25]])

In [255]:
# Report the tuned logistic regression's score on both splits
print('Train Score ',grid.score(X=xtrain, y=ytrain))
print('Test Score ',grid.score(X=xtest, y=ytest))

Train Score  0.678313253012
Test Score  0.669683257919


In [305]:
# Sweep the decision threshold over np.arange(0.1, 1, 0.01) and record, for each
# threshold, the sum of the confusion-matrix diagonal (correctly classified test
# rows of both classes — note that `tps` therefore holds more than true positives)
probs = grid.predict_proba(xtest)
tps = []
for i in np.arange(0.1,1,0.01):
    pred = (probs[:,1] > i).astype(int)  # 1 where the positive-class probability exceeds the threshold
    cm = confusion_matrix(ytest,pred)
    tps.append(cm[0][0] + cm[1][1])
