In [29]:
import numpy as np
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [30]:
# Read the raw file. header=None tells pandas there is no header row, so the
# column labels are supplied explicitly through `names`.
data = pd.read_csv(
    'adult.data',
    header=None,
    index_col=False,
    names=[
        'age', 'workclass', 'fnlwgt', 'education', 'education-num',
        'marital-status', 'occupation', 'relationship',
        'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week',
        'native-country', 'income',
    ],
)

# One-hot encode the non-numeric columns and show the resulting column labels.
data_dummies = pd.get_dummies(data)
print(list(data_dummies.columns))

# Label-based slice (inclusive at both ends): every column from 'age' up to
# 'native-country_ Yugoslavia'.
# NOTE(review): presumably this leaves out only the income_* dummies - confirm
# against the printed column list.
features = data_dummies.loc[:, 'age':'native-country_ Yugoslavia']
X = features.values

# Target: the indicator column for the ' <=50K' income class.
y = data_dummies['income_ <=50K'].values

['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week', 'workclass_ ?', 'workclass_ Federal-gov', 'workclass_ Local-gov', 'workclass_ Never-worked', 'workclass_ Private', 'workclass_ Self-emp-inc', 'workclass_ Self-emp-not-inc', 'workclass_ State-gov', 'workclass_ Without-pay', 'education_ 10th', 'education_ 11th', 'education_ 12th', 'education_ 1st-4th', 'education_ 5th-6th', 'education_ 7th-8th', 'education_ 9th', 'education_ Assoc-acdm', 'education_ Assoc-voc', 'education_ Bachelors', 'education_ Doctorate', 'education_ HS-grad', 'education_ Masters', 'education_ Preschool', 'education_ Prof-school', 'education_ Some-college', 'marital-status_ Divorced', 'marital-status_ Married-AF-spouse', 'marital-status_ Married-civ-spouse', 'marital-status_ Married-spouse-absent', 'marital-status_ Never-married', 'marital-status_ Separated', 'marital-status_ Widowed', 'occupation_ ?', 'occupation_ Adm-clerical', 'occupation_ Armed-Forces', 'occupation_ Craft-repair',

In [13]:
# K-nearest neighbours
knn = KNeighborsClassifier()

# Search for the best hyper-parameters: neighbourhood sizes 1..30 combined with
# both weighting schemes.
k_range = list(range(1, 31))
weight_options = ['uniform', 'distance']
param_dist = {'n_neighbors': k_range, 'weights': weight_options}

# Randomised search: 5 sampled candidates, each scored by 10-fold CV accuracy.
rand = RandomizedSearchCV(
    estimator=knn,
    param_distributions=param_dist,
    n_iter=5,
    scoring='accuracy',
    cv=10,
    random_state=5,
)
rand.fit(X, y)

# Per-candidate scores, then the best score and the parameters that produced it.
print(rand.grid_scores_)
print(rand.best_score_)
print(rand.best_params_)

[mean: 0.79362, std: 0.00418, params: {'n_neighbors': 16, 'weights': 'distance'}, mean: 0.79884, std: 0.00457, params: {'n_neighbors': 22, 'weights': 'uniform'}, mean: 0.79681, std: 0.00481, params: {'n_neighbors': 18, 'weights': 'uniform'}, mean: 0.79887, std: 0.00403, params: {'n_neighbors': 27, 'weights': 'uniform'}, mean: 0.79936, std: 0.00415, params: {'n_neighbors': 29, 'weights': 'uniform'}]
0.79936119898
{'n_neighbors': 29, 'weights': 'uniform'}


In [14]:
# Random forest
# Seed the estimator as well as the search. RandomizedSearchCV's random_state
# only fixes which candidates are sampled; an unseeded forest still draws
# different bootstrap samples / feature subsets on every run, so the reported
# CV scores would not be repeatable.
rf = RandomForestClassifier(random_state=5)

# Parameter search space: split criterion, tree depth 1..30, forest size 1..30.
n_estimators = list(range(1,31))
max_depth = list(range(1,31))
criterion = ['gini','entropy']
param_dist = dict(criterion = criterion, max_depth=max_depth,
                 n_estimators=n_estimators)

# Randomised search: 5 sampled candidates, each scored by 10-fold CV accuracy.
rand = RandomizedSearchCV(rf,param_dist,cv=10,scoring='accuracy',
                         n_iter=5,random_state=5)
rand.fit(X,y)

# Per-candidate scores, then the best score and the parameters that produced it.
print(rand.grid_scores_)
print(rand.best_score_)
print(rand.best_params_)

[mean: 0.85762, std: 0.00715, params: {'n_estimators': 28, 'criterion': 'gini', 'max_depth': 29}, mean: 0.84939, std: 0.00753, params: {'n_estimators': 27, 'criterion': 'gini', 'max_depth': 7}, mean: 0.85633, std: 0.00490, params: {'n_estimators': 16, 'criterion': 'entropy', 'max_depth': 28}, mean: 0.82021, std: 0.00792, params: {'n_estimators': 9, 'criterion': 'entropy', 'max_depth': 4}, mean: 0.84377, std: 0.00821, params: {'n_estimators': 3, 'criterion': 'entropy', 'max_depth': 9}]
0.857621080434
{'n_estimators': 28, 'criterion': 'gini', 'max_depth': 29}


In [15]:
# AdaBoost
ab = AdaBoostClassifier()

# Parameter search space: both boosting algorithms, ensemble sizes 1..30.
n_estimators = list(range(1, 31))
algorithm = ['SAMME', 'SAMME.R']
param_dist = {'algorithm': algorithm, 'n_estimators': n_estimators}

# Randomised search: 5 sampled candidates, each scored by 10-fold CV accuracy.
rand = RandomizedSearchCV(
    estimator=ab,
    param_distributions=param_dist,
    n_iter=5,
    scoring='accuracy',
    cv=10,
    random_state=5,
)
rand.fit(X, y)

# Per-candidate scores, then the best score and the parameters that produced it.
print(rand.grid_scores_)
print(rand.best_score_)
print(rand.best_params_)

[mean: 0.81935, std: 0.00587, params: {'n_estimators': 2, 'algorithm': 'SAMME.R'}, mean: 0.85194, std: 0.00656, params: {'n_estimators': 13, 'algorithm': 'SAMME.R'}, mean: 0.84313, std: 0.00652, params: {'n_estimators': 5, 'algorithm': 'SAMME.R'}, mean: 0.85406, std: 0.00551, params: {'n_estimators': 23, 'algorithm': 'SAMME.R'}, mean: 0.85498, std: 0.00540, params: {'n_estimators': 27, 'algorithm': 'SAMME.R'}]
0.85497988391
{'n_estimators': 27, 'algorithm': 'SAMME.R'}


In [16]:
#SVM->SVC
svc = SVC()
#Busqueda de los parametros
kernel = ['linear','poly','rbf','sigmoid','precomputed']
max_iter = list(range(-1,10))
param_dist = dict(kernel=kernel,max_iter=max_iter)
rand = RandomizedSearchCV(svc,param_dist,cv=10,scoring='accuracy',
                         n_iter=5,random_state=5)
rand.fit(X,y)
print(rand.grid_scores_)
print(rand.best_score_)
print(rand.best_params_)



[mean: 0.76011, std: 0.00132, params: {'kernel': 'poly', 'max_iter': 5}, mean: 0.75919, std: 0.00007, params: {'kernel': 'sigmoid', 'max_iter': 7}, mean: 0.70713, std: 0.15545, params: {'kernel': 'rbf', 'max_iter': 8}, mean: 0.65520, std: 0.20698, params: {'kernel': 'poly', 'max_iter': 9}, mean: 0.29283, std: 0.15546, params: {'kernel': 'rbf', 'max_iter': 3}]
0.760111790178
{'kernel': 'poly', 'max_iter': 5}


In [None]:
# Gradient boosting
gb = GradientBoostingClassifier()

# Parameter search space.
n_estimators = list(range(1,31))
max_depth = list(range(1,31))
min_samples_leaf = list(range(1,11))
# An integer min_samples_split must be at least 2 (a node needs two samples to
# be split); a candidate of 1 is rejected by the estimator and would abort the
# search, so the range starts at 2.
min_samples_split = list(range(2,11))
criterion = ['friedman_mse','mse','mae']
param_dist = dict(criterion = criterion, max_depth=max_depth,
                 n_estimators=n_estimators,min_samples_leaf=min_samples_leaf,
                 min_samples_split=min_samples_split)

# Randomised search: 5 sampled candidates, each scored by 10-fold CV accuracy.
rand = RandomizedSearchCV(gb,param_dist,cv=10,scoring='accuracy',
                         n_iter=5,random_state=5)
rand.fit(X,y)

# Per-candidate scores, then the best score and the parameters that produced it.
print(rand.grid_scores_)
print(rand.best_score_)
print(rand.best_params_)