In [74]:
import warnings
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC

# Categorical data

In [75]:
# The file has no header row, so we pass header=None
# and provide the column names explicitly in "names".
# Cells equal to " ?" (note the leading space) are parsed as missing values.
data = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data", na_values=[" ?"],
    header=None, index_col=False,
    names=['age', 'workclass', 'fnlwgt', 'education', 'education-num',
           'marital-status', 'occupation', 'relationship', 'race', 'gender',
           'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
           'income'])
# For illustration purposes, we only select some of the columns.
# 'education-num' is cast to float before a missing value is injected, so the
# assignment below never needs an implicit dtype upcast (recent pandas
# versions deprecate upcasting during item assignment).
data = data[['workclass', 'age', 'education', 'education-num', 'occupation', 'capital-gain', 'gender', 'hours-per-week',
             'income']].astype({'education-num': 'float64'})
# Inject one missing numeric value in the first row
data.loc[0, 'education-num'] = np.nan
data.head()

Unnamed: 0,workclass,age,education,education-num,occupation,capital-gain,gender,hours-per-week,income
0,State-gov,39,Bachelors,,Adm-clerical,2174,Male,40,<=50K
1,Self-emp-not-inc,50,Bachelors,13.0,Exec-managerial,0,Male,13,<=50K
2,Private,38,HS-grad,9.0,Handlers-cleaners,0,Male,40,<=50K
3,Private,53,11th,7.0,Handlers-cleaners,0,Male,40,<=50K
4,Private,28,Bachelors,13.0,Prof-specialty,0,Female,40,<=50K


In [76]:
# Work on a positional slice of 999 rows (positions 1..999).
# NOTE(review): the slice starts at 1, so row 0 -- the row that received the
# injected missing 'education-num' -- is not part of the subsample; confirm
# whether [:1000] was intended.
data = data.iloc[1:1000]

In [77]:
# Number of missing entries per column
missing_per_column = data.isna().sum()
missing_per_column

workclass         62
age                0
education          0
education-num      0
occupation        62
capital-gain       0
gender             0
hours-per-week     0
income             0
dtype: int64

Teraz rzućmy okiem na wszystkie atrybuty kategoryczne:

In [78]:
# Frequency of each workclass value (value_counts() skips missing entries by default)
workclass_counts = data["workclass"].value_counts()
workclass_counts

workclass
Private             698
Self-emp-not-inc     81
Local-gov            68
State-gov            36
Self-emp-inc         33
Federal-gov          21
Name: count, dtype: int64

In [79]:
# Frequency of each education level
education_counts = data["education"].value_counts()
education_counts

education
HS-grad         321
Some-college    225
Bachelors       165
Masters          54
Assoc-voc        48
11th             46
Assoc-acdm       35
10th             21
9th              16
7th-8th          15
Doctorate        14
5th-6th          11
Prof-school      10
12th              9
1st-4th           7
Preschool         2
Name: count, dtype: int64

In [80]:
# Frequency of each gender value
gender_counts = data["gender"].value_counts()
gender_counts

gender
Male      670
Female    329
Name: count, dtype: int64

In [81]:
# Frequency of each occupation (value_counts() skips missing entries by default)
occupation_counts = data["occupation"].value_counts()
occupation_counts

occupation
Craft-repair         126
Exec-managerial      124
Prof-specialty       124
Sales                112
Other-service        107
Adm-clerical          93
Machine-op-inspct     61
Transport-moving      52
Tech-support          44
Handlers-cleaners     43
Farming-fishing       31
Protective-serv       16
Priv-house-serv        3
Armed-Forces           1
Name: count, dtype: int64

Sprawdźmy, czy etykiety przyjmują wartości 0 lub 1.

Jeśli nie, to musimy je troszkę przerobić.

In [82]:
# Distribution of the raw target labels
income_counts = data["income"].value_counts()
income_counts

income
<=50K    767
>50K     232
Name: count, dtype: int64

In [83]:
# Predictors: every column except the target
X = data.drop(columns=['income'])

# Target: income labels mapped to integer codes by LabelEncoder
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data['income'].to_numpy())

print("X.shape: {} y.shape: {}".format(X.shape, y.shape))
print(f"y values: {np.unique(y)}")

X.shape: (999, 8) y.shape: (999,)
y values: [0 1]


Podzielmy zbiór na train/test

In [84]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

Teraz zbudujmy nasze **pipeline** preprocessingu. 

Wykorzystamy DataframeSelector aby wybrać określone atrybuty z DataFrame:

In [85]:
# Column-selection step used at the start of the numeric and categorical pipelines
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Transformer that returns a fixed selection of columns from its input.

    Parameters
    ----------
    attribute_names
        Column label(s) used to index the input as ``X[attribute_names]``.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Learn nothing and return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by the configured attribute names."""
        wanted = self.attribute_names
        return X[wanted]

Zbudujmy **pipeline** dla atrybutów numerycznych:

In [86]:
# Median imputer for the numeric attribute; this instance is the one used by
# the pipeline below (no second, identical imputer is created).
imputer = SimpleImputer(strategy="median")

# Numeric branch: pick 'education-num', then fill missing values with the median
num_pipeline = Pipeline([
    ("select_numeric", DataFrameSelector(["education-num"])),
    ("imputer", imputer),
])

In [87]:
# Fit the numeric branch on the training split and display its output
X_train_numeric = num_pipeline.fit_transform(X_train)
X_train_numeric

array([[10.],
       [10.],
       [12.],
       [ 9.],
       [13.],
       [ 9.],
       [10.],
       [10.],
       [ 9.],
       [ 9.],
       [ 7.],
       [14.],
       [10.],
       [ 9.],
       [10.],
       [ 9.],
       [ 9.],
       [10.],
       [ 9.],
       [10.],
       [12.],
       [ 9.],
       [10.],
       [13.],
       [ 9.],
       [ 2.],
       [ 9.],
       [ 9.],
       [12.],
       [ 9.],
       [10.],
       [13.],
       [10.],
       [10.],
       [10.],
       [ 6.],
       [13.],
       [ 7.],
       [14.],
       [11.],
       [12.],
       [13.],
       [ 9.],
       [ 7.],
       [ 9.],
       [14.],
       [ 9.],
       [ 9.],
       [ 7.],
       [13.],
       [ 7.],
       [ 9.],
       [13.],
       [ 2.],
       [ 7.],
       [14.],
       [ 6.],
       [10.],
       [ 5.],
       [13.],
       [11.],
       [12.],
       [10.],
       [ 7.],
       [10.],
       [ 9.],
       [10.],
       [ 7.],
       [10.],
       [13.],
       [ 9.],
      

Będziemy także potrzebować imputera do kategorycznych kolumn napisowych (zwykły Imputer nie działa na tych kolumnach):

In [88]:
# Inspired from stackoverflow.com/questions/25239958
class MostFrequentImputer(BaseEstimator, TransformerMixin):
    """Fill missing values in each column with that column's most frequent value.

    After ``fit`` the learned fill values are stored in ``most_frequent_``,
    a Series indexed by the column names of the fitted DataFrame.
    """

    def fit(self, X, y=None):
        """Record the most frequent non-missing value of every column of ``X``.

        A column without any non-missing value gets NaN as its fill value,
        so it is left unchanged by ``transform`` instead of raising here.
        """
        def _top_value(column):
            counts = column.value_counts()
            # value_counts() excludes NaN by default, so an all-missing
            # column produces an empty result with no first index entry.
            return counts.index[0] if len(counts) else np.nan

        self.most_frequent_ = pd.Series([_top_value(X[c]) for c in X],
                                        index=X.columns)
        return self

    def transform(self, X, y=None):
        """Return ``X`` with missing entries replaced column-wise by the learned values."""
        return X.fillna(self.most_frequent_)

Teraz możemy zbudować pipeline dla atrybutów kategorycznych.

We can convert each categorical value to a one-hot vector using a OneHotEncoder. Since Scikit-Learn 0.20 this class handles string categorical inputs directly (see https://github.com/scikit-learn/scikit-learn/issues/10521), so we use the `OneHotEncoder` imported from `sklearn.preprocessing` above rather than a separate future_encoders.py backport.

In [89]:
# Categorical branch: select the string columns, fill gaps with the most
# frequent value, then one-hot encode (categories unseen during fit are ignored).
cat_attributes = ["workclass", "education", "occupation", "gender"]

cat_pipeline = Pipeline(steps=[
    ("select_cat", DataFrameSelector(cat_attributes)),
    ("imputer", MostFrequentImputer()),
    ("cat_encoder", OneHotEncoder(handle_unknown='ignore')),
])

In [90]:
# Fit the categorical branch on the training split and display its output
X_train_categorical = cat_pipeline.fit_transform(X_train)
X_train_categorical

<799x38 sparse matrix of type '<class 'numpy.float64'>'
	with 3196 stored elements in Compressed Sparse Row format>

Na koniec połączmy powyższe podejścia:

In [91]:
# Combine the numeric and categorical branches into one preprocessing step
preprocess_pipeline = FeatureUnion([
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])

# Zad

Robimy StratifiedKFold i znajdujemy optymalne parametry dla

* SVM z jądrem rbf
* SVM z jądrem poly
* SVM liniowego
* Regresji logistycznej

In [92]:
def grid_fit(pipe, param_grid):
    """Run a cross-validated grid search for ``pipe`` on the training split.

    Relies on the notebook-level ``kfold`` splitter and on ``X_train`` /
    ``y_train``, which must be defined before this function is called.

    Parameters
    ----------
    pipe
        Estimator (here: a preprocessing + classifier pipeline) to tune.
    param_grid
        Parameter grid handed to ``GridSearchCV``.

    Returns
    -------
    tuple
        ``(scores, best_estimator, best_params)`` where ``scores`` is an array
        holding the mean cross-validated test score of every candidate.
    """
    # Only test scores are used below, so train scores are not requested
    # (return_train_score stays at its default of False).
    grid = GridSearchCV(pipe, param_grid, cv=kfold)
    grid.fit(X_train, y_train)

    # cv_results_ already stores the per-candidate means as an array
    scores = np.array(grid.cv_results_["mean_test_score"])

    return scores, grid.best_estimator_, grid.best_params_

In [None]:
def score_model(model):
    """Score ``model`` on the notebook-level test split.

    Predicts on ``X_test`` and compares against ``y_test``.

    Returns
    -------
    tuple
        ``(precision, recall, f1, accuracy)``.
    """
    predictions = model.predict(X_test)

    # Order matters: callers unpack the result positionally
    scorers = (
        metrics.precision_score,
        metrics.recall_score,
        metrics.f1_score,
        metrics.accuracy_score,
    )
    return tuple(scorer(y_test, predictions) for scorer in scorers)

In [93]:
# Shuffled 5-fold stratified splitter with a fixed seed, shared by all grid searches
seed = 123
kfold = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=seed,
)

In [94]:
# Every model is tuned over the same candidates for the classifier's C parameter
param_grid = {
    'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100]
}

# Display name -> classifier; each one is wrapped in the shared preprocessing below
classifiers = [
    ("SVM Linear", SVC(kernel="linear")),
    ("SVM RBF", SVC(kernel="rbf")),
    ("SVM Poly", SVC(kernel="poly")),
    ("Logistic Regression", LogisticRegression()),
]

# One (name, pipeline, grid) triple per model
methods = [
    (
        label,
        Pipeline([('preprocessing', preprocess_pipeline), ('classifier', classifier)]),
        param_grid,
    )
    for label, classifier in classifiers
]


In [96]:
# Tune every candidate model, score its best estimator on the test split,
# and collect one result row per model.
result_columns = ["name", "C", "precision", "recall", "f1", "accuracy"]
result_rows = []

for method, pipe, grid_params in methods:
    with warnings.catch_warnings():
        # All warnings raised while fitting/scoring are silenced inside this block
        warnings.simplefilter("ignore")
        _, best_model, best_params = grid_fit(pipe, grid_params)
        # Score once and reuse the values (no second predict pass)
        precision, recall, f1, accuracy = score_model(best_model)
    result_rows.append({
        "name": method,
        "C": best_params['classifier__C'],
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "accuracy": accuracy,
    })

# Build the table in one go instead of growing a DataFrame row by row
df = pd.DataFrame(result_rows, columns=result_columns)

df

Unnamed: 0,name,C,precision,recall,f1,accuracy
0,SVM Linear,0.1,0.6,0.195652,0.295082,0.785
1,SVM RBF,10.0,0.733333,0.23913,0.360656,0.805
2,SVM Poly,1.0,0.647059,0.23913,0.349206,0.795
3,Logistic Regression,1.0,0.714286,0.326087,0.447761,0.815
