In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
import warnings
from sklearn.exceptions import ConvergenceWarning
from scipy.linalg import LinAlgWarning
warnings.filterwarnings(action='ignore', category=ConvergenceWarning)
warnings.filterwarnings(action='ignore', category=LinAlgWarning)
warnings.filterwarnings(action='ignore', category=UserWarning)

# Categorical data

In [3]:
# The file has no headers naming the columns, so we pass header=None
# and provide the column names explicitly in "names"
data = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data", na_values=[" ?"], 
    header=None, index_col=False,
    names=['age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'gender',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'income'])
# For illustration purposes, we only select some of the columns
data = data[['workclass', 'age', 'education', 'education-num', 'occupation', 'capital-gain','gender', 'hours-per-week',  'income']]
# IPython.display allows nice output formatting within the Jupyter notebook
# add some none
data['education-num'][0]=None
display(data.head())

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['education-num'][0]=None


Unnamed: 0,workclass,age,education,education-num,occupation,capital-gain,gender,hours-per-week,income
0,State-gov,39,Bachelors,,Adm-clerical,2174,Male,40,<=50K
1,Self-emp-not-inc,50,Bachelors,13.0,Exec-managerial,0,Male,13,<=50K
2,Private,38,HS-grad,9.0,Handlers-cleaners,0,Male,40,<=50K
3,Private,53,11th,7.0,Handlers-cleaners,0,Male,40,<=50K
4,Private,28,Bachelors,13.0,Prof-specialty,0,Female,40,<=50K


In [4]:
data = data[1:1000]

In [5]:
data.isnull().sum()

workclass         62
age                0
education          0
education-num      0
occupation        62
capital-gain       0
gender             0
hours-per-week     0
income             0
dtype: int64

Teraz rzućmy okiem na wszystkie atrybuty kategoryczne:

In [6]:
data["workclass"].value_counts()

 Private             698
 Self-emp-not-inc     81
 Local-gov            68
 State-gov            36
 Self-emp-inc         33
 Federal-gov          21
Name: workclass, dtype: int64

In [7]:
data["education"].value_counts()

 HS-grad         321
 Some-college    225
 Bachelors       165
 Masters          54
 Assoc-voc        48
 11th             46
 Assoc-acdm       35
 10th             21
 9th              16
 7th-8th          15
 Doctorate        14
 5th-6th          11
 Prof-school      10
 12th              9
 1st-4th           7
 Preschool         2
Name: education, dtype: int64

In [8]:
data["gender"].value_counts()

 Male      670
 Female    329
Name: gender, dtype: int64

In [9]:
data["occupation"].value_counts()

 Craft-repair         126
 Exec-managerial      124
 Prof-specialty       124
 Sales                112
 Other-service        107
 Adm-clerical          93
 Machine-op-inspct     61
 Transport-moving      52
 Tech-support          44
 Handlers-cleaners     43
 Farming-fishing       31
 Protective-serv       16
 Priv-house-serv        3
 Armed-Forces           1
Name: occupation, dtype: int64

Sprawdźmy, czy etykiety przyjmują wartości 0 lub 1.

Jak nie to musimy jes troszkę przerobić.

In [10]:
X = data.drop(['income'], axis=1)
y = data['income'].values
np.unique(y)
y[ y == ' <=50K'] = 0
y[ y == ' >50K'] = 1

from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

print("X.shape: {} y.shape: {}".format(X.shape, y.shape))

X.shape: (999, 8) y.shape: (999,)


Podzielmy zbiór na train/test

In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

Teraz zbudujmy nasze **pipeline** preprocessingu. 

Wykorzystamy DataframeSelector aby wybrać określone atrybuty z DataFrame:

In [12]:
from sklearn.base import BaseEstimator, TransformerMixin

# A class to select numerical or categorical columns 
# since Scikit-Learn doesn't handle DataFrames yet
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names]

Zbudujmy **pipeline** dla atrybutów numerycznych:

In [13]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer


num_pipeline = Pipeline([
        ("select_numeric", DataFrameSelector(["Fare"])),
        ("imputer", SimpleImputer(strategy="median")),
    ])

In [14]:
num_pipeline.fit_transform(X_train)

array([[10.],
       [10.],
       [12.],
       [ 9.],
       [13.],
       [ 9.],
       [10.],
       [10.],
       [ 9.],
       [ 9.],
       [ 7.],
       [14.],
       [10.],
       [ 9.],
       [10.],
       [ 9.],
       [ 9.],
       [10.],
       [ 9.],
       [10.],
       [12.],
       [ 9.],
       [10.],
       [13.],
       [ 9.],
       [ 2.],
       [ 9.],
       [ 9.],
       [12.],
       [ 9.],
       [10.],
       [13.],
       [10.],
       [10.],
       [10.],
       [ 6.],
       [13.],
       [ 7.],
       [14.],
       [11.],
       [12.],
       [13.],
       [ 9.],
       [ 7.],
       [ 9.],
       [14.],
       [ 9.],
       [ 9.],
       [ 7.],
       [13.],
       [ 7.],
       [ 9.],
       [13.],
       [ 2.],
       [ 7.],
       [14.],
       [ 6.],
       [10.],
       [ 5.],
       [13.],
       [11.],
       [12.],
       [10.],
       [ 7.],
       [10.],
       [ 9.],
       [10.],
       [ 7.],
       [10.],
       [13.],
       [ 9.],
      

Będziemy także potrzebować imputera do kategorycznych kolumn napisowych (zwykły Imputer nie działa na tych kolumnach):

In [15]:
# Inspired from stackoverflow.com/questions/25239958
class MostFrequentImputer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.most_frequent_ = pd.Series([X[c].value_counts().index[0] for c in X],
                                        index=X.columns)
        return self
    def transform(self, X, y=None):
        return X.fillna(self.most_frequent_)

Teraz możemy zbudować pipeline dla atrybutów kategorycznych.

We can convert each categorical value to a one-hot vector using a OneHotEncoder. Right now this class can only handle integer categorical inputs, but in Scikit-Learn 0.20 it will also handle string categorical inputs (see PR https://github.com/scikit-learn/scikit-learn/issues/10521). So for now we import it from future_encoders.py

In [16]:
# from future_encoders import OneHotEncoder
from sklearn.preprocessing import OneHotEncoder

cat_pipeline = Pipeline([
        ("select_cat", DataFrameSelector(["workclass", "education", "occupation", "gender"])),
        ("imputer", MostFrequentImputer()),
        ("cat_encoder", OneHotEncoder(sparse=False, handle_unknown = 'ignore')),
    ])

In [17]:
cat_pipeline.fit_transform(X_train)

array([[0., 0., 1., ..., 0., 0., 1.],
       [0., 0., 1., ..., 0., 0., 1.],
       [1., 0., 0., ..., 0., 0., 1.],
       ...,
       [0., 0., 1., ..., 0., 0., 1.],
       [0., 1., 0., ..., 0., 0., 1.],
       [0., 0., 1., ..., 0., 1., 0.]])

Na koniec połączmy powyższe podejścia:

In [18]:
from sklearn.pipeline import FeatureUnion
preprocess_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ])

# Zad

Robimy StratifiedKFold i znajdujemy optymalne parametry dla

* SVM z jądrem rbf
* SVM z jądrem poly
* SVM liniowego
* Regresji logistycznej

In [19]:
from sklearn.model_selection import StratifiedKFold

seed=123
kfold = StratifiedKFold(n_splits=5, random_state=seed, shuffle=True)

In [20]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
pipe = Pipeline([
    ('preprocessing', preprocess_pipeline), 
    ('classifier', SVC(kernel='rbf'))])


param_grid = {
            'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100]
}

grid_1 = GridSearchCV(pipe, param_grid, cv=kfold)

grid_1.fit(X_train, y_train)
grid_1.best_params_

{'classifier__C': 10}

In [21]:
pipe = Pipeline([
    ('preprocessing', preprocess_pipeline),
    ('classifier', SVC(kernel='poly'))])

param_grid = {
            'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100]
}
grid_2 = GridSearchCV(pipe, param_grid, cv=kfold)
grid_2.fit(X_train, y_train)
grid_2.best_params_

{'classifier__C': 1}

In [22]:
pipe = Pipeline([
    ('preprocessing', preprocess_pipeline),
    ('classifier', SVC(kernel='linear'))])

param_grid = {
            'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100]
}
grid_3 = GridSearchCV(pipe, param_grid, cv=kfold)
grid_3.fit(X_train, y_train)
grid_3.best_params_

{'classifier__C': 0.1}

In [23]:
from sklearn.linear_model import LogisticRegression
pipe = Pipeline([
    ('preprocessing', preprocess_pipeline),
    ('classifier', LogisticRegression())])

param_grid = {
            'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100]
}
grid_4 = GridSearchCV(pipe, param_grid, cv=kfold)
grid_4.fit(X_train, y_train)
grid_4.best_params_

{'classifier__C': 1}

In [24]:
from sklearn import  metrics


models = []
models.append(('SVM rbf', grid_1.best_estimator_))
models.append(('SVM poly', grid_2.best_estimator_))
models.append(('SVM linear', grid_3.best_estimator_))
models.append(('Logistic regression', grid_4.best_estimator_))


precision_score = []
recall_score = []
f1_score = []
accuracy_score = []
for name, model in models:
    print(name)
    print("precision_score: {}".format(metrics.precision_score(y_test, model.predict(X_test)) ))
    print("recall_score: {}".format( metrics.recall_score(y_test, model.predict(X_test)) ))
    print("f1_score: {}".format( metrics.f1_score(y_test, model.predict(X_test)) ))
    print("accuracy_score: {}".format( metrics.accuracy_score(y_test, model.predict(X_test)) ))
    precision_score.append(metrics.precision_score(y_test, model.predict(X_test)))
    recall_score.append(metrics.recall_score(y_test, model.predict(X_test)))
    f1_score.append( metrics.f1_score(y_test, model.predict(X_test)))
    accuracy_score.append(metrics.accuracy_score(y_test, model.predict(X_test)))

SVM rbf
precision_score: 0.7333333333333333
recall_score: 0.2391304347826087
f1_score: 0.36065573770491804
accuracy_score: 0.805
SVM poly
precision_score: 0.6470588235294118
recall_score: 0.2391304347826087
f1_score: 0.34920634920634924
accuracy_score: 0.795
SVM linear
precision_score: 0.6
recall_score: 0.1956521739130435
f1_score: 0.29508196721311475
accuracy_score: 0.785
Logistic regression
precision_score: 0.7142857142857143
recall_score: 0.32608695652173914
f1_score: 0.4477611940298507
accuracy_score: 0.815


In [25]:
import pandas as pd
d = {'precision_score': precision_score, 
     'recall_score': recall_score, 
     'f1_score': f1_score,
     'accuracy_score' : accuracy_score
    }
df = pd.DataFrame(data=d)
df.insert(loc=0, column='Method', value=['SVM rbf', 'SVM poly', 'SVM linear', 'Logistic Regression'])
df

Unnamed: 0,Method,precision_score,recall_score,f1_score,accuracy_score
0,SVM rbf,0.733333,0.23913,0.360656,0.805
1,SVM poly,0.647059,0.23913,0.349206,0.795
2,SVM linear,0.6,0.195652,0.295082,0.785
3,Logistic Regression,0.714286,0.326087,0.447761,0.815
