In [1]:
def get_extension(path, verbose=False):
    filename = path.split('\\')[-1]
    extension = filename.split('.')[-1]
    
    if verbose == True:
        print('Extension =', extension, '\nfilename =', filename)
    return extension


def read_data(path, kwargs=None):
    supported_exts = ('csv')
    ext = get_extension(path)
    if ext not in supported_exts:
        raise ValueError(f'Rozszerzenie {ext} nie jest jeszcze obsługiwane')
    
    if ext == 'csv':
        import pandas as pd
        import numpy as np
        data = pd.read_csv(path, **kwargs)
        return data
    
    
def mean_target_by_1variable(df_name, variable, target, verbose=False):
    from pandasql import sqldf
    query = f"""
select
    {variable}
    ,avg({target}) as MEAN_{target}
from {df_name}
group by {variable}
order by avg({target}) desc
    """
    if verbose == True:
        print(query)
    return sqldf(query)

In [2]:
import numpy as np
import_params = {
    'names' : ('age', 'workclass', 'fnlwgt', 'education', 'education_num', 
                'marital_status','occupation','relationship', 'race', 'sex',
                'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'target')
    ,'index_col' : False
    ,'dtype' : {'age' : np.int16, 'fnlwgt' : np.int32, 'education-num' : np.int16,
                'capital-gain' : np.int32, 'capital-loss' : np.int32, 'hours-per-week' : np.int16}
    ,"skipinitialspace" : True
    ,'na_values' : ('', '?')
}
train_data = read_data("C:\\Users\\User1\\Notebooks\\ML_DS\\Adult_UCI\\adult_data.csv", import_params)
test_data = read_data("C:\\Users\\User1\\Notebooks\\ML_DS\\Adult_UCI\\adult_test.csv", import_params)

In [3]:
%%time
train_data = train_data.replace(to_replace={'target' : '>50K'}, value=1)
train_data = train_data.replace(to_replace={'target' : '<=50K'}, value=0)
test_data = test_data.replace(to_replace={'target' : '>50K'}, value=1)
test_data = test_data.replace(to_replace={'target' : '<=50K'}, value=0)

Wall time: 20.2 ms


In [4]:
train_data['target'] = train_data['target'].astype(np.int8)
test_data['target'] = test_data['target'].astype(np.int8)

In [None]:
train_data

In [9]:
%%time
nomin_vars = ('workclass', 'education', 'marital_status', 'occupation'
              , 'relationship', 'race', 'sex', 'native_country')
dfs_dict = {}
for variable in nomin_vars:
    dfs_dict[variable] = mean_target_by_1variable('train_data', variable, 'target')

Wall time: 4.36 s


In [10]:
%matplotlib Qt
for variable in dfs_dict.keys():
    dfs_dict[variable].plot.bar(x=variable, y='MEAN_target')

In [None]:
num_vars = ('age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week')
for variable in num_vars:
    a = train_data.hist(by='target', column=variable, legend=True)

In [None]:
%%time
%matplotlib Qt
import matplotlib.pyplot as plt
colors = ['green' if y == 1 else 'red' for y in train_data['target']]
train_data.plot.scatter('education_num', 'age', c=colors)
plt.title('Green: target = 1. Red: target = 0')

# Model 1 - prosty, oparty tylko na age i education_num

In [5]:
%%time
train_data_1 = train_data[['age', 'education_num', 'target']]
test_data_1 = test_data[['age', 'education_num', 'target']]

X_train = train_data_1.values[:, :-1]
y_train = train_data_1.values[:, -1]
y_train = y_train.astype('int')

X_test = test_data_1.values[:, :-1]
y_test = test_data_1.values[:, -1]

Wall time: 6 ms


In [24]:
%%time
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn import impute
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import GridSearchCV
import pandas as pd 

ppl = Pipeline(steps=[
    ('imputation', impute.SimpleImputer(strategy='most_frequent'))
    ,('classification', RidgeClassifier())
])
cv_params = {
    'classification__alpha' : np.logspace(-3, 1, 1000)
}
search = GridSearchCV(ppl, cv_params, n_jobs=-1)
search.fit(X_train, y_train)
search.best_params_

Wall time: 16.5 s


{'classification__alpha': 0.001}

In [25]:
%%time
search.score(X_train, y_train)

Wall time: 4.99 ms


0.7775252602807039

In [26]:
%%time
res = search.predict(X_test)
res_df = pd.DataFrame(np.concatenate((X_test, y_test[:, None], res[:, None]), axis=1))
res_df.columns = ['age', 'education_num', 'target', 'prediction']

from pandasql import sqldf
query = """
select
    age
    ,education_num
    ,target
    ,prediction
    ,case
        when target = prediction and target = 1 then 'blue'
        when target = prediction and target = 0 then 'green'
        when target <> prediction and target = 1 then 'yellow'
    else 'red' end as color
    ,case
        when target = prediction and target = 1 then 'true positive'
        when target = prediction and target = 0 then 'true negative'
        when target <> prediction and target = 1 then 'false negative'
        else 'false positive'
    end as eval
from res_df
"""
res_df = sqldf(query)

Wall time: 243 ms


In [27]:
plt.figure()
for i, partition in res_df.groupby('color'):
    plt.scatter(partition['education_num'], partition['age'], c=partition['color'], label=partition['eval'].iloc[0])
plt.legend()

<matplotlib.legend.Legend at 0x1995c1058e0>

In [28]:
%%time
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn import impute
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
import pandas as pd 

ppl = Pipeline(steps=[
    ('imputation', impute.SimpleImputer(strategy='most_frequent'))
    ,('classification', LogisticRegression())
])
cv_params = {
    'classification__C' : np.logspace(-3, 1, 1)
}
search = GridSearchCV(ppl, cv_params, n_jobs=-1)
search.fit(X_train, y_train)
search.best_params_

Wall time: 179 ms


{'classification__C': 0.001}

In [29]:
%%time
print(search.score(X_train, y_train))
print(search.score(X_test, y_test))

0.781026381253647
0.7838584853510226
Wall time: 8 ms


In [30]:
%%time
res = search.predict(X_test)
res_df = pd.DataFrame(np.concatenate((X_test, y_test[:, None], res[:, None]), axis=1))
res_df.columns = ['age', 'education_num', 'target', 'prediction']

from pandasql import sqldf
query = """
select
    age
    ,education_num
    ,target
    ,prediction
    ,case
        when target = prediction and target = 1 then 'blue'
        when target = prediction and target = 0 then 'green'
        when target <> prediction and target = 1 then 'yellow'
    else 'red' end as color
    ,case
        when target = prediction and target = 1 then 'true positive'
        when target = prediction and target = 0 then 'true negative'
        when target <> prediction and target = 1 then 'false negative'
        else 'false positive'
    end as eval
from res_df
"""
res_df = sqldf(query)

Wall time: 291 ms


In [31]:
plt.figure()
for i, partition in res_df.groupby('color'):
    plt.scatter(partition['education_num'], partition['age'], c=partition['color'], label=partition['eval'].iloc[0])
plt.legend()

<matplotlib.legend.Legend at 0x19961eb1940>

In [16]:
%%time
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn import impute
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
import pandas as pd 

ppl = Pipeline(steps=[
    ('imputation', impute.SimpleImputer(strategy='most_frequent'))
    ,('scaling', StandardScaler())
    ,('classification', SVC(gamma='scale', class_weight='balanced'))
])
cv_params = {
    'classification__C' : np.logspace(-3, 1, 20)
    ,'classification__kernel': ['rbf', 'linear', 'poly', 'sigmoid']
    ,'classification__max_iter' : [1000]
}
search = GridSearchCV(ppl, cv_params, n_jobs=-1)
search.fit(X_train, y_train)
search.best_params_

Wall time: 3min 24s




{'classification__C': 6.158482110660261,
 'classification__kernel': 'linear',
 'classification__max_iter': 1000}

In [17]:
%%time
print(search.score(X_train, y_train))
print(search.score(X_test, y_test))

0.7350511347931574
0.7355813524967754
Wall time: 1.8 s


In [18]:
%%time
res = search.predict(X_test)
res_df = pd.DataFrame(np.concatenate((X_test, y_test[:, None], res[:, None]), axis=1))
res_df.columns = ['age', 'education_num', 'target', 'prediction']

from pandasql import sqldf
query = """
select
    age
    ,education_num
    ,target
    ,prediction
    ,case
        when target = prediction and target = 1 then 'blue'
        when target = prediction and target = 0 then 'green'
        when target <> prediction and target = 1 then 'yellow'
    else 'red' end as color
    ,case
        when target = prediction and target = 1 then 'true positive'
        when target = prediction and target = 0 then 'true negative'
        when target <> prediction and target = 1 then 'false negative'
        else 'false positive'
    end as eval
from res_df
"""
res_df = sqldf(query)

Wall time: 899 ms


In [15]:
%matplotlib Qt
import matplotlib.pyplot as plt
plt.figure()
for i, partition in res_df.groupby('color'):
    plt.scatter(partition['education_num'], partition['age'], c=partition['color'], label=partition['eval'].iloc[0])
plt.legend()

<matplotlib.legend.Legend at 0x1995f597820>

In [20]:
%%time
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn import impute
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
import pandas as pd 

ppl = Pipeline(steps=[
    ('imputation', impute.SimpleImputer(strategy='most_frequent'))
    ,('scaling', StandardScaler())
    ,('classification', KNeighborsClassifier(n_jobs=-1))
])
cv_params = {
    'classification__n_neighbors' : [i for i in range(3, 11)]
    ,'classification__weights' : ['uniform', 'distance']
    , 'classification__p' : [i for i in range(2, 6)]
}
search = GridSearchCV(ppl, cv_params, n_jobs=-1)
search.fit(X_train, y_train)
search.best_params_

Wall time: 35.7 s


{'classification__n_neighbors': 10,
 'classification__p': 2,
 'classification__weights': 'uniform'}

In [21]:
%%time
print(search.score(X_train, y_train))
print(search.score(X_test, y_test))

0.7776481066306318
0.7702843805663043
Wall time: 1.58 s


In [22]:
%%time
res = search.predict(X_test)
res_df = pd.DataFrame(np.concatenate((X_test, y_test[:, None], res[:, None]), axis=1))
res_df.columns = ['age', 'education_num', 'target', 'prediction']

from pandasql import sqldf
query = """
select
    age
    ,education_num
    ,target
    ,prediction
    ,case
        when target = prediction and target = 1 then 'blue'
        when target = prediction and target = 0 then 'green'
        when target <> prediction and target = 1 then 'yellow'
    else 'red' end as color
    ,case
        when target = prediction and target = 1 then 'true positive'
        when target = prediction and target = 0 then 'true negative'
        when target <> prediction and target = 1 then 'false negative'
        else 'false positive'
    end as eval
from res_df
"""
res_df = sqldf(query)

Wall time: 708 ms


In [23]:
%matplotlib Qt
import matplotlib.pyplot as plt
plt.figure()
for i, partition in res_df.groupby('color'):
    plt.scatter(partition['education_num'], partition['age'], c=partition['color'], label=partition['eval'].iloc[0])
plt.legend()

<matplotlib.legend.Legend at 0x19961d51a90>

In [7]:
%%time
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn import impute
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
import pandas as pd 

ppl = Pipeline(steps=[
    ('imputation', impute.SimpleImputer(strategy='most_frequent'))
    ,('scaling', StandardScaler())
    ,('classification', GaussianNB())
])
search = ppl
search.fit(X_train, y_train)

Wall time: 12 ms


Pipeline(steps=[('imputation', SimpleImputer(strategy='most_frequent')),
                ('scaling', StandardScaler()),
                ('classification', GaussianNB())])

In [8]:
%%time
print(search.score(X_train, y_train))
print(search.score(X_test, y_test))

0.7915604557599583
0.7926417296234874
Wall time: 9.96 ms


In [9]:
%%time
res = search.predict(X_test)
res_df = pd.DataFrame(np.concatenate((X_test, y_test[:, None], res[:, None]), axis=1))
res_df.columns = ['age', 'education_num', 'target', 'prediction']

from pandasql import sqldf
query = """
select
    age
    ,education_num
    ,target
    ,prediction
    ,case
        when target = prediction and target = 1 then 'blue'
        when target = prediction and target = 0 then 'green'
        when target <> prediction and target = 1 then 'yellow'
    else 'red' end as color
    ,case
        when target = prediction and target = 1 then 'true positive'
        when target = prediction and target = 0 then 'true negative'
        when target <> prediction and target = 1 then 'false negative'
        else 'false positive'
    end as eval
from res_df
"""
res_df = sqldf(query)

Wall time: 455 ms


In [12]:
%matplotlib Qt
import matplotlib.pyplot as plt
plt.figure()
for i, partition in res_df.groupby('color'):
    plt.scatter(partition['education_num'], partition['age'], c=partition['color'], label=partition['eval'].iloc[0])
plt.legend()
plt.xlabel('Number of education classes')
plt.ylabel('Age')
plt.title('Evaluation of Naive Bayes classification')

Text(0.5, 1.0, 'Evaluation of Naive Bayes classification')

In [27]:
%%time
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn import impute
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
import pandas as pd 

ppl = Pipeline(steps=[
    ('imputation', impute.SimpleImputer(strategy='most_frequent'))
#     ,('scaling', StandardScaler())
    ,('classification', DecisionTreeClassifier(class_weight = 'balanced'))
])
cv_params = {
    'classification__criterion' : ['gini', 'entropy']
    ,'classification__ccp_alpha' : np.linspace(0, 1, 21)
}
search = GridSearchCV(ppl, cv_params, n_jobs=-1)
search.fit(X_train, y_train)
search.best_params_

Wall time: 1.59 s


 0.24080955 0.47160705        nan 0.24080955 0.24080955        nan
 0.24080955 0.24080955        nan 0.24080955 0.24080955        nan
 0.24080955 0.24080955        nan 0.24080955 0.24080955        nan
 0.24080955 0.24080955        nan 0.24080955 0.24080955        nan
 0.24080955 0.24080955        nan 0.24080955 0.24080955        nan
 0.24080955 0.24080955        nan 0.24080955 0.24080955        nan
 0.24080955 0.24080955        nan 0.24080955 0.24080955        nan
 0.24080955 0.24080955        nan 0.24080955 0.24080955        nan
 0.24080955 0.24080955        nan 0.24080955 0.24080955        nan
 0.24080955 0.24080955        nan]


{'classification__ccp_alpha': 0.05, 'classification__criterion': 'entropy'}

In [28]:
%%time
print(search.score(X_train, y_train))
print(search.score(X_test, y_test))

0.7819170172906238
0.7805417357656164
Wall time: 10.1 ms


In [23]:
%%time
res = search.predict(X_test)
res_df = pd.DataFrame(np.concatenate((X_test, y_test[:, None], res[:, None]), axis=1))
res_df.columns = ['age', 'education_num', 'target', 'prediction']

from pandasql import sqldf
query = """
select
    age
    ,education_num
    ,target
    ,prediction
    ,case
        when target = prediction and target = 1 then 'blue'
        when target = prediction and target = 0 then 'green'
        when target <> prediction and target = 1 then 'yellow'
    else 'red' end as color
    ,case
        when target = prediction and target = 1 then 'true positive'
        when target = prediction and target = 0 then 'true negative'
        when target <> prediction and target = 1 then 'false negative'
        else 'false positive'
    end as eval
from res_df
"""
res_df = sqldf(query)

Wall time: 242 ms


In [24]:
%matplotlib Qt
import matplotlib.pyplot as plt
plt.figure()
for i, partition in res_df.groupby('color'):
    plt.scatter(partition['education_num'], partition['age'], c=partition['color'], label=partition['eval'].iloc[0])
plt.legend()
plt.xlabel('Number of education classes')
plt.ylabel('Age')
plt.title('Evaluation of Decision Tree classification')

Text(0.5, 1.0, 'Evaluation of Decision Tree classification')

In [6]:
%%time
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn import impute
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import pandas as pd 

ppl = Pipeline(steps=[
    ('imputation', impute.SimpleImputer(strategy='most_frequent'))
#     ,('scaling', StandardScaler())
    ,('classification', RandomForestClassifier(class_weight = 'balanced', n_jobs=-1))
])
cv_params = {
    'classification__criterion' : ['gini', 'entropy']
    ,'classification__ccp_alpha' : np.linspace(0, 1, 21)
    ,'classification__n_estimators' : [5*i for i in range(1, 101)]
}
search = GridSearchCV(ppl, cv_params, n_jobs=-1, verbose=3)
search.fit(X_train, y_train)
search.best_params_

Fitting 5 folds for each of 4200 candidates, totalling 21000 fits
Wall time: 4h 18min 47s


{'classification__ccp_alpha': 0.05,
 'classification__criterion': 'gini',
 'classification__n_estimators': 130}

In [7]:
%%time
print(search.score(X_train, y_train))
print(search.score(X_test, y_test))

0.7819170172906238
0.7805417357656164
Wall time: 170 ms


In [8]:
%%time
res = search.predict(X_test)
res_df = pd.DataFrame(np.concatenate((X_test, y_test[:, None], res[:, None]), axis=1))
res_df.columns = ['age', 'education_num', 'target', 'prediction']

from pandasql import sqldf
query = """
select
    age
    ,education_num
    ,target
    ,prediction
    ,case
        when target = prediction and target = 1 then 'blue'
        when target = prediction and target = 0 then 'green'
        when target <> prediction and target = 1 then 'yellow'
    else 'red' end as color
    ,case
        when target = prediction and target = 1 then 'true positive'
        when target = prediction and target = 0 then 'true negative'
        when target <> prediction and target = 1 then 'false negative'
        else 'false positive'
    end as eval
from res_df
"""
res_df = sqldf(query)

Wall time: 831 ms


In [9]:
%matplotlib Qt
import matplotlib.pyplot as plt
plt.figure()
for i, partition in res_df.groupby('color'):
    plt.scatter(partition['education_num'], partition['age'], c=partition['color'], label=partition['eval'].iloc[0])
plt.legend()
plt.xlabel('Number of education classes')
plt.ylabel('Age')
plt.title('Evaluation of RandomForest classification')

Text(0.5, 1.0, 'Evaluation of RandomForest classification')

# Model 2 - model wykorzystujący zakodowane zmianne kategoryczne  
Zmienne:
('workclass', 'education', 'marital_status', 'occupation'
              , 'relationship', 'race', 'sex', 'native_country')
              


In [6]:
split_row = len(train_data.values)
print(split_row)
full_data = train_data.append(test_data)

32561


In [7]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

nomin_vars = ('workclass', 'education', 'marital_status', 'occupation' , 'relationship', 'race', 'sex', 'native_country')
enc = OneHotEncoder(sparse=False)
transformer = ColumnTransformer([('nomin_vars', enc, nomin_vars)], remainder='passthrough')
encoded_data = transformer.fit_transform(full_data)

In [8]:
enc_train_data = encoded_data[:split_row, :]
enc_test_data = encoded_data[split_row:, :]

X_train = enc_train_data[:, :-1]
y_train = enc_train_data[:, -1]
y_train = y_train.astype('int')

X_test = enc_test_data[:, :-1]
y_test = enc_test_data[:, -1]

In [9]:
%%time
from sklearn.feature_selection import RFECV
from sklearn.tree import DecisionTreeClassifier
esti = DecisionTreeClassifier(random_state=1)
feat_selector = RFECV(esti, verbose=1, n_jobs=4, cv=4)

feat_selector = feat_selector.fit(X_train, y_train)
X_train = feat_selector.transform(X_train)
X_test = feat_selector.transform(X_test)

Fitting estimator with 108 features.
Fitting estimator with 107 features.
Fitting estimator with 106 features.
Fitting estimator with 105 features.
Fitting estimator with 104 features.
Fitting estimator with 103 features.
Fitting estimator with 102 features.
Fitting estimator with 101 features.
Fitting estimator with 100 features.
Fitting estimator with 99 features.
Fitting estimator with 98 features.
Fitting estimator with 97 features.
Fitting estimator with 96 features.
Fitting estimator with 95 features.
Fitting estimator with 94 features.
Fitting estimator with 93 features.
Fitting estimator with 92 features.
Fitting estimator with 91 features.
Fitting estimator with 90 features.
Fitting estimator with 89 features.
Fitting estimator with 88 features.
Fitting estimator with 87 features.
Fitting estimator with 86 features.
Fitting estimator with 85 features.
Fitting estimator with 84 features.
Fitting estimator with 83 features.
Fitting estimator with 82 features.
Fitting estimator w

In [10]:
X_train.shape

(32561, 71)

In [24]:
%%time
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn import impute
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import GridSearchCV
import pandas as pd 

ppl = Pipeline(steps=[
    ('imputation', impute.SimpleImputer(strategy='most_frequent'))
    ,('classification', RidgeClassifier())
])
cv_params = {
    'classification__alpha' : np.logspace(-3, 1, 100)
}
search = GridSearchCV(ppl, cv_params, n_jobs=-1, verbose=3)
search.fit(X_train, y_train)
search.best_params_

Fitting 5 folds for each of 100 candidates, totalling 500 fits


KeyboardInterrupt: 

In [38]:
%%time
search.score(X_train, y_train)

Wall time: 13.1 ms


0.838764165719726

In [39]:
%%time
search.score(X_test, y_test)

Wall time: 11.6 ms


0.8427000798476753

In [40]:
res = search.predict(X_test)
res_df = pd.DataFrame(np.concatenate((y_test[:, None], res[:, None]), axis=1))
res_df.columns = ['target', 'prediction']
from pandasql import sqldf
query = """
select
    eval as RESULT
    ,count(*) as CNT
from
    (select
        target
        ,prediction
        ,case
            when target = prediction and target = 1 then 'true positive'
            when target = prediction and target = 0 then 'true negative'
            when target <> prediction and target = 1 then 'false negative'
            else 'false positive'
        end as eval
    from res_df)
group by eval
"""
res_df = sqldf(query)

In [41]:
res_df

Unnamed: 0,RESULT,CNT
0,false negative,1882
1,false positive,679
2,true negative,11756
3,true positive,1964


In [44]:
%%time
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn import impute
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
import pandas as pd 

ppl = Pipeline(steps=[
    ('imputation', impute.SimpleImputer(strategy='most_frequent'))
    ,('classification', LogisticRegression())
])
cv_params = {
    'classification__C' : np.logspace(-3, 2, 500)
}
search = GridSearchCV(ppl, cv_params, n_jobs=-1, verbose=3)
search.fit(X_train, y_train)
search.best_params_

Fitting 5 folds for each of 500 candidates, totalling 2500 fits
Wall time: 2min 1s


{'classification__C': 0.029712958185773342}

In [45]:
%%time
search.score(X_train, y_train)

Wall time: 10.6 ms


0.7957679432449863

In [46]:
%%time
search.score(X_test, y_test)

Wall time: 10.1 ms


0.7978011178674529

In [47]:
res = search.predict(X_test)
res_df = pd.DataFrame(np.concatenate((y_test[:, None], res[:, None]), axis=1))
res_df.columns = ['target', 'prediction']
from pandasql import sqldf
query = """
select
    eval as RESULT
    ,count(*) as CNT
from
    (select
        target
        ,prediction
        ,case
            when target = prediction and target = 1 then 'true positive'
            when target = prediction and target = 0 then 'true negative'
            when target <> prediction and target = 1 then 'false negative'
            else 'false positive'
        end as eval
    from res_df)
group by eval
"""
res_df = sqldf(query)

In [48]:
res_df

Unnamed: 0,RESULT,CNT
0,false negative,2846
1,false positive,446
2,true negative,11989
3,true positive,1000


In [9]:
%%time
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn import impute
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
import pandas as pd 

ppl = Pipeline(steps=[
    ('imputation', impute.SimpleImputer(strategy='most_frequent'))
    ,('scaling', StandardScaler())
    ,('classification', SVC(gamma='scale', class_weight='balanced'))
])
cv_params = {
    'classification__C' : np.logspace(-3, 1, 50)
    ,'classification__kernel': ['rbf', 'linear', 'poly', 'sigmoid']
    ,'classification__max_iter' : [1000]
}
search = GridSearchCV(ppl, cv_params, n_jobs=-1, verbose=3)
search.fit(X_train, y_train)
search.best_params_

Fitting 5 folds for each of 200 candidates, totalling 1000 fits
Wall time: 23min 25s




{'classification__C': 2.2229964825261934,
 'classification__kernel': 'sigmoid',
 'classification__max_iter': 1000}

In [10]:
%%time
search.score(X_train, y_train)

Wall time: 2.93 s


0.7809035349037192

In [11]:
%%time
search.score(X_test, y_test)

Wall time: 1.46 s


0.7805417357656164

In [14]:
res = search.predict(X_test)
res_df = pd.DataFrame(np.concatenate((y_test[:, None], res[:, None]), axis=1))
res_df.columns = ['target', 'prediction']
from pandasql import sqldf
query = """
select
    eval as RESULT
    ,count(*) as CNT
from
    (select
        target
        ,prediction
        ,case
            when target = prediction and target = 1 then 'true positive'
            when target = prediction and target = 0 then 'true negative'
            when target <> prediction and target = 1 then 'false negative'
            else 'false positive'
        end as eval
    from res_df)
group by eval
"""
sqldf(query)

Unnamed: 0,RESULT,CNT
0,false negative,1094
1,false positive,2479
2,true negative,9956
3,true positive,2752


In [30]:
%%time
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn import impute
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
import pandas as pd 

ppl = Pipeline(steps=[
    ('imputation', impute.SimpleImputer(strategy='most_frequent'))
    ,('scaling', StandardScaler())
    ,('classification', KNeighborsClassifier(n_jobs=4))
])
cv_params = {
    'classification__n_neighbors' : [i for i in range(3, 11)]
    ,'classification__weights' : ['uniform', 'distance']
    , 'classification__p' : [2]
}
search = GridSearchCV(ppl, cv_params, n_jobs=4, verbose=3, cv=4)
search.fit(X_train, y_train)
search.best_params_

Fitting 5 folds for each of 16 candidates, totalling 80 fits
Wall time: 3min 24s


{'classification__n_neighbors': 10,
 'classification__p': 2,
 'classification__weights': 'uniform'}

In [31]:
%%time
search.score(X_train, y_train)

Wall time: 34.2 s


0.8578053499585394

In [32]:
%%time
search.score(X_test, y_test)

Wall time: 16.9 s


0.8288802899084823

In [33]:
%%time
res = search.predict(X_test)
res_df = pd.DataFrame(np.concatenate((y_test[:, None], res[:, None]), axis=1))
res_df.columns = ['target', 'prediction']
from pandasql import sqldf
query = """
select
    eval as RESULT
    ,count(*) as CNT
from
    (select
        target
        ,prediction
        ,case
            when target = prediction and target = 1 then 'true positive'
            when target = prediction and target = 0 then 'true negative'
            when target <> prediction and target = 1 then 'false negative'
            else 'false positive'
        end as eval
    from res_df)
group by eval
"""
sqldf(query)

Unnamed: 0,RESULT,CNT
0,false negative,1888
1,false positive,898
2,true negative,11537
3,true positive,1958


In [19]:
%%time
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn import impute
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
import pandas as pd 

ppl = Pipeline(steps=[
    ('imputation', impute.SimpleImputer(strategy='most_frequent'))
    ,('scaling', StandardScaler())
    ,('classification', GaussianNB())
])
search = ppl
search.fit(X_train, y_train)

Wall time: 164 ms


Pipeline(steps=[('imputation', SimpleImputer(strategy='most_frequent')),
                ('scaling', StandardScaler()),
                ('classification', GaussianNB())])

In [20]:
%%time
search.score(X_train, y_train)

Wall time: 65.1 ms


0.7693559780105034

In [21]:
%%time
search.score(X_test, y_test)

Wall time: 36 ms


0.7705300657207789

In [23]:
%%time
res = search.predict(X_test)
res_df = pd.DataFrame(np.concatenate((y_test[:, None], res[:, None]), axis=1))
res_df.columns = ['target', 'prediction']
from pandasql import sqldf
query = """
select
    eval as RESULT
    ,count(*) as CNT
from
    (select
        target
        ,prediction
        ,case
            when target = prediction and target = 1 then 'true positive'
            when target = prediction and target = 0 then 'true negative'
            when target <> prediction and target = 1 then 'false negative'
            else 'false positive'
        end as eval
    from res_df)
group by eval
"""
sqldf(query)

Wall time: 280 ms


Unnamed: 0,RESULT,CNT
0,false negative,631
1,false positive,3105
2,true negative,9330
3,true positive,3215


In [10]:
%%time
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn import impute
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
import pandas as pd 

ppl = Pipeline(steps=[
    ('imputation', impute.SimpleImputer(strategy='most_frequent'))
#     ,('scaling', StandardScaler())
    ,('classification', DecisionTreeClassifier(class_weight = 'balanced'))
])
cv_params = {
    'classification__criterion' : ['gini', 'entropy']
    ,'classification__ccp_alpha' : np.logspace(-6, 0, 1000)
}
search = GridSearchCV(ppl, cv_params, n_jobs=4, cv=4, verbose=3)
search.fit(X_train, y_train)
search.best_params_

Fitting 4 folds for each of 2000 candidates, totalling 8000 fits
Wall time: 16min 32s


{'classification__ccp_alpha': 1.567455410205595e-05,
 'classification__criterion': 'entropy'}

In [11]:
%%time
search.score(X_train, y_train)

Wall time: 29.7 ms


0.9999692884125181

In [12]:
%%time
search.score(X_test, y_test)

Wall time: 19.8 ms


0.8126036484245439

In [13]:
%%time
res = search.predict(X_test)
res_df = pd.DataFrame(np.concatenate((y_test[:, None], res[:, None]), axis=1))
res_df.columns = ['target', 'prediction']
from pandasql import sqldf
query = """
select
    eval as RESULT
    ,count(*) as CNT
from
    (select
        target
        ,prediction
        ,case
            when target = prediction and target = 1 then 'true positive'
            when target = prediction and target = 0 then 'true negative'
            when target <> prediction and target = 1 then 'false negative'
            else 'false positive'
        end as eval
    from res_df)
group by eval
"""
sqldf(query)

Wall time: 440 ms


Unnamed: 0,RESULT,CNT
0,false negative,1499
1,false positive,1552
2,true negative,10883
3,true positive,2347


In [16]:
%%time
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn import impute
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import BaggingClassifier

base_estim = DecisionTreeClassifier(class_weight='balanced', ccp_alpha=1e-4)
ppl = Pipeline(steps=[
    ('imputation', impute.SimpleImputer(strategy='most_frequent'))
#     ,('scaling', StandardScaler())
    ,('classification', BaggingClassifier(base_estim, n_jobs=4))
])
cv_params = {
    'classification__n_estimators' : [1,5,15,25,50,100,150,200,250,300,500,100]
}
search = GridSearchCV(ppl, cv_params, n_jobs=4, cv=4, verbose=3)
search.fit(X_train, y_train)
search.best_params_

Fitting 4 folds for each of 12 candidates, totalling 48 fits
Wall time: 5min 31s


{'classification__n_estimators': 250}

In [17]:
%%time
search.score(X_train, y_train)

Wall time: 2.71 s


0.9183993120604405

In [18]:
%%time
search.score(X_test, y_test)

Wall time: 965 ms


0.8475523616485474

In [19]:
%%time
res = search.predict(X_test)
res_df = pd.DataFrame(np.concatenate((y_test[:, None], res[:, None]), axis=1))
res_df.columns = ['target', 'prediction']
from pandasql import sqldf
query = """
select
    eval as RESULT
    ,count(*) as CNT
from
    (select
        target
        ,prediction
        ,case
            when target = prediction and target = 1 then 'true positive'
            when target = prediction and target = 0 then 'true negative'
            when target <> prediction and target = 1 then 'false negative'
            else 'false positive'
        end as eval
    from res_df)
group by eval
"""
sqldf(query)

Wall time: 1.09 s


Unnamed: 0,RESULT,CNT
0,false negative,904
1,false positive,1578
2,true negative,10857
3,true positive,2942


In [25]:
%%time
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn import impute
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

ppl = Pipeline(steps=[
    ('imputation', impute.SimpleImputer(strategy='most_frequent'))
#     ,('scaling', StandardScaler())
    ,('classification', RandomForestClassifier(class_weight='balanced', n_jobs=4))
])
cv_params = {
    'classification__n_estimators' : [1,5,15,25,50,100,150,200,250,300,500,100]
    ,'classification__ccp_alpha' : np.logspace(-6, 0, 20)
}
search = GridSearchCV(ppl, cv_params, n_jobs=4, cv=4, verbose=3)
search.fit(X_train, y_train)
search.best_params_

Fitting 4 folds for each of 240 candidates, totalling 960 fits
Wall time: 25min 2s


{'classification__ccp_alpha': 3.792690190732254e-05,
 'classification__n_estimators': 250}

In [26]:
%%time
search.score(X_train, y_train)

Wall time: 455 ms


0.9601670710359018

In [27]:
%%time
search.score(X_test, y_test)

Wall time: 238 ms


0.8538787543762668

In [28]:
%%time
res = search.predict(X_test)
res_df = pd.DataFrame(np.concatenate((y_test[:, None], res[:, None]), axis=1))
res_df.columns = ['target', 'prediction']
from pandasql import sqldf
query = """
select
    eval as RESULT
    ,count(*) as CNT
from
    (select
        target
        ,prediction
        ,case
            when target = prediction and target = 1 then 'true positive'
            when target = prediction and target = 0 then 'true negative'
            when target <> prediction and target = 1 then 'false negative'
            else 'false positive'
        end as eval
    from res_df)
group by eval
"""
sqldf(query)

Wall time: 334 ms


Unnamed: 0,RESULT,CNT
0,false negative,1111
1,false positive,1268
2,true negative,11167
3,true positive,2735


In [33]:
%%time
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn import impute
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier

ppl = Pipeline(steps=[
    ('imputation', impute.SimpleImputer(strategy='most_frequent'))
#     ,('scaling', StandardScaler())
    ,('classification', AdaBoostClassifier())
])
cv_params = {
    'classification__n_estimators' : [1,5,15,25,50,100,150,200,250,300,500,100]
    ,'classification__learning_rate' : np.logspace(-2, 1, 20)
}
search = GridSearchCV(ppl, cv_params, n_jobs=4, cv=4, verbose=3)
search.fit(X_train, y_train)
search.best_params_

Fitting 4 folds for each of 240 candidates, totalling 960 fits
Wall time: 29min 29s


{'classification__learning_rate': 1.1288378916846884,
 'classification__n_estimators': 500}

In [34]:
%%time
search.score(X_train, y_train)

Wall time: 3.15 s


0.873007585762108

In [35]:
%%time
search.score(X_test, y_test)

Wall time: 1.6 s


0.870523923591917

In [36]:
%%time
res = search.predict(X_test)
res_df = pd.DataFrame(np.concatenate((y_test[:, None], res[:, None]), axis=1))
res_df.columns = ['target', 'prediction']
from pandasql import sqldf
query = """
select
    eval as RESULT
    ,count(*) as CNT
from
    (select
        target
        ,prediction
        ,case
            when target = prediction and target = 1 then 'true positive'
            when target = prediction and target = 0 then 'true negative'
            when target <> prediction and target = 1 then 'false negative'
            else 'false positive'
        end as eval
    from res_df)
group by eval
"""
sqldf(query)

Wall time: 1.8 s


Unnamed: 0,RESULT,CNT
0,false negative,1362
1,false positive,746
2,true negative,11689
3,true positive,2484


In [11]:
%%time
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn import impute
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import HistGradientBoostingClassifier

ppl = Pipeline(steps=[
    ('imputation', impute.SimpleImputer(strategy='most_frequent'))
#     ,('scaling', StandardScaler())
    ,('classification', HistGradientBoostingClassifier())
])
cv_params = {
    'classification__learning_rate' : np.logspace(-3, 0, 20)
    ,'classification__l2_regularization' : np.logspace(-3, 0, 10)
}
search = GridSearchCV(ppl, cv_params, n_jobs=4, cv=4, verbose=3)
search.fit(X_train, y_train)
search.best_params_

Fitting 4 folds for each of 200 candidates, totalling 800 fits
Wall time: 9min 15s


{'classification__l2_regularization': 0.21544346900318823,
 'classification__learning_rate': 0.07847599703514611}

In [12]:
%%time
search.score(X_train, y_train)

Wall time: 180 ms


0.8831424096311539

In [13]:
%%time
search.score(X_test, y_test)

Wall time: 118 ms


0.873717830600086

In [15]:
%%time
import pandas as pd
res = search.predict(X_test)
res_df = pd.DataFrame(np.concatenate((y_test[:, None], res[:, None]), axis=1))
res_df.columns = ['target', 'prediction']
from pandasql import sqldf
query = """
select
    eval as RESULT
    ,count(*) as CNT
from
    (select
        target
        ,prediction
        ,case
            when target = prediction and target = 1 then 'true positive'
            when target = prediction and target = 0 then 'true negative'
            when target <> prediction and target = 1 then 'false negative'
            else 'false positive'
        end as eval
    from res_df)
group by eval
"""
sqldf(query)

Wall time: 438 ms


Unnamed: 0,RESULT,CNT
0,false negative,1338
1,false positive,718
2,true negative,11717
3,true positive,2508


In [17]:
%%time
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn import impute
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier

ppl = Pipeline(steps=[
    ('imputation', impute.SimpleImputer(strategy='most_frequent'))
    ,('scaling', MinMaxScaler())
    ,('classification', MLPClassifier(learning_rate='adaptive', random_state=1))
])
cv_params = {
    'classification__hidden_layer_sizes' : [(6), (10), (16), (24), (32), (50), 
                                            (50, 32), (50, 24), (50, 16)
                                           ,(32, 24), (32, 16), (32, 10)
                                           ,(24, 16), (24, 10), (24, 6)
                                           ,(16, 10), (16, 6), (10, 6)]
    ,'classification__activation' : ['relu', 'logistic', 'tanh']
    ,'classification__alpha' : np.logspace(-6, 0, 20)
}
search = GridSearchCV(ppl, cv_params, n_jobs=4, cv=4, verbose=3)
search.fit(X_train, y_train)
search.best_params_

Fitting 4 folds for each of 1080 candidates, totalling 4320 fits
Wall time: 4h 49min 5s


{'classification__activation': 'tanh',
 'classification__alpha': 0.05455594781168514,
 'classification__hidden_layer_sizes': (50, 16)}

In [None]:
%%time
search.score(X_train, y_train)

In [None]:
%%time
search.score(X_test, y_test)

In [None]:
%%time
import pandas as pd
res = search.predict(X_test)
res_df = pd.DataFrame(np.concatenate((y_test[:, None], res[:, None]), axis=1))
res_df.columns = ['target', 'prediction']
from pandasql import sqldf
query = """
select
    eval as RESULT
    ,count(*) as CNT
from
    (select
        target
        ,prediction
        ,case
            when target = prediction and target = 1 then 'true positive'
            when target = prediction and target = 0 then 'true negative'
            when target <> prediction and target = 1 then 'false negative'
            else 'false positive'
        end as eval
    from res_df)
group by eval
"""
sqldf(query)