In [1]:
import pandas as pd; pd.set_option('display.max_columns', None)
import numpy as np
import dill
import pickle
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import f1_score, roc_auc_score, precision_score, classification_report, precision_recall_curve, confusion_matrix, log_loss
import lightgbm as lgbm

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

from sklearn.base import BaseEstimator, TransformerMixin

from scipy.sparse import hstack
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.pipeline import FeatureUnion


In [2]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.key]
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]
    
class OHEEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
        self.columns = []

    def fit(self, X, y=None):
        self.columns = [col for col in pd.get_dummies(X, prefix=self.key).columns]
        return self

    def transform(self, X):
        X = pd.get_dummies(X, prefix=self.key)
        test_columns = [col for col in X.columns]
        for col_ in test_columns:
            if col_ not in self.columns:
                X[col_] = 0
        return X[self.columns]

In [3]:
df = pd.read_csv('./data/dataset.csv')

In [4]:
df

Unnamed: 0,Age,BMI,Glucose,Insulin,HOMA,Leptin,Adiponectin,Resistin,MCP.1,Classification
0,48,23.500000,70,2.707,0.467409,8.8071,9.702400,7.99585,417.114,1
1,83,20.690495,92,3.115,0.706897,8.8438,5.429285,4.06405,468.786,1
2,82,23.124670,91,4.498,1.009651,17.9393,22.432040,9.27715,554.697,1
3,68,21.367521,77,3.226,0.612725,9.8827,7.169560,12.76600,928.220,1
4,86,21.111111,92,3.549,0.805386,6.6994,4.819240,10.57635,773.920,1
...,...,...,...,...,...,...,...,...,...,...
111,45,26.850000,92,3.330,0.755688,54.6800,12.100000,10.96000,268.230,2
112,62,26.840000,100,4.530,1.117400,12.4500,21.420000,7.32000,330.160,2
113,65,32.050000,97,5.730,1.370998,61.4800,22.540000,10.33000,314.050,2
114,72,25.590000,82,2.820,0.570392,24.9600,33.750000,3.27000,392.460,2


In [16]:
df.describe()

Unnamed: 0,Age,BMI,Glucose,Insulin,HOMA,Leptin,Adiponectin,Resistin,MCP_1,target
count,116.0,116.0,116.0,116.0,116.0,116.0,116.0,116.0,116.0,116.0
mean,57.301724,27.582111,97.793103,10.012086,2.694988,26.61508,10.180874,14.725966,534.647,0.551724
std,16.112766,5.020136,22.525162,10.067768,3.642043,19.183294,6.843341,12.390646,345.912663,0.499475
min,24.0,18.37,60.0,2.432,0.467409,4.311,1.65602,3.21,45.843,0.0
25%,45.0,22.973205,85.75,4.35925,0.917966,12.313675,5.474282,6.881763,269.97825,0.0
50%,56.0,27.662416,92.0,5.9245,1.380939,20.271,8.352692,10.82774,471.3225,1.0
75%,71.0,31.241442,102.0,11.18925,2.857787,37.3783,11.81597,17.755207,700.085,1.0
max,89.0,38.578759,201.0,58.46,25.050342,90.28,38.04,82.1,1698.44,1.0


In [5]:
df['Classification'] = np.where(df['Classification'] == 1, 0, df['Classification'])
df['Classification'] = np.where(df['Classification'] == 2, 1, df['Classification'])

In [6]:
df = df.rename(columns={df.columns[-1]: "target"})
df = df.rename(columns={df.columns[-2]: "MCP_1"})
df['target'].value_counts()

1    64
0    52
Name: target, dtype: int64

In [15]:
df

Unnamed: 0,Age,BMI,Glucose,Insulin,HOMA,Leptin,Adiponectin,Resistin,MCP_1,target
0,48,23.500000,70,2.707,0.467409,8.8071,9.702400,7.99585,417.114,0
1,83,20.690495,92,3.115,0.706897,8.8438,5.429285,4.06405,468.786,0
2,82,23.124670,91,4.498,1.009651,17.9393,22.432040,9.27715,554.697,0
3,68,21.367521,77,3.226,0.612725,9.8827,7.169560,12.76600,928.220,0
4,86,21.111111,92,3.549,0.805386,6.6994,4.819240,10.57635,773.920,0
...,...,...,...,...,...,...,...,...,...,...
111,45,26.850000,92,3.330,0.755688,54.6800,12.100000,10.96000,268.230,1
112,62,26.840000,100,4.530,1.117400,12.4500,21.420000,7.32000,330.160,1
113,65,32.050000,97,5.730,1.370998,61.4800,22.540000,10.33000,314.050,1
114,72,25.590000,82,2.820,0.570392,24.9600,33.750000,3.27000,392.460,1


In [7]:
X_train, X_test, y_train, y_test = train_test_split(df, 
                                                    df['target'], test_size=0.3, random_state=42)
#save test
X_test.to_csv("X_test.csv", index=None)
y_test.to_csv("y_test.csv", index=None)
#save train
X_train.to_csv("X_train.csv", index=None)
y_train.to_csv("y_train.csv", index=None)

In [8]:
continuous_columns = df.select_dtypes(include=[np.number]).drop('target',1)
print(f"count of numeric_features {continuous_columns.shape[1]}")
continuous_columns = continuous_columns.columns.to_list()
continuous_columns


count of numeric_features 9


['Age',
 'BMI',
 'Glucose',
 'Insulin',
 'HOMA',
 'Leptin',
 'Adiponectin',
 'Resistin',
 'MCP_1']

In [9]:
cat_feature_num = [
    feature for feature in continuous_columns
    if len(df[feature].unique())<20
]
categorical_columns = df.select_dtypes(include=[np.object]).columns.to_list()
categorical_columns = list(categorical_columns + cat_feature_num)
categorical_columns

[]

In [10]:
transformers = list()

for col in continuous_columns:
    transformer = Pipeline([
                ('selector', NumberSelector(key=col)),
                ('scaler', StandardScaler())
            ])
    transformers.append((col, transformer))

In [11]:
feats_prep = FeatureUnion(transformers)

feature_processing = Pipeline([('feats_prep', feats_prep)])


In [12]:
pipeline = Pipeline([
    ('features',feature_processing),
    ('classifier', lgbm.LGBMClassifier(random_state = 42)),
])


#запустим кросс-валидацию
cv_scores = cross_val_score(pipeline, X_train, y_train, cv=10 ,scoring='roc_auc')
cv_score = np.mean(cv_scores)
cv_score_std = np.std(cv_scores)
print('CV score is {}+-{}'.format(cv_score, cv_score_std))

#обучим пайплайн на всем тренировочном датасете
pipeline.fit(X_train, y_train)
y_predict = pipeline.predict(X_test)
y_score = pipeline.predict_proba(X_test)[:, 1]

CV score is 0.805+-0.1522607412740833


In [13]:
result = []
precision, recall, thresholds = precision_recall_curve(y_test, y_score)


fscore = (2 * precision * recall) / (precision + recall)
roc = roc_auc_score(y_test, y_predict)
log_los = log_loss(y_test, y_score)

# locate the index of the largest f score
ix = np.argmax(fscore)
print('Best Threshold=%f, F-Score=%.3f, Roc_auc=%.3f, Log_loss=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                        fscore[ix],
                                                                        roc,
                                                                        log_los,
                                                                        precision[ix],
                                                                        recall[ix]
                                                                                     ))
result.append({"method":"light_gbm_normal" ,"roc_auc" : roc, "fscore" : fscore[ix],
               "precision" : precision[ix], "recall" : recall[ix],
               "log_los" : log_los
              })

Best Threshold=0.400505, F-Score=0.789, Roc_auc=0.714, Log_loss=0.742, Precision=0.750, Recall=0.833


In [14]:
with open("C:/Users/Андрей/PycharmProjects/cource_proj_api/models/lightgbm_pipeline.dill", "wb") as f:
    dill.dump(pipeline, f)