# Домашнее задание

**Стандартная версия**
Нужно реализовать REST API на базе Flask.

1. выбрать себе датасет (который интересен или нравится больше всего, можно глянуть здесь https://economic-caper-a4c.notion.site/d062c410b90145bca90fc23b1348c813), сделать pipeline (преобразования + модель), сохранить его на диск. Если не хочется пайплайн, то можно без него, но так вам же будет удобнее потом вызывать его из кода сервиса.
2. Реализовать ноутбук с сервером
3. Реализовать ноутбук с клиентом

### Step 1 - TRAIN

### Обучение пайплайна

1. Соберем пайплайн с простейшим препроцессингом (StandardScaler) на числовых признаках
2. Обучим логистическую регрессию и сохраним обученный пайплайн на диск

In [1]:
import pandas as pd
import dill
import numpy as np
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.linear_model import LogisticRegression
import lightgbm as lgbm, catboost as catb

from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve
from sklearn.metrics import f1_score

#normalizing data
from sklearn.preprocessing import StandardScaler

#pipeline
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import precision_score,recall_score

#imputer
from sklearn.impute import SimpleImputer

import sklearn.datasets

#flask
from flask import Flask, request, jsonify

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the semicolon-separated training data. The separator is passed by
# keyword because pandas 2.0+ accepts only the path positionally.
df_cardio = pd.read_csv("train_case2.csv", sep=';')
df_cardio.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [3]:
df_cardio['cardio'].value_counts()

0    35021
1    34979
Name: cardio, dtype: int64

Разделим данные на train/test и сохраним тестовую выборку на диск

In [4]:
# Split the data into train/test parts. The label column is removed from the
# feature frame with the explicit `columns=` keyword: the positional axis
# argument of DataFrame.drop is not accepted by pandas 2.0+.
X_train, X_test, y_train, y_test = train_test_split(
    df_cardio.drop(columns='cardio'),
    df_cardio['cardio'],
    random_state=0,
)

# save test (without the row index)
X_test.to_csv("X_test.csv", index=False)
y_test.to_csv("y_test.csv", index=False)

# save train (without the row index)
X_train.to_csv("X_train.csv", index=False)
y_train.to_csv("y_train.csv", index=False)

К полям применим следующие преобразования:
- age, weight, ap_lo — StandardScaler

In [24]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """
    Stateless transformer that extracts one column from its input.

    The input is indexed with the plain key (``X[key]``), so the column is
    returned as-is, ready for further per-column transformations.
    """
    def __init__(self, key):
        # Name of the column to extract.
        self.key = key

    def fit(self, X, y=None):
        # Nothing is learned; fitting just hands back the transformer.
        return self

    def transform(self, X):
        column_name = self.key
        selected = X[column_name]
        return selected
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Stateless transformer that extracts one numeric column from its input.

    The input is indexed with a one-element list (``X[[key]]``), so the
    selection keeps a two-dimensional, single-column shape.
    """
    def __init__(self, key):
        # Name of the numeric column to extract.
        self.key = key

    def fit(self, X, y=None):
        # Nothing is learned; fitting just hands back the transformer.
        return self

    def transform(self, X):
        wanted_columns = [self.key]
        return X[wanted_columns]

In [6]:
# Columns that are fed to the model; each one gets its own scaling branch
features = ['age', 'weight', 'ap_lo']
# Name of the label column in the dataset
target = 'cardio'

Feature engineering

In [8]:
# One (name, pipeline) pair per feature: select the column, then standardise it.
continuos_transformers = [
    (
        column,
        Pipeline([
            ('selector', NumberSelector(key=column)),
            ('standard', StandardScaler()),
        ]),
    )
    for column in features
]

In [9]:
# Combine the per-column pipelines into a single feature-building step
feats = FeatureUnion(transformer_list=continuos_transformers)

Добавим простейший классификатор

In [10]:
%%time

# Full model: the feature union followed by a logistic regression
# created with its default settings.
pipeline = Pipeline([
    ('features', feats),
    ('classifier', LogisticRegression()),
])

# Fit every step of the pipeline on the training split
pipeline.fit(X_train, y_train)

CPU times: total: 188 ms
Wall time: 114 ms


Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('age',
                                                 Pipeline(steps=[('selector',
                                                                  NumberSelector(key='age')),
                                                                 ('standard',
                                                                  StandardScaler())])),
                                                ('weight',
                                                 Pipeline(steps=[('selector',
                                                                  NumberSelector(key='weight')),
                                                                 ('standard',
                                                                  StandardScaler())])),
                                                ('ap_lo',
                                                 Pipeline(steps=[('selector',
                                         

Посмотрим, как выглядит наш pipeline

In [11]:
pipeline.steps

[('features',
  FeatureUnion(transformer_list=[('age',
                                  Pipeline(steps=[('selector',
                                                   NumberSelector(key='age')),
                                                  ('standard',
                                                   StandardScaler())])),
                                 ('weight',
                                  Pipeline(steps=[('selector',
                                                   NumberSelector(key='weight')),
                                                  ('standard',
                                                   StandardScaler())])),
                                 ('ap_lo',
                                  Pipeline(steps=[('selector',
                                                   NumberSelector(key='ap_lo')),
                                                  ('standard',
                                                   StandardScaler())]))])),
 ('classifier', 

Сохраним модель (пайплайн)

In [12]:
# Serialise the fitted pipeline with dill; the file is closed on leaving the block
with open("logreg_pipeline.dill", "wb") as out_strm:
    dill.dump(pipeline, out_strm)

### Step 2 - PREDICT

### Проверка работоспособности и качества пайплайна

Загружаем модель (pipeline) напрямую и проверяем на отложенной (тестовой) выборке

In [13]:
# Reload the held-out split from the CSV files instead of reusing in-memory objects
X_test = pd.read_csv("X_test.csv")
# NOTE(review): read_csv presumably yields a one-column DataFrame here, not a Series — confirm consumers accept that
y_test = pd.read_csv("y_test.csv")

In [14]:
X_test.head(3)

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active
0,15505,15216,1,165,61.0,120,80,1,1,0,0,1
1,80282,19815,1,164,54.0,110,80,1,1,0,0,1
2,21224,20563,1,170,69.0,120,80,1,1,0,0,1


In [15]:
# Restore the pipeline from the dill file on disk
with open('logreg_pipeline.dill', 'rb') as pipeline_file:
    pipeline = dill.load(pipeline_file)

In [16]:
pipeline

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('age',
                                                 Pipeline(steps=[('selector',
                                                                  NumberSelector(key='age')),
                                                                 ('standard',
                                                                  StandardScaler())])),
                                                ('weight',
                                                 Pipeline(steps=[('selector',
                                                                  NumberSelector(key='weight')),
                                                                 ('standard',
                                                                  StandardScaler())])),
                                                ('ap_lo',
                                                 Pipeline(steps=[('selector',
                                         

In [17]:
# Keep only the second probability column (index 1) as the score
class_probabilities = pipeline.predict_proba(X_test)
preds = class_probabilities[:, 1]

# Persist the scores as a single-column CSV without the row index
pred_df = pd.DataFrame({'preds': preds})
pred_df.to_csv("test_predictions.csv", index=None)

In [18]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)

# precision/recall hold one more point than thresholds (the trailing
# precision=1, recall=0 point has no threshold), so only the leading points
# are scored; this keeps `ix` a valid index into all three arrays.
numerator = 2 * precision[:-1] * recall[:-1]
denominator = precision[:-1] + recall[:-1]
# Where precision + recall == 0 the F-score is defined as 0 rather than the
# NaN produced by 0/0, which np.argmax would otherwise select.
fscore = np.divide(numerator, denominator,
                   out=np.zeros_like(denominator, dtype=float),
                   where=denominator > 0)
# locate the index of the largest f score
ix = np.argmax(fscore)
print(f'Best Threshold={thresholds[ix]}, F-Score={fscore[ix]:.3f}, Precision={precision[ix]:.3f}, Recall={recall[ix]:.3f}')

Best Threshold=0.33737761192972876, F-Score=0.684, Precision=0.546, Recall=0.915


### Step 3 - FLASK

In [19]:
# Trial run of Flask

app = Flask(__name__)

@app.route("/a")
def hello():
    """Return a fixed greeting for requests to /a."""
    return "Hello World!"

# Start the built-in server on port 8082 only when executed as the main module
if __name__ == '__main__':
    app.run(port=8082)

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


 * Running on http://127.0.0.1:8082/ (Press CTRL+C to quit)


### **Создаем сервис для обработки запросов к модели**

In [20]:
# Load the trained model (the pipeline saved with dill)
with open('logreg_pipeline.dill', 'rb') as model_file:
    model = dill.load(model_file)

In [21]:
# Read the saved test features and labels back from disk
X_test = pd.read_csv("X_test.csv")
y_test = pd.read_csv("y_test.csv")

In [22]:
X_test

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active
0,15505,15216,1,165,61.0,120,80,1,1,0,0,1
1,80282,19815,1,164,54.0,110,80,1,1,0,0,1
2,21224,20563,1,170,69.0,120,80,1,1,0,0,1
3,89571,16625,1,167,57.0,110,70,1,1,0,0,1
4,67392,14451,1,158,110.0,120,80,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
17495,65913,19616,1,165,63.0,120,80,2,2,0,0,0
17496,8133,21149,1,153,54.0,120,80,1,1,0,0,1
17497,97894,15972,2,172,64.0,110,70,1,1,0,0,1
17498,21541,18814,1,156,63.0,130,90,3,1,0,1,0


Запускаем сервер и работаем с ним

In [23]:
# Request handlers and Flask start-up
app = Flask(__name__)

@app.route("/", methods=["GET"])
def general():
    """Answer GET / with a fixed welcome string."""
    greeting = "Welcome to prediction process"
    return greeting

@app.route('/predict', methods=['POST'])
def predict():
    """Score one observation with the loaded model.

    Expects a JSON object in the request body with the optional fields
    ``age``, ``weight`` and ``ap_lo``. A field that is missing, null or
    otherwise falsy is replaced by 0.

    Returns:
        A JSON response with ``success``, ``predictions`` (second column of
        ``model.predict_proba``) and the ``age`` that was used. If the body
        is not a JSON object, responds with HTTP 400, ``success: false`` and
        an ``error`` message.
    """
    data = {"success": False}

    # silent=True yields None for a missing or malformed JSON body instead of
    # raising, so bad requests get an explicit 400 rather than an unhandled 500.
    request_json = request.get_json(silent=True)
    if not isinstance(request_json, dict):
        data["error"] = "request body must be a JSON object"
        return jsonify(data), 400

    # .get() tolerates absent keys; `or 0` keeps the 0 default for falsy values.
    age = request_json.get("age") or 0
    weight = request_json.get("weight") or 0
    ap_lo = request_json.get("ap_lo") or 0

    print(age)
    # Single-row frame whose column names match what the pipeline selects
    preds = model.predict_proba(pd.DataFrame({"age": [age],
                                              "weight": [weight],
                                              "ap_lo": [ap_lo]}))

    # Cast the NumPy scalar to a built-in float so it serialises as plain JSON
    data["predictions"] = float(preds[:, 1][0])
    data["age"] = age
    # indicate that the request was a success
    data["success"] = True
    print('OK')

    # return the data dictionary as a JSON response
    return jsonify(data)


if __name__ == '__main__':
    app.run(port=8082)

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


 * Running on http://127.0.0.1:8082/ (Press CTRL+C to quit)
127.0.0.1 - - [01/Jan/2024 21:49:55] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:21] "POST //predict HTTP/1.1" 200 -


19810
OK


127.0.0.1 - - [01/Jan/2024 21:52:35] "POST //predict HTTP/1.1" 200 -


19815.0
OK


127.0.0.1 - - [01/Jan/2024 21:52:42] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:42] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:42] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:42] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:42] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:42] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:42] "POST //predict HTTP/1.1" 200 -


15216.0
OK
19815.0
OK
20563.0
OK
16625.0
OK
14451.0
OK
18976.0
OK
16123.0
OK
19646.0
OK

127.0.0.1 - - [01/Jan/2024 21:52:42] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:42] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:42] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:42] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:42] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:42] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:42] "POST //predict HTTP/1.1" 200 -



21316.0
OK
15529.0
OK
18735.0
OK
19691.0
OK
18762.0
OK
21313.0
OK
21685.0
OK

127.0.0.1 - - [01/Jan/2024 21:52:42] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:42] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:42] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:42] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:42] "POST //predict HTTP/1.1" 200 -



21921.0
OK
23190.0
OK
18207.0
OK
18321.0
OK
14698.0


127.0.0.1 - - [01/Jan/2024 21:52:43] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:43] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:43] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:43] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:43] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:43] "POST //predict HTTP/1.1" 200 -


OK
20974.0
OK
14529.0
OK
15485.0
OK
16935.0
OK
18993.0
OK
23361.0
OK

127.0.0.1 - - [01/Jan/2024 21:52:43] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:43] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:43] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:43] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:43] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:43] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:43] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:43] "POST //predict HTTP/1.1" 200 -



19073.0
OK
20491.0
OK
17443.0
OK
20570.0
OK
19502.0
OK
18127.0
OK
22367.0
OK
20187.0

127.0.0.1 - - [01/Jan/2024 21:52:43] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:43] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:43] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:43] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:43] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:43] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:43] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:43] "POST //predict HTTP/1.1" 200 -



OK
19523.0
OK
14607.0
OK
21297.0
OK
18010.0
OK
15966.0
OK
23620.0
OK
21198.0
OK


127.0.0.1 - - [01/Jan/2024 21:52:43] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:43] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:43] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:43] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:43] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:43] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:43] "POST //predict HTTP/1.1" 200 -


21210.0
OK
23328.0
OK
19975.0
OK
20664.0
OK
17522.0
OK
22485.0
OK
21307.0
OK
15983.0

127.0.0.1 - - [01/Jan/2024 21:52:43] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:43] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:43] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:43] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:43] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:43] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:43] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:44] "POST //predict HTTP/1.1" 200 -



OK
18329.0
OK
18172.0
OK
21067.0
OK
17764.0
OK
20468.0
OK
18219.0
OK
20316.0
OK
21051.0

127.0.0.1 - - [01/Jan/2024 21:52:44] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:44] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:44] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:44] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:44] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:44] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:44] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:44] "POST //predict HTTP/1.1" 200 -



OK
16691.0
OK
18332.0
OK
19932.0
OK
21846.0
OK
21922.0
OK
18705.0
OK
22566.0
OK
19706.0
OK

127.0.0.1 - - [01/Jan/2024 21:52:44] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:44] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:44] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:44] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:44] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:44] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:44] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:44] "POST //predict HTTP/1.1" 200 -



22812.0
OK
15456.0
OK
16729.0
OK
19038.0
OK
20990.0
OK
22698.0
OK
22026.0
OK
21298.0

127.0.0.1 - - [01/Jan/2024 21:52:44] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:44] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:44] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:44] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:44] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:44] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:44] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:44] "POST //predict HTTP/1.1" 200 -



OK
19687.0
OK
20413.0
OK
19781.0
OK
17557.0
OK
20714.0
OK
19019.0
OK
15910.0
OK
14504.0
OK

127.0.0.1 - - [01/Jan/2024 21:52:44] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:44] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:44] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:44] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:44] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:44] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:44] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:44] "POST //predict HTTP/1.1" 200 -



19532.0
OK
15214.0
OK
18882.0
OK
21807.0
OK
20724.0
OK
15226.0
OK
15109.0
OK
21333.0
OK

127.0.0.1 - - [01/Jan/2024 21:52:44] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:44] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:44] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:44] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:44] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:44] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:44] "POST //predict HTTP/1.1" 200 -



19054.0
OK
23391.0
OK
21763.0
OK
16834.0
OK
21373.0
OK
21852.0
OK
15435.0
OK

127.0.0.1 - - [01/Jan/2024 21:52:44] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:45] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:45] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:45] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Jan/2024 21:52:45] "POST //predict HTTP/1.1" 200 -



21243.0
OK
20621.0
OK
18928.0
OK
21097.0
OK


In [None]:
# NOTE(review): `totalDays` is not defined anywhere in the visible notebook, so
# this cell would raise NameError if run as-is. Presumably a leftover that
# converts a day count to whole years (integer division by 365) — confirm or remove.
years = totalDays//365
years