In [14]:
from sklearn import linear_model
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

In [None]:
# 

In [33]:
class TubRiskModel:
    
    def __init__(self, df, params):
        '''
        Прогнозируем риск туберкулеза.
        
        Параметры:
            - df: характеристики дома(тип вентиляции, год постройки, ...)
                  год?
                  кол-во жильцов дома
                  кол-во болеющих
                  y - прибавилось ли больных по сравнению с прошлым годом??
            - params: гиперпараметры лог.регрессии
        '''
        self.df = df
        self.prepro()
        
        self.log_r = linear_model.LogisticRegression(params)
    
    
    def prepro(self):
        '''
        Предобрабатываем данные:
            - ...
            - обрабатываем категориальные переменные (OHE или ...)
            - разбиваем по выборкам
        '''
        
        self.x, self.y = self.df.iloc[:, :-1], self.df.iloc[:, -1]
        self.x_train, self.x_test, self.y_train, self.y_test = \
                            train_test_split(self.x, self.y, test_size=0.3, 
                                             random_state=42, shuffle=False)
        
        
    def fit(self):
        self.log_r.fit(self.x_train, self.y_train)

    
    def predict(self, x):
        y_predict = self.log_r.predict_proba(x) # ?
        
        return y_predict
        

In [16]:
rng = np.random.default_rng()

In [66]:
n = 1000
vent = rng.integers(low=0, high=3, size=n) 
b_type = rng.integers(low=0, high=5, size=n) 
year = rng.integers(low=2020, high=2022, size=n)
n_h = rng.integers(low=50, high=500, size=n) 
n_t = rng.integers(low=0, high=20, size=n) 
y = rng.integers(low=0, high=2, size=n) 

df = pd.DataFrame([vent,b_type,year,n_h,n_t]).T
df.columns = ['vent','b_type','year','n_h','n_t']

df = pd.get_dummies(df, columns=['vent','b_type'], drop_first=True)
df = pd.concat([df, pd.Series(y, name='y')], axis='columns')
df

Unnamed: 0,year,n_h,n_t,vent_1,vent_2,b_type_1,b_type_2,b_type_3,b_type_4,y
0,2021,188,17,True,False,False,False,True,False,0
1,2021,219,16,True,False,False,False,True,False,1
2,2021,494,14,True,False,False,False,False,True,1
3,2021,269,18,True,False,False,True,False,False,0
4,2020,76,19,True,False,False,False,False,True,0
...,...,...,...,...,...,...,...,...,...,...
995,2021,443,15,True,False,False,False,False,False,0
996,2020,292,15,True,False,False,False,False,False,1
997,2020,184,6,False,True,False,False,False,False,1
998,2021,384,3,False,False,False,False,False,False,0


In [67]:
df.iloc[:,-1].value_counts()

y
0    526
1    474
Name: count, dtype: int64

In [68]:
model = TubRiskModel(df, 'none')
model.fit()

In [69]:
model.log_r.coef_, model.log_r.intercept_

(array([[-2.08187342e-04,  3.04005263e-04,  9.47484704e-03,
          2.97635118e-01,  6.47697412e-02,  3.11407785e-02,
         -1.79631820e-01,  7.97582325e-02, -2.21359897e-02]]),
 array([-0.00021072]))

In [70]:
# 0, 1. Тогда здесь вторая колонка -- вероятность, что увеличатся заболевшие?
model.predict(model.x_test)[:10]

array([[0.52417442, 0.47582558],
       [0.51845574, 0.48154426],
       [0.51657167, 0.48342833],
       [0.48071438, 0.51928562],
       [0.5556297 , 0.4443703 ],
       [0.53980086, 0.46019914],
       [0.51301536, 0.48698464],
       [0.47264085, 0.52735915],
       [0.47229851, 0.52770149],
       [0.53957079, 0.46042921]])

In [71]:
# насколько совпадают
model.log_r.score(model.x_test, model.y_test)

0.5

In [72]:
from sklearn.metrics import f1_score

f1_score(model.y_test,  model.log_r.predict(model.x_test))

0.33628318584070793

In [75]:
from sklearn.metrics import RocCurveDisplay
RocCurveDisplay.from_predictions(model.y_test,  model.log_r.predict(model.x_test))

AttributeError: type object 'RocCurveDisplay' has no attribute 'from_predictions'

In [2]:
log_r = linear_model.LogisticRegression()
log_r.fit()
log_r.predict_proba() #?

In [8]:
params = ['l2']
log_r = linear_model.LogisticRegression(params)
log_r

LogisticRegression(penalty=['l2'])