In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the pre-cleaned dataset from the working directory and preview it.
data = pd.read_csv(filepath_or_buffer='clean_data.csv')
data.head(n=5)

Unnamed: 0,Protocol_number,Product,Component_1,Component_2,Component_3,Component_4,Component_5,Component_6,Component_7,Component_8,...,Component_126,Component_127,Component_128,Component_129,Component_130,Component_131,Component_132,Days_analysis,Year,Samples_number
0,8107,427,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,23.0,2020,218
1,4553,427,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,23.0,2020,218
2,9831,427,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,18.0,2020,163
3,10580,427,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,16.0,2020,360
4,1238,427,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,27.0,2020,234


# Построение модели для определения вклада каждого компонента
Сначала оценим вклад каждого компонента в длительность анализа: построим модель, которая предсказывает количество дней по составу продукта и количеству образцов, и посмотрим на её коэффициенты.

Для этого построим линейную регрессию.

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Predictors: every column except the target and the Protocol_number /
# Product / Year columns, which are deliberately kept out of the model.
X = data.drop(columns=['Protocol_number', 'Product', 'Days_analysis', 'Year'])
# Target: number of days spent on the analysis.
Y = data.loc[:, 'Days_analysis']
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=18,
)

In [3]:
def rmse(y_pred, y):
    """Return the root-mean-squared error between predictions and targets.

    The implementation is self-contained (NumPy only), so the function does
    not depend on a metric import that happens in a later notebook cell.

    Parameters
    ----------
    y_pred, y : array-like
        Two collections holding the same number of numeric values. The metric
        is symmetric, so the order of the two arguments does not matter.
        Inputs are converted to plain float arrays and flattened, so a pandas
        Series is compared positionally (its index is ignored) and an
        ``(n, 1)`` column is treated like a flat vector of length ``n``.

    Returns
    -------
    numpy.float64
        Square root of the mean squared difference.

    Raises
    ------
    ValueError
        If the inputs hold a different number of values, or are empty.
    """
    predicted = np.asarray(y_pred, dtype=float).ravel()
    actual = np.asarray(y, dtype=float).ravel()
    if predicted.size != actual.size:
        raise ValueError(
            'rmse: inputs have inconsistent numbers of samples: '
            f'{predicted.size} != {actual.size}'
        )
    if predicted.size == 0:
        raise ValueError('rmse: at least one sample is required')
    # Flattening first rules out accidental (n,) vs (n, 1) broadcasting,
    # which would silently produce an n-by-n difference matrix.
    return np.sqrt(np.mean((predicted - actual) ** 2))

In [4]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
def err_check(y_train, y_train_pred, y_test, y_pred):
    """Print MAE, RMSE and R2 for the train and test splits.

    Each metric is reported first for the training pair
    (``y_train``, ``y_train_pred``) and then for the test pair
    (``y_test``, ``y_pred``), rounded to two decimals. Nothing is returned.
    """
    report = (
        ('MAE train: ', mean_absolute_error, y_train, y_train_pred),
        ('MAE test: ', mean_absolute_error, y_test, y_pred),
        ('RMSE train: ', rmse, y_train, y_train_pred),
        ('RMSE test: ', rmse, y_test, y_pred),
        ('R2 train: ', r2_score, y_train, y_train_pred),
        ('R2 test: ', r2_score, y_test, y_pred),
    )
    for label, metric, actual, predicted in report:
        print(label, np.round(metric(actual, predicted), 2))

In [6]:
from sklearn.linear_model import LinearRegression

# Linear-regression baseline: fit on the training split, then score both splits.
model = LinearRegression().fit(X_train, y_train)
y_train_pred = model.predict(X_train)
y_pred = model.predict(X_test)
err_check(y_train, y_train_pred, y_test, y_pred)

MAE train:  8.42
MAE test:  8.67
RMSE train:  11.07
RMSE test:  11.31
R2 train:  0.31
R2 test:  0.29


In [7]:
# Raw fitted coefficients of the linear model, one per column of X
# (the next cell pairs them with X.columns to make them readable).
model.coef_

array([ 1.19874917e+00, -2.92737001e+00,  3.02957733e+00,  1.01020464e+00,
       -8.38356931e-01, -1.92678078e+00, -1.16662012e+00,  5.08792939e+00,
        1.04162065e+00,  1.69213304e+00,  2.17378127e+00,  3.39255689e+00,
        5.11635751e-02,  3.65452777e+00, -3.91661710e+00,  1.67344297e+01,
       -8.45440443e-01, -4.77601398e+00, -4.97672616e+00, -3.25204771e+00,
       -5.90243410e+00, -6.03954065e+00, -1.15342437e+01,  1.89862682e+00,
        5.41620874e+00, -7.10407624e-01,  4.69915380e+00, -6.33168815e+00,
        2.65038638e+00, -1.78625427e+01, -6.50004498e-01, -2.04190226e+00,
       -6.50004498e-01,  1.47256426e+00, -5.61225917e+00, -6.41543544e+00,
        5.55387345e+00, -3.78464790e+00, -1.79556451e+00, -1.01488824e+00,
        1.75815081e+00, -3.51720846e+00, -2.44978845e+00, -8.75132191e-01,
       -3.02073006e-01,  7.54618313e-01,  1.49935262e+00,  3.33140318e+00,
       -3.82274555e+00,  7.34781520e+00, -4.17748361e+00,  4.44587597e+00,
        5.00193366e-01,  

In [8]:
# Label each linear-model coefficient with its feature name and rank the
# features from the largest coefficient to the smallest.
# NOTE(review): the ranking uses the signed coefficient, so features with a
# large negative coefficient end up at the bottom of the table - confirm this
# is the intended meaning of "Importance".
feature_importance = (
    pd.DataFrame({'Importance': model.coef_}, index=X.columns)
    .sort_values(by='Importance', ascending=False)
)
# Persist the full ranking, then show the top 20 rows in the notebook.
feature_importance.to_csv('feature importance.csv')
feature_importance.head(n=20)

Unnamed: 0,Importance
Component_130,22.293416
Component_126,19.15529
Component_16,16.73443
Component_94,14.16219
Component_117,13.813904
Component_129,11.312858
Component_103,10.173747
Component_90,8.28268
Component_102,7.402841
Component_100,7.402841


# Построение модели для прогнозирования

Теперь построим модели для прогнозирования количества дней на основании состава продукта и количества образцов для анализа.

In [9]:
# Average of the target column, rounded to two decimals; serves as a
# reference scale for the error metrics reported further down.
np.round(data.loc[:, 'Days_analysis'].mean(), decimals=2)

19.31

Среднее количество дней на анализ продукта — 19,31. Будем использовать это значение как ориентир при оценке ошибок модели (MAE и RMSE).

Чтобы получить более точную модель, используем градиентный бустинг. Гиперпараметры подберём перебором по сетке (GridSearchCV) в несколько этапов, постепенно сужая диапазоны.

In [10]:
# Rebuild the feature matrix and target for the boosting experiments.
# The arguments match the earlier split cell, so the same train/test
# partition is produced.
excluded = ['Protocol_number', 'Product', 'Days_analysis', 'Year']
X = data.drop(labels=excluded, axis=1)
Y = data.get('Days_analysis') if False else data['Days_analysis']
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, random_state=18, test_size=0.3
)
del excluded  # keep the notebook namespace identical to the original cell

In [8]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

In [11]:
# Stage 1: coarse exhaustive search over the main gradient-boosting
# hyperparameters. GridSearchCV keeps its default cross-validation and
# scoring settings (no cv= / scoring= is passed).
#
# The grid holds 5 * 3 * 3 * 5 * 4 = 900 candidates, each refitted once per
# CV fold, so the fits are distributed over all CPU cores with n_jobs=-1.
# This shortens wall-clock time only: the candidates, the folds and the
# fixed random_state of the estimator are unchanged.
base_model = GradientBoostingRegressor(random_state=15)
param_grid = {'n_estimators':[50,100,500,1000,250],
    'max_depth':[3,4,10],
    'min_samples_split':[2,5,10],
    'learning_rate':[0.01, 0.1, 0.05, 0.25, 1],
    'loss':['squared_error','absolute_error', 'huber', 'quantile']}
model = GridSearchCV(base_model, param_grid=param_grid, n_jobs=-1)
model.fit(X_train, y_train)
print(model.best_params_)

{'learning_rate': 0.1, 'loss': 'squared_error', 'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 250}


In [13]:
# Stage 2: refine around the stage-1 optimum with learning_rate fixed at 0.1.
# 3 * 3 * 3 = 27 candidates, each refitted once per CV fold; n_jobs=-1 runs
# the fits on all CPU cores instead of one after another. The search space
# and the estimator seed are untouched.
base_model = GradientBoostingRegressor(random_state=15, learning_rate=0.1)
param_grid = {'n_estimators':[200, 250, 300],
    'max_depth':[7,10,15],
    'min_samples_split':[2,3,4]}
model = GridSearchCV(base_model, param_grid=param_grid, n_jobs=-1)
model.fit(X_train, y_train)
print(model.best_params_)

{'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 300}


In [15]:
# Stage 3: extend n_estimators upwards and re-check max_depth next to 10,
# with min_samples_split pinned to 2.
# 5 * 3 * 1 = 15 candidates, each refitted once per CV fold; n_jobs=-1 runs
# the fits on all CPU cores instead of serially. The search space and the
# estimator seed are untouched.
base_model = GradientBoostingRegressor(random_state=15, learning_rate=0.1)
param_grid = {'n_estimators':[250, 300, 350, 400, 450],
    'max_depth':[10, 8, 12],
    'min_samples_split':[2]}
model = GridSearchCV(base_model, param_grid=param_grid, n_jobs=-1)
model.fit(X_train, y_train)
print(model.best_params_)

{'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 300}


In [11]:
# Final model: gradient boosting with the hyperparameters hard-coded below,
# trained on the training split and scored on both splits.
model = GradientBoostingRegressor(
    n_estimators=300,
    max_depth=10,
    min_samples_split=2,
    learning_rate=0.1,
    random_state=15,
).fit(X_train, y_train)
y_train_pred = model.predict(X_train)
y_pred = model.predict(X_test)
err_check(y_train, y_train_pred, y_test, y_pred)

MAE train:  4.06
MAE test:  5.73
RMSE train:  6.63
RMSE test:  9.0
R2 train:  0.75
R2 test:  0.55
