In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn import linear_model

## ПАНО

In [11]:
df_Time_AT = pd.read_csv('Time_AT_FR_LETI.csv', delimiter = ';')
df_Time_AT = df_Time_AT.drop(['Unnamed: 0', 'Unnamed: 4'], axis=1)
df_Time_AT

Unnamed: 0,FileNum,AT_time,fast_phase_time
0,0,475.0,915.0
1,1,585.0,1060.0
2,2,475.0,880.0
3,3,500.0,1130.0
4,4,620.0,1080.0
...,...,...,...
1488,1496,815.0,1145.0
1489,1497,830.0,1270.0
1490,1498,840.0,1130.0
1491,1499,885.0,1195.0


## МАССА

In [12]:
df_massa = pd.read_csv('D:/anaerobic_treashold/Massa.csv', delimiter = ';')
df_massa = df_massa.drop(['Unnamed: 0.1', 'Unnamed: 0'], axis=1)
df_massa.head()

Unnamed: 0,massa,file_id
0,105.0,0
1,100.0,1
2,107.0,2
3,102.6,3
4,102.4,4


## Все вместе в покое

In [13]:
df_Values_rest = pd.read_csv('D:/anaerobic_treashold/Values_rest.csv')
df_all = df_Values_rest.drop(['file_id'], axis = 1)
massa_column= df_massa['massa']
df_Time_AT['massa'] = massa_column
df_all['massa'] = massa_column
AT_time_column= df_Time_AT['AT_time']
df_all['AT_time'] = AT_time_column
df_all

Unnamed: 0,HR_rest,V'E_rest,V'O2_rest,V'CO2_rest,RER_rest,O2/HR_rest,EqO2_rest,EqCO2_rest,PETO2_rest,PETCO2_rest,BF_rest,massa,AT_time
0,88.79,16.00,544.79,443.64,0.81,6.17,27.69,34.24,14.11,4.89,17.06,105.0,475.0
1,88.65,18.03,551.53,471.79,0.86,6.23,29.53,34.01,14.71,4.58,21.29,100.0,585.0
2,103.50,19.06,759.97,521.97,0.69,7.34,23.17,33.49,13.69,4.76,19.19,107.0,475.0
3,88.57,17.43,631.17,543.03,0.87,7.03,25.68,29.41,14.01,5.08,16.54,102.6,500.0
4,84.41,15.82,574.29,496.03,0.86,6.84,24.79,28.70,13.72,5.26,15.38,102.4,620.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1488,95.47,15.44,472.28,343.22,0.72,4.88,30.55,42.16,15.12,3.67,17.34,92.0,815.0
1489,74.32,11.00,363.41,276.29,0.77,4.81,24.42,31.74,14.87,4.17,19.24,69.0,830.0
1490,68.79,11.21,398.42,303.36,0.77,5.73,25.02,32.32,14.82,4.29,14.39,68.0,840.0
1491,73.91,13.35,437.82,333.88,0.78,5.91,26.13,33.26,14.47,4.11,19.41,68.0,885.0


## Обучаем модель (дерево решений) 

In [14]:
df_all = df_all.dropna(axis=0)
df_all.columns

Index(['HR_rest', 'V'E_rest', 'V'O2_rest', 'V'CO2_rest', 'RER_rest',
       'O2/HR_rest', 'EqO2_rest', 'EqCO2_rest', 'PETO2_rest', 'PETCO2_rest',
       'BF_rest', 'massa', 'AT_time'],
      dtype='object')

Делим на тренировочный и тестовый наборы

In [15]:
feature_columns = ['HR_rest', "V'E_rest", "V'O2_rest", "V'CO2_rest", 'RER_rest',
       'O2/HR_rest', 'EqO2_rest', 'EqCO2_rest', 'PETO2_rest', 'PETCO2_rest',
       'BF_rest', 'massa']
X = df_all[feature_columns]
y = df_all['AT_time']
train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.33, random_state=42)


In [16]:
AT_time_model = DecisionTreeRegressor(random_state=42)

# Fit the model
AT_time_model.fit(train_X, train_y)

In [36]:
print("Making predictions for the following 5 AT:")
print(val_y.head())
print("The predictions are")
val_predictions = AT_time_model.predict(val_X)
print(AT_time_model.predict(val_X.head()))

Making predictions for the following 5 AT:
950     600.0
901     840.0
1339    465.0
984     790.0
959     635.0
Name: AT_time, dtype: float64
The predictions are
[675. 600. 585. 720. 730.]


**Найдем оптимальное количество листьев**

In [24]:
clf = DecisionTreeRegressor(random_state=42)
parametrs = { 'max_depth': range (1,20,2)}
grid = GridSearchCV(clf, parametrs)
grid.fit(train_X, train_y)
grid.best_params_


{'max_depth': 3}

**Переобучим** для лучшего значения МАЕ

In [25]:
final_AT_time_model = DecisionTreeRegressor(max_leaf_nodes = grid.best_params_.get('max_depth') ,random_state=42)

# Fit the model
final_AT_time_model.fit(train_X, train_y)

print("Making predictions for the following 5 AT:")
print(y)
print("The predictions are")
#val_predictions = AT_time_model.predict(val_X)
print(final_AT_time_model.predict(X))

Making predictions for the following 5 AT:
0       475.0
1       585.0
2       475.0
3       500.0
4       620.0
        ...  
1488    815.0
1489    830.0
1490    840.0
1491    885.0
1492    910.0
Name: AT_time, Length: 1492, dtype: float64
The predictions are
[661.20847652 661.20847652 661.20847652 ... 897.93478261 661.20847652
 897.93478261]


In [26]:
final_AT_time_model.score(val_X, val_y)

0.08860362326107718

**Графики**

In [62]:
#final_AT_time_model.plot(x = val_X, y = val_y', kind = 'scatter')
#plt.scatter(val_X, val_y, color="black")
#plt.plot(val_X, val_predictions, color="blue")


SyntaxError: unterminated string literal (detected at line 1) (530957630.py, line 1)

**Оценки**

In [40]:
val_mae = mean_absolute_error(val_y, val_predictions)
val_mse = mean_squared_error(val_y, val_predictions)
val_rmse = mean_squared_error(val_y, val_predictions, squared = False)

print("Validation MAE for Random Forest Model: {}".format(val_mae))
print("Validation MSE for Random Forest Model: {}".format(val_mse))
print("Validation RMSE for Random Forest Model: {}".format(val_rmse))


Validation MAE for Random Forest Model: 129.8580121703854
Validation MSE for Random Forest Model: 27921.501014198784
Validation RMSE for Random Forest Model: 167.09728009216303


## Обучаем модель (random forest) 

In [52]:

rf_model = RandomForestRegressor(random_state = 42)

# fit your model
rf_model.fit(train_X, train_y)
preds = rf_model.predict(val_X)

# Calculate the mean absolute error of your Random Forest model on the validation data
rf_val_mae = mean_absolute_error(val_y, preds)
rf_val_mse = mean_squared_error(val_y, preds)
rf_val_rmse = mean_squared_error(val_y, preds, squared = False)

print("Validation MAE for Random Forest Model: {}".format(rf_val_mae))
print("Validation MSE for Random Forest Model: {}".format(rf_val_mse))
print("Validation RMSE for Random Forest Model: {}".format(rf_val_rmse))

Validation MAE for Random Forest Model: 93.07139959432048
Validation MSE for Random Forest Model: 14498.281622718052
Validation RMSE for Random Forest Model: 120.4088104032178


In [43]:
rf_model.score(val_X, val_y)

0.2604996751337716

## Обучаем модель (линейная регрессия) 

In [21]:
reg = linear_model.LinearRegression()

In [44]:
reg.fit(train_X, train_y)

In [53]:
reg_y_pred = reg.predict(val_X)

In [45]:
reg.coef_

array([-2.05908221e+00, -9.45011059e+00,  2.99650569e-01, -1.23975561e-01,
       -8.60571859e+02,  2.57403457e+01,  2.50172058e+01, -1.83287097e+01,
        2.29366144e+01,  4.17473483e+01,  3.51651623e+00, -4.61747805e+00])

In [46]:
reg.intercept_

1138.1772759550045

**Оценки**

In [54]:
reg_val_mae = mean_absolute_error(val_y, reg_y_pred)
reg_val_mse = mean_squared_error(val_y, reg_y_pred)
reg_val_rmse = mean_squared_error(val_y, reg_y_pred, squared = False)

print("Validation MAE for Random Forest Model: {}".format(reg_val_mae))
print("Validation MSE for Random Forest Model: {}".format(reg_val_mse))
print("Validation RMSE for Random Forest Model: {}".format(reg_val_rmse))


Validation MAE for Random Forest Model: 97.81556678193724
Validation MSE for Random Forest Model: 15777.890408993244
Validation RMSE for Random Forest Model: 125.61007288029589


In [47]:
reg.score(val_X, val_y)

0.19523186355605837