In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the fish measurements from a CSV given by relative path and preview
# the first five rows.
csv_path = 'fish.csv'
df = pd.read_csv(csv_path)
df.head(5)

Unnamed: 0,Species,Weight,Length1,Length2,Length3,Height,Width
0,Bream,242.0,23.2,25.4,30.0,11.52,4.02
1,Bream,290.0,24.0,26.3,31.2,12.48,4.3056
2,Bream,340.0,23.9,26.5,31.1,12.3778,4.6961
3,Bream,363.0,26.3,29.0,33.5,12.73,4.4555
4,Bream,430.0,26.5,29.0,34.0,12.444,5.134


In [6]:
# One-hot encode the categorical column(s) using the 'Species' prefix.
# drop_first=True leaves out the first level, so a row with all Species_*
# indicators equal to zero belongs to that omitted baseline level.
df = pd.get_dummies(df, prefix='Species', drop_first=True)
df.head(5)

Unnamed: 0,Weight,Length1,Length2,Length3,Height,Width,Species_Parkki,Species_Perch,Species_Pike,Species_Roach,Species_Smelt,Species_Whitefish
0,242.0,23.2,25.4,30.0,11.52,4.02,0,0,0,0,0,0
1,290.0,24.0,26.3,31.2,12.48,4.3056,0,0,0,0,0,0
2,340.0,23.9,26.5,31.1,12.3778,4.6961,0,0,0,0,0,0
3,363.0,26.3,29.0,33.5,12.73,4.4555,0,0,0,0,0,0
4,430.0,26.5,29.0,34.0,12.444,5.134,0,0,0,0,0,0


In [7]:
# Separate the regression target from the predictors.
y = df['Weight']
X = df.drop(columns='Weight')

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=10
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits.
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

In [8]:
# Fit a linear regression on the standardised training features.
# LinearRegression is imported in the notebook's top import cell so that all
# dependencies are declared in one place.
clf = LinearRegression()
clf.fit(X_train_std, y_train)

LinearRegression()

In [9]:
# Predictions for both splits; the metric cells below compare them with the
# true targets.
y_pred_train = clf.predict(X_train_std)
y_pred = clf.predict(X_test_std)

In [12]:
# Mean squared error on both splits, rounded to a whole number.
# np.round() is used instead of the scalar's own .round() method so the cell
# works whether the metric comes back as a NumPy scalar or a built-in float
# (a built-in float has no .round() method).
mse_train = mean_squared_error(y_train, y_pred_train)
mse_test = mean_squared_error(y_test, y_pred)
print(f'MSE Train: {np.round(mse_train)}')
print(f'MSE Test: {np.round(mse_test)}')

MSE Train: 8782.0
MSE Test: 6743.0


In [13]:
# Root mean squared error on both splits (square root of the MSE), rounded to
# a whole number.
rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'RMSE Train: {np.round(rmse_train)}')
print(f'RMSE Test: {np.round(rmse_test)}')

RMSE Train: 94.0
RMSE Test: 82.0


In [14]:
# Mean absolute error on both splits, rounded to a whole number.
# np.round() is used instead of the scalar's own .round() method so the cell
# works whether the metric comes back as a NumPy scalar or a built-in float
# (a built-in float has no .round() method).
mae_train = mean_absolute_error(y_train, y_pred_train)
mae_test = mean_absolute_error(y_test, y_pred)
print(f'MAE Train: {np.round(mae_train)}')
print(f'MAE Test: {np.round(mae_test)}')

MAE Train: 70.0
MAE Test: 68.0


In [15]:
def r2_adjusted(y_true: 'pd.Series | np.ndarray', y_pred: 'pd.Series | np.ndarray',
                X_test: 'pd.DataFrame | np.ndarray') -> float:
    """
    Adjusted coefficient of determination for a multiple regression.

    Computes ``1 - (1 - R2) * (n - 1) / (n - h - 1)``, where ``R2`` is the
    plain coefficient of determination returned by ``r2_score``, ``n`` is the
    number of observations and ``h`` is the number of predictors.

    Args:
        y_true: Observed target values; its length gives ``n``.
        y_pred: Predicted target values, aligned with ``y_true``.
        X_test: Two-dimensional feature matrix the predictions were made
            from. Only ``X_test.shape[1]`` is read (giving ``h``), so despite
            the name it can be the matrix of any split.

    Returns:
        The adjusted R2 as a float.

    Raises:
        ValueError: If ``n <= h + 1``; the adjustment divides by
            ``n - h - 1``, so it is undefined without more observations than
            predictors plus one.
    """
    n = len(y_true)
    h = X_test.shape[1]

    # Residual degrees of freedom; must be positive for the formula to hold.
    dof = n - h - 1
    if dof <= 0:
        raise ValueError(
            f'adjusted R2 is undefined for n={n} observations and h={h} '
            f'predictors: need n > h + 1'
        )

    r2 = r2_score(y_true, y_pred)

    return 1 - (1 - r2) * (n - 1) / dof

In [16]:
# Report the adjusted R2 (as computed by r2_adjusted) for each split; the
# feature matrix passed in only supplies the number of predictors.
r2_adj_train = r2_adjusted(y_train, y_pred_train, X_train_std)
r2_adj_test = r2_adjusted(y_test, y_pred, X_test_std)
print(f'R2 Train: {r2_adj_train}')
print(f'R2 Test: {r2_adj_test}')

R2 Train: 0.9327817851164735
R2 Test: 0.7274484887203888
