In [119]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from datetime import date
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics import mean_absolute_error
from sklego.preprocessing import RepeatingBasisFunction

%matplotlib inline

In [6]:
!pip install sklego



In [215]:
# Location of the training table (fetched over HTTP on every run).
TRAIN_CSV_URL = "https://raw.githubusercontent.com/katarina74/ml_lessons/main/lesson_2/data/techparams_train.csv"

# Read the whole table into memory and preview its first rows.
df = pd.read_csv(TRAIN_CSV_URL)
df.head()

Unnamed: 0,index,back-suspension,battery-capacity,charge-time,compression,consumption-mixed,cylinders-order,cylinders-value,engine-feeding,engine-start,...,configurations_front-brake,configurations_safety-rating,configurations_seats,configurations_tank-volume,supergen_year-stop,models_country-from,models_group,models_light-and-commercial,models_male,target
0,0,9,-1.0,36457,9.0,4.3,0,3,4,2006,...,1,2,13,40.0,2018.0,16,3,0,1,2360
1,2,3,-1.0,44872,8.0,-1.0,3,7,4,1982,...,4,2,13,108.0,1993.0,34,3,0,1,3060
2,4,3,-1.0,55927,16.0,4.2,0,4,5,2014,...,4,2,13,55.0,2019.0,35,3,0,1,2648
3,5,0,-1.0,41405,10.3,-1.0,0,4,4,2000,...,4,2,13,55.0,2003.0,10,3,0,1,2513
4,7,8,-1.0,22523,19.0,-1.0,0,4,8,2000,...,4,1,13,62.0,2005.0,10,3,0,1,2703


In [217]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(43245, 32)

In [219]:
# Number of missing (NaN) entries in each column.
df.isna().sum()

index                             0
back-suspension                   0
battery-capacity                  0
charge-time                       0
compression                       0
consumption-mixed                 0
cylinders-order                   0
cylinders-value                   0
engine-feeding                    0
engine-start                      0
engine-stop                       0
engine-type                       0
gear-type                         0
luxury                            0
max-speed                         0
power-electro-kw                  0
supply-system                     0
valves                            0
valvetrain                        0
weight                            0
configurations_auto-premiere      0
configurations_back-wheel-base    0
configurations_front-brake        0
configurations_safety-rating      0
configurations_seats              0
configurations_tank-volume        0
supergen_year-stop                0
models_country-from         

In [221]:
# Per-column dtype and non-null count summary.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43245 entries, 0 to 43244
Data columns (total 32 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   index                           43245 non-null  int64  
 1   back-suspension                 43245 non-null  int64  
 2   battery-capacity                43245 non-null  float64
 3   charge-time                     43245 non-null  int64  
 4   compression                     43245 non-null  float64
 5   consumption-mixed               43245 non-null  float64
 6   cylinders-order                 43245 non-null  int64  
 7   cylinders-value                 43245 non-null  int64  
 8   engine-feeding                  43245 non-null  int64  
 9   engine-start                    43245 non-null  int64  
 10  engine-stop                     43245 non-null  int64  
 11  engine-type                     43245 non-null  int64  
 12  gear-type                       

Данные: технические характеристики разных конфигураций автомобилей

Таргет: длина колесной базы конфигурации (в миллиметрах).

Задача: добиться наименьшего MSE на test.

## Baseline

In [225]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [227]:
# Label is kept as a one-column DataFrame; features are every other column.
# NOTE(review): the `index` column stays among the features — confirm that a
# row identifier is meant to be used as a predictor.
y = df[["target"]]
X = df.drop(columns=["target"])

# Fixed random_state keeps the 67/33 split reproducible between runs.
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [229]:
# Fit a linear regression on the training split; `fit` returns the estimator itself.
linear_model = LinearRegression()
reg = linear_model.fit(train_X, train_y)

In [231]:
# Report R^2 and MSE on both splits.
# sklearn metrics take (y_true, y_pred) in that order. MSE happens to be
# symmetric, but keeping the documented order avoids silent errors if the
# metric is later swapped for an asymmetric one.
print('R^2_train =', reg.score(train_X, train_y))
print('R^2_test =', reg.score(test_X, test_y))
print('MSE_train =', mean_squared_error(train_y, reg.predict(train_X)))
print('MSE_test =', mean_squared_error(test_y, reg.predict(test_X)))

R^2_train = 0.45554067850400504
R^2_test = 0.4646243987792361
MSE_train = 29840.12125086529
MSE_test = 31021.024399854425


In [233]:
# Print the distinct values of every column to spot placeholder codes.
for col_name, col_values in df.items():
    print(f'Значения в столбце {col_name}:')
    print(col_values.unique())

Значения в столбце index:
[    0     2     4 ... 61226 61227 61228]
Значения в столбце back-suspension:
[ 9  3  0  8 10  6  7  1  4  5  2]
Значения в столбце battery-capacity:
[ -1.    13.2   21.     1.56  41.    10.7    1.3   23.     6.5   90.
  18.4  100.    14.1    9.2  130.    24.    75.    13.5    1.4   17.6
  12.    21.3   31.3   58.    78.    60.    22.    18.8   40.     4.4
  30.    45.     8.7  120.    62.    50.    10.4    1.1   27.2  200.
  33.     6.8   93.4    1.31  64.    32.6    9.27  11.6    7.6    5.5
  14.4   16.    79.2   41.4   31.2   28.     1.5   13.    77.    17.1
   7.9   42.2   47.17  95.    37.    16.5   25.5   17.    11.2   10.8
  10.5   70.    39.2    7.7   71.  ]
Значения в столбце charge-time:
[36457 44872 55927 ...  8958 20293 52459]
Значения в столбце compression:
[ 9.    8.   16.   10.3  19.    9.6  16.5  -1.   21.    8.5   9.3   9.8
 18.   10.    9.9  11.   17.    9.5  10.4   8.8  10.2  10.5  10.8   9.2
 22.    9.7  17.9  10.7  10.1  18.3  11.2   8.2  

Среди значений столбца engine-stop присутствуют 0. Скорее всего, это означает, что производство ещё не завершено, поэтому для модели будет лучше заменить их на 2019 год (максимальный встречающийся).

In [236]:
# Year written in place of the 0 placeholder found in `engine-stop`.
ENGINE_STOP_FILL_YEAR = 2019
df['engine-stop'] = df['engine-stop'].replace({0: ENGINE_STOP_FILL_YEAR})

In [238]:
# How many rows carry the -1 value in battery-capacity.
df['battery-capacity'].eq(-1).sum()

43093

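Во многих столбцах присутствует значение -1. Это может свидетельствовать об ошибке в данных, однако в столбце battery-capacity таких значений аномально много, поэтому строки с ними по этому столбцу удалять не будем. Уберём строки с неположительными значениями max-speed и consumption-mixed, а также с некорректными годами (engine-stop раньше engine-start).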

In [241]:
# Keep a row only when all three checks hold:
#   - max-speed is strictly positive,
#   - consumption-mixed is strictly positive,
#   - engine-stop is not earlier than engine-start.
rows_to_keep = (
    df['max-speed'].gt(0)
    & df['consumption-mixed'].gt(0)
    & df['engine-stop'].ge(df['engine-start'])
)
df = df[rows_to_keep]
df.shape

(22229, 32)

In [243]:
# Summary statistics of the `valves` column before trimming.
df['valves'].describe()

count    22229.000000
mean        29.534527
std        273.117459
min          0.000000
25%          2.000000
50%          2.000000
75%          2.000000
max       4134.000000
Name: valves, dtype: float64

In [253]:
# Keep only rows whose `valves` value is at or below the 99th percentile.
# The cutoff is computed once per kernel session: recomputing it from the
# already-trimmed `df` would remove a further slice of rows every time this
# cell is re-executed, so the result would depend on how often it was run.
# After changing the quantile or the upstream data, run `del valves_cutoff`
# (or restart the kernel) so the cutoff is recomputed.
if 'valves_cutoff' not in globals():
    valves_cutoff = df['valves'].quantile(0.99)
df = df[df['valves'] <= valves_cutoff]

In [255]:
# Summary statistics of the `valves` column after trimming.
df['valves'].describe()

count    21873.000000
mean         1.607644
std          1.138112
min          0.000000
25%          2.000000
50%          2.000000
75%          2.000000
max         52.000000
Name: valves, dtype: float64

In [259]:
# Rebuild features and label from the cleaned frame; the label stays a
# one-column DataFrame.
y = df[["target"]]
X = df.drop(columns=["target"])

# Same split parameters as the baseline so the two runs use the same procedure.
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [261]:
# Refit the linear regression on the cleaned training split; `fit` returns the estimator.
linear_model = LinearRegression()
reg = linear_model.fit(train_X, train_y)

In [263]:
# Report R^2 and MSE on both splits of the cleaned data.
# sklearn metrics take (y_true, y_pred) in that order. MSE happens to be
# symmetric, but keeping the documented order avoids silent errors if the
# metric is later swapped for an asymmetric one.
print('R^2_train =', reg.score(train_X, train_y))
print('R^2_test =', reg.score(test_X, test_y))
print('MSE_train =', mean_squared_error(train_y, reg.predict(train_X)))
print('MSE_test =', mean_squared_error(test_y, reg.predict(test_X)))

R^2_train = 0.5736264659005756
R^2_test = 0.5772092424751429
MSE_train = 16498.690693339417
MSE_test = 15614.54269295461
