In [9]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import r2_score, mean_squared_error as MSE
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

Создадим новый pipeline для модели, которую будем экспортировать в Fast API сервис. В этот pipeline включим:
- приведение признаков `mileage`, `engine`, `max_power` и `seats` к числовому типу
- заполнение пропусков в значениях признаков медианой
- удаление дубликатов
- удаление признака `selling_price` (поскольку изначально он есть в тестовой выборке)
- удаление признаков `name` и `torque`
- кодирование категориальных признаков при помощи one-hot-encoding
- нормализация числовых признаков

In [2]:
# Locations of the train / test CSV files read by this notebook.
TRAIN_CSV_URL = 'https://raw.githubusercontent.com/Murcha1990/MLDS_ML_2022/main/Hometasks/HT1/cars_train.csv'
TEST_CSV_URL = 'https://raw.githubusercontent.com/Murcha1990/MLDS_ML_2022/main/Hometasks/HT1/cars_test.csv'

df_train = pd.read_csv(TRAIN_CSV_URL)
df_test = pd.read_csv(TEST_CSV_URL)

In [3]:
# The target column, plus two further columns that are not used as features.
TARGET_COLUMN = 'selling_price'
DROPPED_COLUMNS = [TARGET_COLUMN, 'torque', 'name']

# `columns=` already selects the column axis, so no explicit `axis` is needed.
X_train = df_train.drop(columns=DROPPED_COLUMNS)
y_train = df_train[TARGET_COLUMN]
X_test = df_test.drop(columns=DROPPED_COLUMNS)
y_test = df_test[TARGET_COLUMN]

In [4]:
def extract_numeric_from_cols(df, columns=('mileage', 'engine', 'max_power', 'seats')):
    """Return a copy of ``df`` with the given columns converted to floats.

    Each cell is converted as follows:

    * a string is split on whitespace, the digits and dots of its first
      token are kept and parsed as a float (``'23.4 kmpl'`` -> ``23.4``);
    * an ``int`` / ``float`` (including NumPy scalars) is returned as a float;
    * anything else -- ``None``, booleans, empty or whitespace-only strings,
      tokens without digits, tokens that do not form a valid number -- becomes
      ``np.nan``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    columns : iterable of str, optional
        Columns to convert. Defaults to the four columns this notebook
        converts: ``mileage``, ``engine``, ``max_power`` and ``seats``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the selected columns replaced by their numeric
        conversion.

    Raises
    ------
    KeyError
        If one of ``columns`` is missing from ``df``.
    """
    def extract_numeric(x):
        # bool is a subclass of int; keep treating it as "not a number".
        if isinstance(x, bool):
            return np.nan
        # Accept integers too: an int64 column would otherwise be turned
        # into NaN because its elements are not exactly of type float.
        if isinstance(x, (int, float, np.integer, np.floating)):
            return float(x)
        if not isinstance(x, str):
            return np.nan

        tokens = x.split()
        if not tokens:
            # Empty or whitespace-only string: nothing to parse.
            return np.nan

        cleaned = ''.join(ch for ch in tokens[0] if ch.isdigit() or ch == '.')
        try:
            return float(cleaned)
        except ValueError:
            # '' (no digits), '.', '1.2.3', non-ASCII digits, ...
            return np.nan

    df = df.copy()
    for column in columns:
        df[column] = df[column].apply(extract_numeric)
    return df

In [5]:
# Show the feature frame as loaded, before extract_numeric_from_cols is applied.
X_train

Unnamed: 0,year,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,seats
0,2014,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,74 bhp,5.0
1,2014,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp,5.0
2,2010,127000,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396 CC,90 bhp,5.0
3,2007,120000,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298 CC,88.2 bhp,5.0
4,2017,45000,Petrol,Individual,Manual,First Owner,20.14 kmpl,1197 CC,81.86 bhp,5.0
...,...,...,...,...,...,...,...,...,...,...
6994,2013,110000,Petrol,Individual,Manual,First Owner,18.5 kmpl,1197 CC,82.85 bhp,5.0
6995,2007,119000,Diesel,Individual,Manual,Fourth & Above Owner,16.8 kmpl,1493 CC,110 bhp,5.0
6996,2009,120000,Diesel,Individual,Manual,First Owner,19.3 kmpl,1248 CC,73.9 bhp,5.0
6997,2013,25000,Diesel,Individual,Manual,First Owner,23.57 kmpl,1396 CC,70 bhp,5.0


In [6]:
# Convert the columns handled by extract_numeric_from_cols to floats,
# then display the result.
X_train = extract_numeric_from_cols(X_train)
X_train

Unnamed: 0,year,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,seats
0,2014,145500,Diesel,Individual,Manual,First Owner,23.40,1248.0,74.00,5.0
1,2014,120000,Diesel,Individual,Manual,Second Owner,21.14,1498.0,103.52,5.0
2,2010,127000,Diesel,Individual,Manual,First Owner,23.00,1396.0,90.00,5.0
3,2007,120000,Petrol,Individual,Manual,First Owner,16.10,1298.0,88.20,5.0
4,2017,45000,Petrol,Individual,Manual,First Owner,20.14,1197.0,81.86,5.0
...,...,...,...,...,...,...,...,...,...,...
6994,2013,110000,Petrol,Individual,Manual,First Owner,18.50,1197.0,82.85,5.0
6995,2007,119000,Diesel,Individual,Manual,Fourth & Above Owner,16.80,1493.0,110.00,5.0
6996,2009,120000,Diesel,Individual,Manual,First Owner,19.30,1248.0,73.90,5.0
6997,2013,25000,Diesel,Individual,Manual,First Owner,23.57,1396.0,70.00,5.0


In [32]:
# Feature groups. `seats` is listed with the categoricals, so it is one-hot
# encoded rather than scaled.
numeric_features = ['year', 'km_driven', 'mileage', 'engine', 'max_power']
categorical_features = ['fuel', 'seller_type', 'transmission', 'owner', 'seats']

# Numeric branch: fill missing values with the column median, then rescale
# with MinMaxScaler.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler()),
])

# Categorical branch: one-hot encoding only (no scaling happens here).
categorical_transformer = Pipeline([
    ('ohe', OneHotEncoder(drop='first', handle_unknown='ignore', dtype='byte')),
])

# Route each feature group to its branch.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numeric_features),
    ('ohe', categorical_transformer, categorical_features),
])

# Full estimator: preprocessing followed by a Ridge regressor.
# NOTE: extract_numeric_from_cols is not a step of this pipeline; in this
# notebook it is applied to the frames before fit / predict.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', Ridge()),
])

In [35]:
# Fit the preprocessing steps and the regressor on the training data.
pipeline.fit(X_train, y_train)

In [34]:
# Apply the same numeric conversion to the test features that was applied
# to X_train, then display the result.
X_test = extract_numeric_from_cols(X_test)
X_test

Unnamed: 0,year,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,seats
0,2010,168000,Diesel,Individual,Manual,First Owner,14.00,2498.0,112.00,7.0
1,2017,25000,Diesel,Individual,Manual,First Owner,21.50,1497.0,108.50,5.0
2,2007,218463,Petrol,Individual,Automatic,First Owner,12.90,1799.0,130.00,5.0
3,2015,173000,Diesel,Individual,Manual,First Owner,25.10,1498.0,98.60,5.0
4,2011,70000,Petrol,Individual,Manual,Second Owner,16.50,1172.0,65.00,5.0
...,...,...,...,...,...,...,...,...,...,...
995,2008,100000,Petrol,Individual,Manual,Second Owner,19.81,1086.0,68.05,5.0
996,2017,50000,Petrol,Individual,Manual,Second Owner,18.60,1197.0,81.83,5.0
997,2009,40000,Diesel,Individual,Manual,First Owner,23.00,1396.0,90.00,5.0
998,2012,25000,Petrol,Individual,Manual,First Owner,20.36,1197.0,78.90,5.0


In [36]:
# Score the fitted pipeline on the test features and report R^2.
y_pred = pipeline.predict(X_test)
test_r2 = r2_score(y_true=y_test, y_pred=y_pred)
print("R2 на тестовой выборке:", test_r2)

R2 на тестовой выборке: 0.6889718178774099
