In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [3]:
def parse_engine(engine: list[str]):
    engine_type = None
    volume = None
    match engine:
        case [eng, vol, c]:
            engine_type = eng
            volume = float(vol.split()[0])
        case ['бензин' | 'дизель', last]:
            engine_type = engine[0]
            volume = float(last.split()[0]) if last.endswith(' л') else None
        case ['бензин' | 'дизель']:
            engine_type = engine[0]
        case [str, _]:
            volume = float(engine[0].split()[0])
    return {'Двигатель': engine_type, 'Объем': volume}
        
# engine_info = pd.DataFrame(df['Двигатель'].str.split(', ').apply(parse_engine).values.tolist())
# engine_info.head()

In [8]:
df = pd.read_csv('hyundai.csv', index_col=0, parse_dates=['publication_date'])
model_year = pd.DataFrame([(model, int(year)) for model, year in df.title.str.split(', ')], columns=['model', 'year'])
df = pd.concat((model_year, df), axis='columns').drop('title', axis='columns')

engine_info = pd.DataFrame(df['Двигатель'].str.split(', ').apply(parse_engine).values.tolist())
df = pd.concat((df.drop('Двигатель', axis='columns'), engine_info), axis='columns')
df['Пробег'] = df['Пробег'].replace("новый автомобиль", "0").str.replace(" ", "").str.extract(r'(\d+)').astype(float)
df['Мощность'] = df['Мощность'].str.extract(r'(\d+)').astype(float)
df['price'] = df['price'].str.replace(" ", "").str.extract(r'(\d+)').astype(float)
df['age'] = df.publication_date.dt.year - df.year

df = df.dropna(subset=['Пробег', 'Объем'])

  df = pd.read_csv('hyundai.csv', index_col=0, parse_dates=['publication_date'])


In [9]:
df.head()

Unnamed: 0,model,year,link,Мощность,Коробка передач,Привод,Тип кузова,Цвет,Пробег,Руль,Поколение,Комплектация,page_title,price,publication_date,Особые отметки,Двигатель,Объем,age
0,Hyundai Solaris,2021,https://auto.drom.ru/moscow/hyundai/solaris/20...,123.0,АКПП,передний,седан,серый,53000.0,левый,"2 поколение, рестайлинг",1.6 AT Elegance,"Продажа Hyundai Solaris, 2021 год в Москве",1790000.0,2024-02-08,,бензин,1.6,3.0
1,Hyundai Palisade,2022,https://auto.drom.ru/moscow/hyundai/palisade/8...,200.0,АКПП,4WD,,черный,60000.0,левый,1 поколение,2.2 CRDi AT 4WD Prestige 7 мест,"Продажа Hyundai Palisade, 2022 год в Москве",4900000.0,2024-07-27,документы с проблемами или отсутствуют,дизель,2.2,2.0
2,Hyundai i40,2014,https://auto.drom.ru/moscow/hyundai/i40/816037...,150.0,АКПП,передний,универсал,черный,88000.0,левый,1 поколение,,"Продажа Hyundai i40, 2014 год в Москве",1379000.0,2024-07-31,,бензин,2.0,10.0
3,Hyundai Santa Fe,2019,https://auto.drom.ru/moscow/hyundai/santa_fe/8...,200.0,АКПП,4WD,,белый,135000.0,левый,4 поколение,2.2 CRDi AT 4WD High-Tech 5 мест,"Продажа Hyundai Santa Fe, 2019 год в Москве",3690000.0,2024-07-31,,дизель,2.2,5.0
4,Hyundai Solaris,2012,https://auto.drom.ru/moscow/hyundai/solaris/63...,123.0,АКПП,передний,седан,красный,188250.0,левый,1 поколение,1.6 AT Classic,"Продажа Hyundai Solaris, 2012 год в Москве",780000.0,2024-07-30,,бензин,1.6,12.0


In [10]:
categorical_features = [
#     'Двигатель',
#     'Коробка передач',
    'Привод',
    'Цвет',
#     'Руль',
    'Комплектация'
]
non_categorical_features = [
    'Мощность',
    'Пробег',
    'price',
    'Объем',
    'age'
]

In [11]:
encoder = OneHotEncoder(sparse_output=False)
encoded_categorical = encoder.fit_transform(df[categorical_features])

In [12]:
encoded_categorical_df = pd.DataFrame(encoded_categorical, columns=encoder.get_feature_names_out(categorical_features))
data = pd.concat([df[non_categorical_features].reset_index(drop=True), encoded_categorical_df.reset_index(drop=True)], axis=1)
# data = data.drop(categorical_features, axis=1)

In [13]:
data.head()

Unnamed: 0,Мощность,Пробег,price,Объем,age,Привод_4WD,Привод_задний,Привод_передний,Привод_nan,Цвет_бежевый,...,Комплектация_2.9 CRDi AT,Комплектация_3.0 AT,Комплектация_3.0 AT AWD Business,Комплектация_3.0 AT Elegance,Комплектация_3.0 CRDI AT 4WD Luxury,Комплектация_3.5 AT 4WD High-Tech 7 мест,Комплектация_3.5 MPI AT 4WD Luxe 7 мест,Комплектация_5.0 MPI AT 2WD Limousine,Комплектация_5.0 MPI AT 2WD Royal,Комплектация_nan
0,123.0,53000.0,1790000.0,1.6,3.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,200.0,60000.0,4900000.0,2.2,2.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,150.0,88000.0,1379000.0,2.0,10.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,200.0,135000.0,3690000.0,2.2,5.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,123.0,188250.0,780000.0,1.6,12.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression


In [19]:
X = data.drop('price', axis=1)
X['Мощность'] = X['Мощность'].fillna(X['Мощность'].mean())
y = data['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Обучение модели
model = LinearRegression()
model.fit(X_train, y_train)

# Оценка модели
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error: {mae}')

Mean Absolute Error: 115691910.26756974


In [22]:
import statsmodels.api as sm

In [23]:
# Добавление константы для модели
X_train = sm.add_constant(X_train)
X_test = sm.add_constant(X_test)

# Обучение модели
model = sm.OLS(y_train, X_train).fit()

# Оценка модели
y_pred = model.predict(X_test)
mae = np.mean(np.abs(y_test - y_pred))
print(f'Mean Absolute Error: {mae}')

Mean Absolute Error: 264138.22004600905


In [24]:
model.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.868
Model:,OLS,Adj. R-squared:,0.831
Method:,Least Squares,F-statistic:,23.56
Date:,"Fri, 09 Aug 2024",Prob (F-statistic):,0.0
Time:,21:40:40,Log-Likelihood:,-19811.0
No. Observations:,1393,AIC:,40230.0
Df Residuals:,1088,BIC:,41830.0
Df Model:,304,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,8.11e+05,1.1e+05,7.354,0.000,5.95e+05,1.03e+06
Мощность,1.146e+04,1273.099,9.003,0.000,8963.260,1.4e+04
Пробег,-2.2569,0.220,-10.278,0.000,-2.688,-1.826
Объем,1.753e+05,1.01e+05,1.728,0.084,-2.38e+04,3.74e+05
age,-8.99e+04,4222.876,-21.289,0.000,-9.82e+04,-8.16e+04
Привод_4WD,3.238e+05,6.87e+04,4.715,0.000,1.89e+05,4.59e+05
Привод_задний,4.093e+05,1.18e+05,3.469,0.001,1.78e+05,6.41e+05
Привод_передний,-3.988e+04,5.83e+04,-0.684,0.494,-1.54e+05,7.45e+04
Привод_nan,1.178e+05,1.95e+05,0.603,0.546,-2.65e+05,5.01e+05

0,1,2,3
Omnibus:,1324.458,Durbin-Watson:,2.105
Prob(Omnibus):,0.0,Jarque-Bera (JB):,108347.446
Skew:,4.18,Prob(JB):,0.0
Kurtosis:,45.389,Cond. No.,8.94e+22
