In [1]:
# Импорт необходимых библиотек
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [2]:
# Загрузка данных
data = pd.read_csv('car_price_prediction.csv')

In [9]:
# Удаление строк с символом "-"
data = data[~data.applymap(lambda x: isinstance(x, str) and '-' in x).any(axis=1)]

  data = data[~data.applymap(lambda x: isinstance(x, str) and '-' in x).any(axis=1)]


In [10]:
# Разделение на признаки и целевую переменную
X = data.drop('Price', axis=1)
y = data['Price']

In [11]:
# Разделение на обучающий и тестовый наборы данных
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
# Обработка пропущенных значений
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])


In [15]:
# Преобразование категориальных признаков в строковый тип данных
X_train[['Manufacturer', 'Fuel type']] = X_train[['Manufacturer', 'Fuel type']].astype(str)
X_test[['Manufacturer', 'Fuel type']] = X_test[['Manufacturer', 'Fuel type']].astype(str)

# Обновление предобработчика с использованием OneHotEncoder для категориальных признаков
from sklearn.preprocessing import OneHotEncoder

categorical_features = ['Manufacturer', 'Fuel type']
numeric_features = [col for col in X.columns if col not in categorical_features]

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])


In [16]:
# Обучение и оценка моделей
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(),
    'Support Vector Machine': SVR()
}

for name, model in models.items():
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f'{name}:')
    print(f'Mean Absolute Error: {mae}')
    print(f'Mean Squared Error: {mse}')
    print(f'R^2 Score: {r2}')
    print('------------------------')

ValueError: Cannot use mean strategy with non-numeric data:
could not convert string to float: '520 d xDrive Luxury'