In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn import metrics

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVC
from xgboost import XGBClassifier

In [None]:
pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [28]:
df = pd.read_csv('car_dataset.csv', index_col=0, engine='python')

In [29]:
df.head()

Unnamed: 0,brand,model,vin,владельцы,год выпуска,госномер,двигатель,коробка,кузов,налог,...,руль,состояние,таможня,цвет,цена,обмен,гарантия,владение,кузов №,запас хода
0,vaz,granta,xta**************,2 владельца,2012,******|13,"1.6 л / 87 л.с. / бензин, газобаллонное оборуд...",механическая,седан,1 505 ₽ / год,...,левый,не требует ремонта,растаможен,голубой,245 000 ₽,,,,,
1,volkswagen,polo,xw8**************,1 владелец,2020,,1.6 л / 110 л.с. / бензин,автоматическая,лифтбек,2 200 ₽ / год,...,левый,не требует ремонта,растаможен,коричневый,1 050 000 ₽,рассмотрю варианты,,,,
2,kia,ceed,xwe**************,2 владельца,2018,,1.6 л / 130 л.с. / бензин,автоматическая,универсал 5 дв.,4 550 ₽ / год,...,левый,не требует ремонта,растаможен,белый,1 049 000 ₽,рассмотрю варианты,,,,
3,bmw,3er,x4x**************,3 или более,2017,******|22,2.0 л / 190 л.с. / дизель,автоматическая,седан,4 750 ₽ / год,...,левый,не требует ремонта,растаможен,белый,1 890 000 ₽,,,,,
4,ford,mondeo,wf0**************,3 или более,2001,******|96,2.0 л / 145 л.с. / бензин,механическая,седан,1 363 ₽ / год,...,левый,не требует ремонта,растаможен,серебристый,150 000 ₽,,,,,


In [31]:
df['двигатель'] = df['двигатель'].str.replace('\xa0', ' ')

In [34]:
df.columns

Index(['brand', 'model', 'vin', 'owners', 'year_of_release', 'gos_num',
       'engine', 'transmission', 'car_type', 'tax', 'description',
       'drive_type', 'mileage', 'ptc', 'steering_wheel', 'condition',
       'customhouse', 'color', 'price', 'exchange', 'guarantee', 'ownership',
       'body_nomber', 'capacity reserve'],
      dtype='object')

In [35]:
df = df.rename(columns={'владельцы': 'owners', 'год выпуска': 'year_of_release',
                        'двигатель': 'engine', 'коробка': 'transmission',
                        'кузов': 'car_type', 'налог': 'tax', 'привод': 'drive_type',
                        'описание': 'description', 'пробег': 'mileage', 'птс': 'ptc',
                        'руль': 'steering_wheel', 'состояние': 'condition', 
                        'таможня': 'customhouse', 'цвет': 'color', 'цена': 'price',
                        'обмен': 'exchange', 'гарантия': 'guarantee', 'владение': 'ownership',
                        'кузов №': 'body_type_number', 'запас хода': 'capacity reserve',
                        'госномер': 'gos_num', 'кузов №': 'body_nomber'})

In [36]:
columns_to_remove = ['gos_num','exchange', 'guarantee', 'ownership', 
                     'body_type_number', 'capacity reserve', 'vin', 
                     'engine', 'model', 'condition', 'customhouse', 
                     'body_nomber']

In [39]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2859 entries, 0 to 2836
Data columns (total 24 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   brand             2838 non-null   object
 1   model             2838 non-null   object
 2   vin               2742 non-null   object
 3   owners            2838 non-null   object
 4   year_of_release   2838 non-null   object
 5   gos_num           2188 non-null   object
 6   engine            2838 non-null   object
 7   transmission      2838 non-null   object
 8   car_type          2838 non-null   object
 9   tax               2793 non-null   object
 10  description       2838 non-null   object
 11  drive_type        2836 non-null   object
 12  mileage           2836 non-null   object
 13  ptc               2836 non-null   object
 14  steering_wheel    2836 non-null   object
 15  condition         2836 non-null   object
 16  customhouse       2836 non-null   object
 17  color             2

In [42]:
def split_engine_be_features(engine_str):
    print(engine_str)
    engine_split = engine_str.split('/')
    
    engine_volume = engine_split[0].strip().split(' ')[0]
    engine_power = engine_split[1].strip().split(' ')[0]
    engine_type = engine_split[2].strip()
    
    return engine_volume, engine_power, engine_type

In [45]:
df[df['engine'] == None]

Unnamed: 0,brand,model,vin,owners,year_of_release,gos_num,engine,transmission,car_type,tax,...,steering_wheel,condition,customhouse,color,price,exchange,guarantee,ownership,body_nomber,capacity reserve


In [43]:
df['engine_volume'], df['engine_power'], df['engine_type'] = zip(*df['engine'].map(split_engine_be_features))

1.6 л / 87 л.с. / бензин, газобаллонное оборудование
1.6 л / 110 л.с. / бензин
1.6 л / 130 л.с. / бензин
2.0 л / 190 л.с. / дизель
2.0 л / 145 л.с. / бензин
4.0 л / 550 л.с. / бензин
5.7 л / 367 л.с. / бензин
1.6 л / 98 л.с. / бензин
2.0 л / 184 л.с. / бензин
1.6 л / 81 л.с. / бензин
3.0 л / 275 л.с. / дизель
2.0 л / 184 л.с. / бензин
2.0 л / 280 л.с. / бензин
2.0 л / 225 л.с. / дизель
2.0 л / 149 л.с. / бензин
2.0 л / 180 л.с. / бензин
1.5 л / 79 л.с. / бензин
2.5 л / 181 л.с. / бензин
2.7 л / 190 л.с. / дизель
4.7 л / 455 л.с. / бензин
2.0 л / 197 л.с. / бензин
2.5 л / 99 л.с. / дизель
1.8 л / 125 л.с. / бензин
0.8 л / 52 л.с. / бензин
1.3 л / 101 л.с. / бензин
3.0 л / 197 л.с. / бензин
3.5 л / 262 л.с. / бензин
1.6 л / 106 л.с. / бензин
1.6 л / 110 л.с. / бензин
1.4 л / 150 л.с. / бензин
1.4 л / 90 л.с. / бензин
0.8 л / 52 л.с. / бензин
1.5 л / 102 л.с. / бензин
2.0 л / 150 л.с. / бензин
1.6 л / 115 л.с. / дизель
2.0 л / 150 л.с. / бензин
6.8 л / 405 л.с. / бензин
2.0 л / 112 л.с. /

AttributeError: 'NoneType' object has no attribute 'split'

In [None]:
df = df.drop(columns=columns_to_remove)

In [None]:
df = df.dropna(how='any')

In [None]:
len(df), df.nunique()

In [None]:
df = df[~(df['engine_volume'].astype('float32') > 10.0)]

In [None]:
df = df.loc[~(df['car_type'].isin(df['car_type'].value_counts().loc[df['car_type'].value_counts() < 10].index))]

In [None]:
df['tax'].value_counts()

In [None]:
df['tax'] = df['tax'].apply(lambda x: x.split('/')[0].replace('₽', '').replace('\xa0', '')).astype('int32')

In [None]:
sns.displot(x='tax', data=df)

In [None]:
df.nunique()

In [None]:
df['mileage'].value_counts()

In [None]:
df['mileage'] = df['mileage'].apply(lambda x: x.replace('\xa0', '').replace('км', '')).astype('int32')

In [None]:
df = df.loc[~(df['color'].isin(df['color'].value_counts().loc[df['color'].value_counts() < 10].index))]

In [None]:
df['color'].value_counts()

In [None]:
df['price'] = df['price'].apply(lambda x: x.replace('\xa0', '').replace('₽', '')).astype('int32')

In [None]:
df.head()

In [None]:
df.nunique()

In [None]:
categorical_features = ['brand', 'owners', 'transmission', 'car_type', 'drive_type', 'ptc', 'steering_wheel', 'color', 'engine_type']

In [None]:
df = pd.get_dummies(data=df, columns=categorical_features)

In [None]:
iqr_1_5 = 1.5 * (df['price'].quantile(0.75) - df['price'].quantile(0.25))
above_iqr_1_5 = df['price'].median() + iqr_1_5

In [None]:
df = df.loc[df['price'] < above_iqr_1_5]

In [None]:
df['price'].describe()

### Split data

In [None]:
nlp_data = df[['description']]

In [None]:
df = df.drop(columns=['description'])

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['price']),
                                                    df['price'], 
                                                    test_size=0.25)

In [None]:
X_train.shape, X_test.shape

### Scaling

In [None]:
scaler_x = MinMaxScaler()
X_train[X_train.columns] = scaler_x.fit_transform(X_train)
X_test[X_test.columns] = scaler_x.transform(X_test)

## Modeling

### BaseLine

In [None]:
def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

def dataframe_metrics(y_test,y_pred):
    stats = [
       metrics.mean_absolute_error(y_test, y_pred),
       np.sqrt(metrics.mean_squared_error(y_test, y_pred)),
       metrics.r2_score(y_test, y_pred),
       mean_absolute_percentage_error(y_test, y_pred)
    ]
    return stats

In [None]:
measured_metrics = pd.DataFrame({"error_type":["MAE", "RMSE", "R2", "MAPE"]})
measured_metrics.set_index("error_type")

In [None]:
median_train = y_train.median()
baseline = np.array([median_train] * len(y_test))
measured_metrics['baseline'] = dataframe_metrics(y_test, baseline)
measured_metrics

### Linear Reg

In [None]:
lin_reg = LinearRegression()

lin_reg.fit(X_train, y_train)

In [None]:
featureImportance = pd.DataFrame({"feature": X_train.columns, 
                                  "importance": lin_reg.coef_})

featureImportance.set_index('feature', inplace=True)
featureImportance.sort_values(["importance"], ascending=False, inplace=True)
featureImportance["importance"].plot(kind='bar', figsize=(20, 6));

In [None]:
predictions = lin_reg.predict(X_test)

In [None]:
measured_metrics["lin_reg"] = dataframe_metrics(y_test, predictions)
measured_metrics

In [None]:
df[['price']].describe()

## RidgeCV

In [None]:
ridge_cv = RidgeCV()

ridge_cv.fit(X_train, y_train)

In [None]:
featureImportance = pd.DataFrame({"feature": X_train.columns, 
                                  "importance": ridge_cv.coef_})

featureImportance.set_index('feature', inplace=True)
featureImportance.sort_values(["importance"], ascending=False, inplace=True)
featureImportance["importance"].plot(kind='bar', figsize=(18, 6));

In [None]:
predictions = ridge_cv.predict(X_test)
measured_metrics["ridge_cv"] = dataframe_metrics(y_test, predictions)
measured_metrics

lasso_cv = LassoCV()

lasso_cv.fit(X_train_scaled, y_train)## Lasso

In [None]:
lasso_cv = LassoCV()

lasso_cv.fit(X_train, y_train)

In [None]:
featureImportance = pd.DataFrame({"feature": X_train.columns[lasso_cv.coef_!=0], 
                                  "importance": lasso_cv.coef_[lasso_cv.coef_!=0]})

featureImportance.set_index('feature', inplace=True)
featureImportance.sort_values(["importance"], ascending=False, inplace=True)
featureImportance["importance"].plot(kind='bar', figsize=(18, 6));

measured_metrics["lasso_cv"] = dataframe_metrics(y_test, lasso_cv.predict(X_test))
measured_metrics

### Random Forest

In [None]:
rf_reg = RandomForestRegressor(n_jobs=-1)
rf_reg.fit(X_train, y_train)

measured_metrics["rf_reg"] = dataframe_metrics(y_test, rf_reg.predict(X_test))
measured_metrics