In [2]:
from tqdm import tqdm
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
import random
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [3]:
# Relative locations of the CSV files holding the training and validation splits.
PATH_TRAIN = 'datasets/train.csv'  # data the model is fitted on
PATH_VALID = 'datasets/valid.csv'  # data the model is scored on

In [4]:
# Read both splits from disk.
data_train = pd.read_csv(PATH_TRAIN)
data_valid = pd.read_csv(PATH_VALID)

# The 'Pret' column is the target variable (y); all remaining columns are features (X).
X_train, y_train = data_train.drop(columns='Pret'), data_train['Pret']
X_valid, y_valid = data_valid.drop(columns='Pret'), data_valid['Pret']

# Quick look at the first rows of the training features.
print(X_train.head())

                     Versiune  Vechime      Km  Putere  Capacitate cilindrica  \
0          D4 AWD Inscription      6.0  124000     190                   1969   
1      109 CDI Extralung Mixt      5.0   36000      90                   1461   
2  1.6 BlueHDi FAP STT Active      8.0  219000     120                   1560   
3      2.2 CRDi 2WD Signature      4.0   51500     202                   2151   
4                  1.8 TSI FR     11.0  228300     160                   1798   

           Marca     Model Combustibil     Transmisie Norma de poluare  \
0          Volvo     XC 60      Diesel  4x4 (automat)           Euro 6   
1  Mercedes-Benz     Citan      Diesel           Fata           Euro 6   
2        Peugeot       308      Diesel           Fata           Euro 6   
3        Hyundai  Santa Fe      Diesel           Fata           Euro 6   
4           Seat      Leon     Benzina           Fata           Euro 5   

  Cutie de viteze Tip Caroserie Optiuni culoare Tara de origine     

In [6]:
# Columns handled by the numeric preprocessing branch.
numeric_features = [
    "Km",
    "Vechime",
    "Capacitate cilindrica",
    "Putere",
]

# Columns handled by the categorical preprocessing branch.
categorical_features = [
    "Marca",
    "Model",
    "Versiune",
    "Combustibil",
    "Transmisie",
    "Norma de poluare",
    "Tip Caroserie",
    "Cutie de viteze",
    "Stare",
    "Tara de origine",
    "Optiuni culoare",
]

In [7]:
# Numeric branch: replace missing values with the column mean, then standardize.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
])

# Categorical branch: replace missing values with the most frequent category,
# then one-hot encode. handle_unknown="ignore" keeps transform from failing on
# categories that were not present when the encoder was fitted.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group through its own branch.
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])

In [9]:
# Final estimator of the pipeline.
regressor = LinearRegression()

# Bundle preprocessing and the estimator so both are fitted in a single call
# and the same transformations are applied again at prediction time.
model = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", regressor),
])

# Left as the cell's last expression so the fitted pipeline is displayed.
model.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['Km', 'Vechime',
                                                   'Capacitate cilindrica',
                                                   'Putere']),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehot',
                                                      

In [11]:
# Score the fitted pipeline on the validation split.
predictions = model.predict(X_valid)

# Mean absolute error between the true targets and the predictions.
mae = mean_absolute_error(y_valid, predictions)
print("The MAE score of the model on validation data is: " + str(mae))

# RMSE is the square root of the MSE. The `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6
# (where it raises TypeError), so the root is taken explicitly; this form
# works on every scikit-learn version and needs no extra import.
rmse = mean_squared_error(y_valid, predictions) ** 0.5
print("The RMSE score of the model on validation data is: " + str(rmse))

The MAE score of the model on validation data is: 3446.103228750248
The RMSE score of the model on validation data is: 8549.60924256812
