In [2]:
#importing libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [3]:
#Loading the dataset
data = pd.read_csv('data.csv')

In [4]:
#Display information about our data
print(data.head())  #Shows the first few rows of the data for an overview
print(data.info())  #shows information about the data including types and missing values

   Unnamed: 0       brand              model  color registration_date  year  \
0           0  alfa-romeo     Alfa Romeo GTV    red           10/1995  1995   
1           1  alfa-romeo     Alfa Romeo 164  black           02/1995  1995   
2           2  alfa-romeo  Alfa Romeo Spider  black           02/1995  1995   
3           3  alfa-romeo  Alfa Romeo Spider  black           07/1995  1995   
4           4  alfa-romeo     Alfa Romeo 164    red           11/1996  1996   

  price_in_euro power_kw power_ps transmission_type fuel_type  \
0          1300      148      201            Manual    Petrol   
1         24900      191      260            Manual    Petrol   
2          5900      110      150           Unknown    Petrol   
3          4900      110      150            Manual    Petrol   
4         17950      132      179            Manual    Petrol   

  fuel_consumption_l_100km fuel_consumption_g_km  mileage_in_km  \
0            10,9 l/100 km              260 g/km       160500.0   


In [5]:
#showing the year column before cleaning
unique_years = data['year'].unique()
print("Unique values in 'year':", unique_years)

Unique values in 'year': ['1995' '1996' '1997' '1998' '1999' '2000' '2001' '2002' '2003' '2004'
 '2005' '2006' '2007' '2008' '2009' '2010' '2011' '2012' '2013' '2014'
 '2015' '2016' '2017' '2018' '2019' '2020' '2021' '2022' '2023' 'Petrol'
 'Automatic' 'Manual' 'Diesel' '04/2017']


In [6]:
#Cleaning the 'year' column
def clean_year(year):
    try:
        year = int(year)
        if 1995 <= year <= 2023:
            return year
    except ValueError:
        return np.nan
    return np.nan

In [7]:
#cleaning the 'year' column using the cleaning function
data['year'] = data['year'].apply(clean_year)
data = data.dropna(subset=['year']) #Dropping rows where 'year' is NaN after cleaning
data['year'] = data['year'].astype(int) #Converting 'year' to integer type

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['year'] = data['year'].astype(int) #Converting 'year' to integer type


In [8]:
unique_years = data['year'].unique()
print("Unique values in 'year':", unique_years) #showing the year column after cleaning

Unique values in 'year': [1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008
 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022
 2023]


In [9]:
#convert other numerical columns to numeric datatype
def to_numeric(df, column):
    df[column] = pd.to_numeric(df[column], errors='coerce')

numeric_columns = ['price_in_euro', 'power_kw', 'power_ps', 'fuel_consumption_l_100km', 'mileage_in_km']
for col in numeric_columns:
    to_numeric(data, col)

In [10]:
#Handling missing values and scaling
#Defining numeric and categorical features
num_features = ['year', 'price_in_euro', 'power_kw', 'fuel_consumption_l_100km', 'mileage_in_km', 'power_ps']#numerical columns
cat_features = ['brand', 'model', 'color', 'transmission_type', 'fuel_type']#categprical columns

In [11]:
#Creating pipelines for numeric and categorical preprocessing

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),  #Handles missing values in numeric columns by replacing with median(imputation)
    ('scaler', StandardScaler())  #Scaling numeric features to have mean=0 and std=1
])

cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),  #Handles missing values in categorical columns by replacing with most frequent value(mode)
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  #One-hot encode categorical features, ignore unknown categories
])

# Combine both pipelines into a single preprocessor
preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_features),
    ('cat', cat_pipeline, cat_features)
])

In [12]:
#Handling outliers by replacing them with median values
for col in numeric_columns:
    col_zscore = (data[col] - data[col].mean()) / data[col].std()  #Calculate Z-scores
    outliers = data[np.abs(col_zscore) > 3]  #Identify outliers where Z-score > 3(threshold is 3)
    data.loc[outliers.index, col] = data[col].median()  #Replacing outliers with median value of the column

#Applying the preprocessing(transofrmations) pipelines to the data
data_preprocessed = preprocessor.fit_transform(data)

  return np.nanmean(a, axis, out=out, keepdims=keepdims)


In [13]:
#Splitting the data into training and testing sets
X = data_preprocessed  #Features
y = data['price_in_euro']  #Target(price)

#Splitting the data into training and testing sets (80% for training , 20% for testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)