In [12]:
import pandas as pd

In [13]:
df = pd.read_csv('bmw_dataset.csv')

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Model                 49999 non-null  object 
 1   Year                  49998 non-null  float64
 2   Region                49998 non-null  object 
 3   Color                 49997 non-null  object 
 4   Fuel_Type             49999 non-null  object 
 5   Transmission          49996 non-null  object 
 6   Engine_Size_L         50000 non-null  float64
 7   Mileage_KM            49997 non-null  float64
 8   Price_USD             49996 non-null  float64
 9   Sales_Volume          49999 non-null  float64
 10  Sales_Classification  50000 non-null  object 
dtypes: float64(5), object(6)
memory usage: 4.2+ MB


In [15]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [16]:
def FillMissingValues(df):
    for col in df.columns:
        if df[col].isnull().any():
            if df[col].dtype == 'object':
                df[col].fillna(df[col].mode()[0], inplace=True)
            else:
                df[col].fillna(df[col].mean(), inplace=True)

In [17]:
def Encoder(df):
    encoder = LabelEncoder()
    for col in df.columns:
        if df[col].dtype == 'object':
            df[col] = encoder.fit_transform(df[col])
    return df

In [18]:
def Scaler(df):
    scaler = MinMaxScaler()
    num_col = df.select_dtypes(include=['float64', 'int64']).columns.drop('Price_USD')
    df[num_col] = scaler.fit_transform(df[num_col])
    return df

In [19]:
FillMissingValues(df)
df = Encoder(df)
df = Scaler(df)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)


In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Model                 50000 non-null  float64
 1   Year                  50000 non-null  float64
 2   Region                50000 non-null  float64
 3   Color                 50000 non-null  float64
 4   Fuel_Type             50000 non-null  float64
 5   Transmission          50000 non-null  float64
 6   Engine_Size_L         50000 non-null  float64
 7   Mileage_KM            50000 non-null  float64
 8   Price_USD             50000 non-null  float64
 9   Sales_Volume          50000 non-null  float64
 10  Sales_Classification  50000 non-null  float64
dtypes: float64(11)
memory usage: 4.2 MB


In [21]:
from sklearn.model_selection import train_test_split

x = df.drop('Price_USD', axis=1)
y = df['Price_USD']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [22]:
from sklearn.metrics import r2_score, mean_absolute_error

# Linear Regression

In [23]:
from sklearn.linear_model import LinearRegression

In [24]:
lr = LinearRegression()

In [25]:
lr.fit(x_train, y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [26]:
y_pred = lr.predict(x_test)

In [27]:
lr_score = r2_score(y_test, y_pred)
lr_mae = mean_absolute_error(y_test, y_pred)

In [28]:
lr_score

-0.0007573373706792896

In [29]:
lr_mae

22570.186772556634

# Decision Tree

In [30]:
from sklearn.tree import DecisionTreeRegressor

In [31]:
dt = DecisionTreeRegressor()

In [32]:
dt.fit(x_train, y_train)

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [33]:
y_pred = dt.predict(x_test)

In [34]:
dt_score = r2_score(y_test, y_pred)
dt_mae = mean_absolute_error(y_test, y_pred)

In [35]:
dt_score

-1.0413424568576648

In [36]:
dt_mae

30373.148229320348

# Random Forest

In [37]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor()

In [38]:
rf.fit(x_train, y_train)

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [39]:
y_pred = rf.predict(x_test)

In [40]:
rf_score = r2_score(y_test, y_pred)
rf_mae = mean_absolute_error(y_test, y_pred)

In [41]:
rf_score

-0.025896502628921292

In [42]:
rf_mae

22736.89594989515

# SVM

In [43]:
from sklearn.svm import SVR

In [44]:
svr = SVR(kernel='linear', C=50.0)

In [45]:
svr.fit(x_train, y_train)

0,1,2
,kernel,'linear'
,degree,3
,gamma,'scale'
,coef0,0.0
,tol,0.001
,C,50.0
,epsilon,0.1
,shrinking,True
,cache_size,200
,verbose,False


In [46]:
y_pred = svr.predict(x_test)

In [47]:
svm_score = r2_score(y_test, y_pred)
svm_mae = mean_absolute_error(y_test, y_pred)

In [48]:
svm_score

-0.0012543287379063184

In [49]:
svm_mae

22572.736143724953

# KNN

In [50]:
from sklearn.neighbors import KNeighborsRegressor

In [51]:
knn = KNeighborsRegressor(n_neighbors=5)

In [52]:
knn.fit(x_train, y_train)

0,1,2
,n_neighbors,5
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [53]:
y_pred = knn.predict(x_test)

In [54]:
knn_score = r2_score(y_test, y_pred)
knn_mae = mean_absolute_error(y_test, y_pred)

In [55]:
knn_score

-0.19366749947026518

In [56]:
knn_mae

24015.58055413593

# Tabulate

In [57]:
from tabulate import tabulate

In [58]:
result = [
    ['Linear Regression', lr_score, lr_mae],
    ['Decision Tree', dt_score, dt_mae],
    ['Random Forest', rf_score, rf_mae],
    ['Support Vector Regressor', svm_score, svm_mae],
    ['K-Nearest Neighbors', knn_score, knn_mae]
]

headers = ['algorithm', 'r2_score', 'mean_absolute_error']

table = tabulate(result, headers=headers, tablefmt='grid', floatfmt='.3f')

In [59]:
print(table)

+--------------------------+------------+-----------------------+
| algorithm                |   r2_score |   mean_absolute_error |
| Linear Regression        |     -0.001 |             22570.187 |
+--------------------------+------------+-----------------------+
| Decision Tree            |     -1.041 |             30373.148 |
+--------------------------+------------+-----------------------+
| Random Forest            |     -0.026 |             22736.896 |
+--------------------------+------------+-----------------------+
| Support Vector Regressor |     -0.001 |             22572.736 |
+--------------------------+------------+-----------------------+
| K-Nearest Neighbors      |     -0.194 |             24015.581 |
+--------------------------+------------+-----------------------+
