In [175]:
import os

import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.tree import DecisionTreeRegressor

# Let's make one combined CSV with two car makes
The plan is to binary-encode the model and one-hot encode the brand.
The information the future scraper will have is:
* mileage (odometer reading, "km stand")
* transmission
* build year
* power (kW)
* energy source (petrol/diesel/electric)

In [176]:
# Read each brand's listings, tag every row with its brand, and stack the
# frames into a single one with a fresh 0..n-1 index.
csv_by_brand = {"bmw": "Data/bmw.csv", "vw": "Data/vw.csv"}
df = pd.concat(
    [pd.read_csv(path).assign(brand=brand) for brand, path in csv_by_brand.items()],
    ignore_index=True,
)
del csv_by_brand  # only the combined frame is needed from here on
df

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,brand
0,5 Series,2014,11200,Automatic,67068,Diesel,125,57.6,2.0,bmw
1,6 Series,2018,27000,Automatic,14827,Petrol,145,42.8,2.0,bmw
2,5 Series,2016,16000,Automatic,62794,Diesel,160,51.4,3.0,bmw
3,1 Series,2017,12750,Automatic,26676,Diesel,145,72.4,1.5,bmw
4,7 Series,2014,14500,Automatic,39554,Diesel,160,50.4,3.0,bmw
...,...,...,...,...,...,...,...,...,...,...
25933,Eos,2012,5990,Manual,74000,Diesel,125,58.9,2.0,vw
25934,Fox,2008,1799,Manual,88102,Petrol,145,46.3,1.2,vw
25935,Fox,2009,1590,Manual,70000,Petrol,200,42.0,1.4,vw
25936,Fox,2006,1250,Manual,82704,Petrol,150,46.3,1.2,vw


# Prepare data for training

In [177]:
def prepare_data_label(data: pd.DataFrame, cols: list) -> pd.DataFrame:
    """Label-encode the given columns and return the encoded frame.

    Every column named in ``cols`` is replaced by the codes that a
    ``LabelEncoder`` fitted on that column alone produces.

    Args:
        data: Frame holding the columns to encode. It is left unmodified.
        cols: Names of the columns to encode.

    Returns:
        A copy of ``data`` in which each column in ``cols`` holds its codes.
    """
    # Work on a copy: assigning into ``data`` directly would silently rewrite
    # the caller's frame, so re-running the calling cell would encode codes.
    encoded = data.copy()
    for col in cols:
        # A fresh encoder per column keeps every fit independent of the others.
        encoded[col] = LabelEncoder().fit_transform(encoded[col])
    return encoded

def prepare_data_onehot(data: pd.DataFrame, cols: list) -> pd.DataFrame:
    """One-hot encode the given columns and return the widened frame.

    Each column named in ``cols`` is dropped and replaced by one integer
    indicator column per category, appended at the end of the frame and named
    by ``OneHotEncoder.get_feature_names_out()``.

    Args:
        data: Frame holding the columns to encode. It is left unmodified.
        cols: Names of the columns to encode, processed in order.

    Returns:
        A new frame with the original columns minus ``cols`` plus the
        indicator columns, carrying the same index as ``data``.
    """
    for col in cols:
        enc = OneHotEncoder(sparse_output=False)
        indicators = enc.fit_transform(data[[col]])
        indicator_df = pd.DataFrame(
            indicators,
            columns=enc.get_feature_names_out(),
            # concat(axis=1) aligns on the index; without reusing the input's
            # index a filtered or shuffled frame would be joined row-by-label
            # against 0..n-1 and come back full of NaNs.
            index=data.index,
        ).astype(int)
        data = pd.concat([data.drop(columns=col), indicator_df], axis=1)
    return data

# One-hot encode every categorical column. Label-encoding "model" with
# prepare_data_label(df, ["model"]) is the alternative that was tried here.
categorical_cols = ["fuelType", "brand", "transmission", "model"]
df = prepare_data_onehot(df, categorical_cols)
df

Unnamed: 0,year,price,mileage,tax,mpg,engineSize,fuelType_Diesel,fuelType_Electric,fuelType_Hybrid,fuelType_Other,...,model_ X2,model_ X3,model_ X4,model_ X5,model_ X6,model_ X7,model_ Z3,model_ Z4,model_ i3,model_ i8
0,2014,11200,67068,125,57.6,2.0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2018,27000,14827,145,42.8,2.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2016,16000,62794,160,51.4,3.0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2017,12750,26676,145,72.4,1.5,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2014,14500,39554,160,50.4,3.0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25933,2012,5990,74000,125,58.9,2.0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
25934,2008,1799,88102,145,46.3,1.2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
25935,2009,1590,70000,200,42.0,1.4,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
25936,2006,1250,82704,150,46.3,1.2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [178]:
# Separate the target ("price") from the features and hold out 20 % of the
# rows for evaluation.
X = df.copy()
y = X.pop("price")
# Fixed seed so the split, and every metric computed from it, is identical
# after Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [179]:
# Fit a 500-tree random forest on the training split and predict the price of
# every held-out row.
regressor = RandomForestRegressor(
    n_estimators=500,
    random_state=42,  # fixed seed so the fitted forest is reproducible
    n_jobs=-1,  # trees are built independently, so use every available core
)
regressor.fit(X_train, y_train)
yhat = regressor.predict(X_test)

In [180]:
# Mean absolute error of the predictions on the held-out rows.
MAE(y_test, yhat)

1257.1101841992445

In [181]:
# Features of two held-out rows (positions 2 and 3) used as a spot check.
X_test.iloc[2:4]

Unnamed: 0,year,mileage,tax,mpg,engineSize,fuelType_Diesel,fuelType_Electric,fuelType_Hybrid,fuelType_Other,fuelType_Petrol,...,model_ X2,model_ X3,model_ X4,model_ X5,model_ X6,model_ X7,model_ Z3,model_ Z4,model_ i3,model_ i8
8809,2017,42069,20,68.9,2.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13128,2017,17880,20,58.9,1.6,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [182]:
# Actual prices of the same two rows; .iloc makes the positional slice explicit.
y_test.iloc[2:4]

8809     12988
13128    13882
Name: price, dtype: int64

In [183]:
# Predicted prices for those two rows, to compare against the actual prices.
regressor.predict(X_test.iloc[2:4])

array([12752.20133333, 13999.61      ])

In [1]:
import psycopg

In [9]:
# The connection string is read from the environment so that credentials are
# never saved with the notebook. Set it before starting the kernel, e.g.
#   CARS_DB_DSN="postgresql://<user>:<password>@<host>:5432/<dbname>"
# NOTE(review): the query needs a "cars" table in the target database; the
# recorded run failed with UndefinedTable, so create/populate it first.
with psycopg.connect(os.environ["CARS_DB_DSN"]) as conn:
    with conn.cursor() as cur:
        cur.execute('SELECT * FROM cars')
        rows = cur.fetchall()

# All rows are already in memory, so print them after the connection is released.
for record in rows:
    print(record)

UndefinedTable: relation "cars" does not exist
LINE 1: SELECT * FROM cars
                      ^