# Import and cleaning

In [None]:
!pip install fastparquet



In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# file_path = 'https://static.home.stackken.com/Data/rate_of_sale_may_2023.snappy.parquet'
# df = pd.read_parquet(file_path)

In [None]:
# pd.read_csv('https://static.home.stackken.com/output.csv')

Unnamed: 0.1,Unnamed: 0,stock_item_id,last_date_seen,first_date_seen,days_to_sell,first_retailer_asking_price,last_retailer_asking_price,can_home_deliver,reviews_per_100_advertised_stock_last_12_months,segment,...,odometer_reading_miles,first_registration_date,attention_grabber,manufacturer_approved,price_indicator_rating,adjusted_retail_amount_gbp,predicted_mileage,number_of_images,advert_quality,postcode_area
0,0,52ae009b671ab58b3d4ff109a9fbdcf8d847de0fa190e1...,2023-05-05,2021-03-25,771,6995,6495.0,False,3.9,Independent,...,65000,2004-05-07,*IMMACULATE**FULL HISTORY*,False,NOANALYSIS,,,50,,AL
1,1,32b1bac6934b1f64ff43cffa9df5aa296ead8143c36f9f...,2023-05-09,2021-05-25,714,13725,14995.0,False,,Franchise,...,16018,2019-11-30,Sports Styling | Great Economy,True,GOOD,14848.0,26078.0,15,57.0,HP
2,2,21703d22d87eaa95c4dc81a60ba2c8cbe3b90ab659292c...,2023-05-12,2021-11-26,532,15499,13999.0,False,0.2,Independent,...,31093,2018-03-08,"Sat Nav,Leather,Auto,Euro 6",False,GREAT,14571.0,34732.0,22,61.0,SR
3,3,661acafc271373946cea7d30ac7f34257404ab89a1ad33...,2023-05-16,2022-02-17,453,10995,9995.0,False,7.9,Franchise,...,79000,2015-07-02,Viewing by APPOINTMENT ONLY,False,FAIR,9349.0,65684.0,30,61.0,FY
4,4,638216dc92410d965b416fea5b3cec9ca903368795fdde...,2023-05-04,2022-03-21,409,46000,37500.0,False,6.8,Franchise,...,10214,2022-03-03,Reserve Online,True,GOOD,37055.0,11765.0,22,48.0,LE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
224550,224550,f93da2e68ed75b921e3edbb5800155851725db45dac4e2...,2023-05-26,2023-05-24,2,11045,11045.0,True,17.3,Independent,...,59000,2016-03-13,2 OWNERS LOW MILEAGE SATNAV,False,GOOD,11230.0,52196.0,9,63.0,BS
224551,224551,558367b9f89d83eee684559ab31f06d46332d119a008fd...,2023-05-28,2023-05-24,4,9000,9000.0,False,8.3,Franchise,...,37717,2014-11-25,Competitive finance available.,False,GOOD,8898.0,49703.0,19,31.0,PA
224552,224552,deb732e58db907f571ab161110a80506a4bb2ba53248a9...,2023-05-27,2023-05-25,2,11300,11300.0,False,1.7,Franchise,...,51488,2019-11-30,GREAT MPG,False,GREAT,11646.0,28134.0,4,30.0,DT
224553,224553,1ca206c9b6b1eb56363c1d6e3bbc9d283dc4a4df622a32...,2023-05-27,2023-05-25,2,4699,4699.0,False,8.7,Independent,...,38408,2011-09-13,,False,GOOD,4743.0,53249.0,30,63.0,PR


# Linear Regression

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

df = pd.read_csv('https://static.home.stackken.com/output.csv')

numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = df.select_dtypes(include=['object', 'bool']).columns.tolist()

numerical_cols.remove('days_to_sell')

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols)
])

model = LinearRegression()

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

X = df.drop('days_to_sell', axis=1)
y = df['days_to_sell']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')


Mean Absolute Error: 2.482060203440694
Mean Squared Error: 248.82923860341583
R^2 Score: 0.9216395567500285


# Random Forest Regressor

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor

df = pd.read_csv('https://static.home.stackken.com/output.csv').sample(frac=0.05)

numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = df.select_dtypes(include=['object', 'bool']).columns.tolist()

numerical_cols.remove('days_to_sell')

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols)
])

model = RandomForestRegressor(n_estimators=20)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

X = df.drop('days_to_sell', axis=1)
y = df['days_to_sell']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')


# Extreme Gradient Boost regression

In [None]:
!pip install xgboost



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error


data = pd.read_csv('https://static.home.stackken.com/output.csv')


X = data.drop('days_to_sell', axis=1)
y = data['days_to_sell']

label_encoders = {}
for column in X.select_dtypes(include=['object']).columns:
    label_encoders[column] = LabelEncoder()
    X[column] = label_encoders[column].fit_transform(X[column])

X = X.fillna(X.mean())

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)


xgb_model = XGBRegressor(objective='reg:squarederror', n_estimators=100)
xgb_model.fit(X_train, y_train)


y_pred = xgb_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')



Mean Absolute Error: 0.9626905027568647
Mean Squared Error: 112.0316272185063
R^2 Score: 0.9659133942400759


# Keras Sequential

In [None]:
!pip install tensorflow



In [None]:
!pip install keras



In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from tensorflow.keras import layers, models

data = pd.read_csv('https://static.home.stackken.com/output.csv')

X = data.drop('days_to_sell', axis=1)
y = data['days_to_sell']

numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(exclude=['int64', 'float64']).columns

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse=False))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols)
])

X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.3)

X_train_full_preprocessed = preprocessor.fit_transform(X_train_full)
X_test_preprocessed = preprocessor.transform(X_test)

X_train, X_val, y_train, y_val = train_test_split(X_train_full_preprocessed, y_train_full, test_size=0.3)

model = models.Sequential([
    layers.Dense(64, activation='relu', input_shape=[X_train.shape[1]]),
    layers.Dense(32, activation='relu'),
    layers.Dense(1)
])

model.compile(optimizer='adam', loss='mean_squared_error')

model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=100, batch_size=32)

y_pred = model.predict(X_test_preprocessed)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')


