In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, ShuffleSplit
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_absolute_percentage_error, make_scorer, root_mean_squared_error, mean_absolute_error
import xgboost as xgb
import pickle
import datetime

# Experimenting

In [139]:
df = pd.read_csv("car_data.csv")
df = df.drop("guid", axis=1)
X = df.copy()
X = pd.get_dummies(X, columns=["make", "model", "fuel_type", "transmission"])
y = X.pop("price")
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True)

In [140]:
scaler = StandardScaler()
X_train[["age", "mileage"]] = scaler.fit_transform(X_train[["age", "mileage"]])
X_test[["age", "mileage"]] = scaler.transform(X_test[["age", "mileage"]])

## Random forrest

In [224]:
rfc = RandomForestRegressor(n_estimators=100)
rfc.fit(X_train, y_train)

In [None]:
rf = RandomForestRegressor()
import numpy as np
param_grid = {
    'n_estimators': range(500,1500),  # Number of trees in the forest
    'max_depth': list(range(1, 100)) + [None],  # Maximum depth of the tree
    'min_samples_split': range(1,11),  # Minimum number of samples required to split a node
    'min_samples_leaf': range(1,5),    # Minimum number of samples required at each leaf node
    'max_features': ['auto', 'sqrt', 'log2'],  # Number of features to consider at every split
    'bootstrap': [True, False]        # Whether bootstrap samples are used
    # 'max_samples': [None, 0.8, 0.9]    # If bootstrap=True, number of samples to draw
}
scoring = make_scorer(mean_absolute_percentage_error, greater_is_better=False)
grid_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_grid,
    scoring=scoring,
    n_iter = 100,
    cv=3,               # 5-fold cross-validation
    n_jobs=-1,          # Use all available cores
    verbose=3,          # Print progress
    refit=True          # Refit the best model on the entire dataset
)
grid_search.fit(X_train, y_train)

In [281]:
grid_search.best_params_

{'n_estimators': 1037,
 'min_samples_split': 6,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 97,
 'bootstrap': False}

In [282]:
yhat = grid_search.predict(X_test)

In [283]:
mean_absolute_percentage_error(y_test, yhat)

0.3457807085109395

In [289]:
y_test.head()

4207     64950
8273     11750
6189     12445
5923     17445
627     149950
Name: price, dtype: int64

In [288]:
yhat[:5]

array([71764.86176788, 11066.08733077, 11397.23118534, 18222.71658781,
       98008.03230473])

## XGB

In [150]:
df = pd.read_csv("car_data.csv")
df = df.drop("guid", axis=1)
X = df.copy()
y = X.pop("price")
# X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True)

In [151]:
scaler = StandardScaler()
# X_train[["age", "mileage"]] = scaler.fit_transform(X_train[["age", "mileage"]])
# X_test[["age", "mileage"]] = scaler.transform(X_test[["age", "mileage"]])
X_numeric = X[["age", "mileage"]]
X_numeric[["age", "mileage"]] = scaler.fit_transform(X_numeric)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_numeric[["age", "mileage"]] = scaler.fit_transform(X_numeric)


In [154]:
enc = OneHotEncoder(sparse_output=False)
X_cat = X[["make", "model", "fuel_type", "transmission"]]
X_cat = pd.DataFrame(enc.fit_transform(X_cat)).astype(bool)

In [None]:
X_combined = X_numeric.join(X_cat)
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, shuffle=True, random_state=1)

In [141]:
import xgboost as xgb
reg = xgb.XGBRegressor(n_estimators=5000, max_depth=7, eta=0.1)
reg.fit(X_train, y_train)

In [142]:
yhat2 = reg.predict(X_test)

In [145]:
from sklearn.metrics import mean_absolute_error
mean_absolute_percentage_error(y_test, yhat2)

0.27939990162849426

In [158]:
# averaging with a loop
results = []
for _ in range(10):
    df = pd.read_csv("car_data.csv")
    df = df.drop("guid", axis=1)
    X = df.copy()
    X = pd.get_dummies(X, columns=["make", "model", "fuel_type", "transmission"])
    y = X.pop("price")
    X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True)
    scaler = StandardScaler()
    X_train[["age", "mileage"]] = scaler.fit_transform(X_train[["age", "mileage"]])
    X_test[["age", "mileage"]] = scaler.transform(X_test[["age", "mileage"]])
    reg = xgb.XGBRegressor(n_estimators=5000, max_depth=7, eta=0.1)
    reg.fit(X_train, y_train)
    yhat2 = reg.predict(X_test)
    results.append(root_mean_squared_error(y_test,yhat2))
print(np.mean(results))

18225.65927734375


In [None]:
# averaging with a loop
results = []
for _ in range(10):
    df = pd.read_csv("car_data.csv")
    df = df.drop("guid", axis=1)
    X = df.copy()
    y = X.pop("price")
    scaler = StandardScaler()
    X_numeric = X[["age", "mileage"]]
    X_numeric[["age", "mileage"]] = scaler.fit_transform(X_numeric)
    enc = OneHotEncoder(sparse_output=False)
    X_cat = X[["make", "model", "fuel_type", "transmission"]]
    X_cat = pd.DataFrame(enc.fit_transform(X_cat)).astype(bool)
    X_combined = X_numeric.join(X_cat)
    X_train, X_test, y_train, y_test = train_test_split(X_combined, y, shuffle=True, random_state=1)
    reg = xgb.XGBRegressor(n_estimators=5000, max_depth=7, eta=0.1)
    reg.fit(X_train, y_train)
    yhat2 = reg.predict(X_test)
    results.append(root_mean_squared_error(y_test,yhat2))
print(np.mean(results))

## GBR

In [166]:
xnum = X[["age", "mileage"]].head()
xcat = X[["make", "model", "fuel_type", "transmission"]].head()
xcat = pd.DataFrame(enc.transform(xcat)).astype(bool)
xnum[["age", "mileage"]] = scaler.transform(xnum)
xcomb = xnum.join(xcat)
reg.predict(xcomb)

array([ 2678.5576,  6005.7197,  3147.0212, 16688.838 ,   909.983 ],
      dtype=float32)

In [170]:
y.head()

0     2495
1     4950
2     2888
3    16495
4      490
Name: price, dtype: int64

In [113]:
from sklearn.ensemble import GradientBoostingRegressor
reg2 = GradientBoostingRegressor(n_estimators=500, max_depth=7, learning_rate=0.1)
reg2.fit(X_train, y_train)

In [None]:
yhat2 = reg2.predict(X_test)
mean_absolute_percentage_error(y_test, yhat2)

3.7626030445098877

# Pipeline making

## misc

In [None]:
# averaging with a loop
results = []
params = {'alpha': 0,
 'lambda': 1,
 'learning_rate': 0.1,
 'max_depth': 6,
 'min_child_weight': 1,
 'n_estimators': 2000,
 'gamma': 0.01}

for _ in range(10):
    df = pd.read_csv("car_data.csv")
    df = df.drop("guid", axis=1)
    X = df.copy()
    y = X.pop("price")
    scaler = StandardScaler()
    X_numeric = X[["age", "mileage"]]
    X_numeric[["age", "mileage"]] = scaler.fit_transform(X_numeric)
    enc = OneHotEncoder(sparse_output=False)
    X_cat = X[["make", "model", "fuel_type", "transmission"]]
    X_cat = pd.DataFrame(enc.fit_transform(X_cat)).astype(bool)
    X_combined = X_numeric.join(X_cat)
    X_train, X_test, y_train, y_test = train_test_split(X_combined, y, shuffle=True)
    reg = xgb.XGBRegressor(**params)
    reg.fit(X_train, y_train)
    yhat2 = reg.predict(X_test)
    results.append(mean_absolute_percentage_error(y_test,yhat2))
print(np.mean(results))

## train/test

In [2]:
def load_data():
    df = pd.read_csv("car_data.csv")
    df = df.drop("guid", axis=1)
    X = df.copy()
    y = X.pop("price")
    return X,y

def scale_encode_training(X,y, y_scaler:StandardScaler):
    scaler = StandardScaler()
    enc = OneHotEncoder(sparse_output=False)

    y = y_scaler.fit_transform(pd.DataFrame(y))

    X_numeric = X[["age", "mileage"]]
    X_numeric[["age", "mileage"]] = scaler.fit_transform(X_numeric)

    X_cat = X[["make", "model", "fuel_type", "transmission"]]
    X_cat = pd.DataFrame(enc.fit_transform(X_cat)).astype(bool)
    X_combined = X_numeric.join(X_cat)

    return X_combined, y, y_scaler

scaler = StandardScaler()
X, y = load_data()
X,y,scaler = scale_encode_training(X,y,scaler)
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_numeric[["age", "mileage"]] = scaler.fit_transform(X_numeric)


In [None]:
from sklearn.model_selection import KFold
reg = xgb.XGBRegressor()
n_estimators = range(500, 5001, 500)
param_grid = dict(n_estimators=n_estimators)
kfold = KFold(n_splits=3, shuffle=True, random_state=7)
grid_search = GridSearchCV(reg, param_grid, scoring="neg_mean_absolute_error", n_jobs=-1, cv=kfold)
grid_result = grid_search.fit(X_train, y_train)

from matplotlib import pyplot
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
pyplot.errorbar(n_estimators, means, yerr=stds)
pyplot.title("XGBoost n_estimators vs Log Loss")
pyplot.xlabel('n_estimators')
pyplot.ylabel('MAE Loss')

In [None]:
reg = xgb.XGBRegressor(n_estimators)
param_grid = {
    'n_estimators': (1000, 2000),
    'learning_rate': (0.3, 0.1, 0.01),
    # 'max_depth': (3, 6, 9),
    # 'min_child_weight': (1, 3, 6),
    'gamma': (0, 0.1, 0.01),
    # 'alpha': (0, 0.1),
    # 'lambda': [1, 1.5]
}


# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=reg, param_grid=param_grid, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1, verbose=2, refit=False)
# grid_search = RandomizedSearchCV(estimator=reg, param_distributions=param_grid, cv=3, n_iter=10, scoring='neg_mean_squared_error', n_jobs=4, verbose=2, refit=True)

In [66]:
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


In [None]:
yhat = grid_search.best_estimator_.predict(X_test)
yhat_norm = scaler.inverse_transform(pd.DataFrame(yhat))
y_test_norm = scaler.inverse_transform(pd.DataFrame(y_test))
mean_absolute_percentage_error(y_true=y_test_norm, y_pred=yhat_norm)

AttributeError: 'GridSearchCV' object has no attribute 'best_estimator_'

In [68]:
grid_search.best_params_

{'gamma': 0.01, 'learning_rate': 0.1, 'n_estimators': 2000}

In [78]:
params = {'alpha': 0,
 'lambda': 1,
 'learning_rate': 0.1,
 'max_depth': 6,
 'min_child_weight': 1,
 'n_estimators': 2000,
 'gamma': 0.01}
reg = xgb.XGBRegressor(**params)
reg.fit(X_train, y_train)

KeyboardInterrupt: 

In [77]:
yhat = reg.predict(X_test)
yhat_norm = scaler.inverse_transform(pd.DataFrame(yhat))
y_test_norm = scaler.inverse_transform(pd.DataFrame(y_test))
mean_absolute_percentage_error(y_true=y_test_norm, y_pred=yhat_norm)

0.30843647789295725

# Prod

In [70]:
df = pd.read_csv("car_data.csv")
df.loc[(df["price"]<100000) & (df["price"] >500)].to_csv("car_data_clean.csv", index=False)

In [3]:
def load_data():
    df = pd.read_csv("car_data_clean.csv")
    df = df.drop("guid", axis=1)
    X = df.copy()
    y = X.pop("price")
    return X,y

def scale_encode_training(X,y, yscaler:StandardScaler, xscaler:StandardScaler, enc:OneHotEncoder):
    y = yscaler.fit_transform(pd.DataFrame(y))

    X_numeric = X[["age", "mileage"]]
    X_numeric[["age", "mileage"]] = xscaler.fit_transform(X_numeric)

    X_cat = X[["make", "model", "fuel_type", "transmission"]]
    X_cat = pd.DataFrame(enc.fit_transform(X_cat)).astype(bool)
    X_combined = X_numeric.join(X_cat)

    return X_combined, y, yscaler, xscaler, enc

def scale_encode_predicting(X, xscaler:StandardScaler, enc:OneHotEncoder):
    # y = yscaler.transform(pd.DataFrame(y))

    X_numeric = X[["age", "mileage"]]
    X_numeric[["age", "mileage"]] = xscaler.transform(X_numeric)

    X_cat = X[["make", "model", "fuel_type", "transmission"]]
    X_cat = pd.DataFrame(enc.transform(X_cat)).astype(bool)
    X_combined = X_numeric.join(X_cat)

    return X_combined, xscaler, enc

# scaler = StandardScaler()
# enc = OneHotEncoder(sparse_output=False)
# X, y = load_data()
# X,y,scaler, enc = scale_encode_training(X,y,scaler, enc)
# X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True)

In [4]:
def train():
    yscaler = StandardScaler()
    xscaler = StandardScaler()
    enc = OneHotEncoder(sparse_output=False)
    X, y = load_data()
    X,y,yscaler,xscaler,enc = scale_encode_training(X,y,yscaler,xscaler,enc)
    with open('ystandardscaler.pickle', 'wb') as f:
        pickle.dump(yscaler, f)
    with open('xstandardscaler.pickle', 'wb') as f:
        pickle.dump(xscaler, f)
    with open('encoder.pickle', 'wb') as f:
        pickle.dump(enc, f)

    params = {'alpha': 0,
    'lambda': 1,
    'learning_rate': 0.1,
    'max_depth': 6,
    'min_child_weight': 1,
    'n_estimators': 2000,
    'gamma': 0.01}
    reg = xgb.XGBRegressor(**params)
    reg.fit(X, y)
    with open('regressor.pickle', 'wb') as f:
        pickle.dump(reg, f)
    print("Training complete.")
train()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_numeric[["age", "mileage"]] = xscaler.fit_transform(X_numeric)


Training complete.


In [None]:
def load_and_predict(df_prediction:pd.DataFrame):
    df_prediction["age"] = df_prediction["age"].apply(lambda x: (datetime.datetime.now() - datetime.datetime.fromisoformat(x)).days)
    with open("xstandardscaler.pickle", "rb") as f:
        xscaler:StandardScaler = pickle.load(f)
    with open("encoder.pickle", "rb") as f:
        enc:OneHotEncoder = pickle.load(f)
    with open("regressor.pickle", "rb") as f:
        reg:xgb.XGBRegressor = pickle.load(f)
    
    X = df_prediction.copy()
    # y = X.pop("price")

    X,_,_ = scale_encode_predicting(X,xscaler,enc)
    yhat = reg.predict(X)
    return yhat
def reverse_target(yhat):
    with open("ystandardscaler.pickle", "rb") as f:
        yscaler:StandardScaler = pickle.load(f)
    yhat = pd.DataFrame(yhat)
    return yscaler.inverse_transform(yhat)

In [19]:
df = pd.DataFrame({"make": "renault","model":"clio","mileage": 125674,"fuel_type": "b","age": "2012-06-01","transmission": "Handgeschakeld"}, index=[0])
yhat = load_and_predict(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_numeric[["age", "mileage"]] = xscaler.transform(X_numeric)


In [36]:
df = pd.read_csv("car_data.csv")
df["fuel_type"].unique()

array(['b', 'd', 'l', '2', 'e', 'c', '3', 'h'], dtype=object)

In [None]:
yhat = reverse_target(yhat)
int(yhat[0][0])

5328

In [106]:
df = pd.read_csv("car_data.csv")
df[df["model"] == "fiesta"]["price"].mean()

np.float64(9953.339805825242)