# Basic Models

In [13]:
import warnings
warnings.filterwarnings('ignore')

In [14]:
import pandas as pd
data = pd.read_csv("../data/scout_data/Case_Study_Data_CLEANED.csv", sep=',')
description = pd.read_csv("../data/scout_data/Data_Description.csv", sep=';')

In [15]:
pd.set_option('max_colwidth', 1000)
description

Unnamed: 0,column name,description
0,article_id,unique article identifier
1,product_tier,premium status of the article
2,make_name,name of the car manufacturer
3,price,price of the article
4,first_zip_digit,first digit of the zip code of the region the article is offered in
5,first_registration_year,year of the first registration of the article
6,created_date,creation date of the listing
7,deleted_date,deletion date of the listing
8,search_views,number of times the article has been shown as a search result
9,detail_views,number of times the article has been clicked on


In [16]:
data.describe()
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78320 entries, 0 to 78319
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   product_tier     78320 non-null  object 
 1   make_name        78320 non-null  object 
 2   car_age          78320 non-null  int64  
 3   price            78320 non-null  int64  
 4   search_views     78320 non-null  float64
 5   detail_views     78320 non-null  float64
 6   stock_days       78320 non-null  int64  
 7   ctr              78320 non-null  object 
 8   article_id       78320 non-null  int64  
 9   first_zip_digit  78320 non-null  int64  
 10  created_date     78320 non-null  object 
 11  deleted_date     78320 non-null  object 
dtypes: float64(2), int64(5), object(5)
memory usage: 7.2+ MB


In [17]:
data.head()

Unnamed: 0,product_tier,make_name,car_age,price,search_views,detail_views,stock_days,ctr,article_id,first_zip_digit,created_date,deleted_date
0,Basic,Mitsubishi,11,16750,3091.0,123.0,30,0.037803299902944,350625839,5,24.07.18,24.08.18
1,Basic,Mercedes-Benz,9,35950,3283.0,223.0,52,0.06792567773378,354412280,4,16.08.18,07.10.18
2,Basic,Mercedes-Benz,26,11950,3247.0,265.0,51,0.0816137973514013,349572992,3,16.07.18,05.09.18
3,Basic,Ford,21,1750,1856.0,26.0,101,0.0140086206896551,350266763,6,20.07.18,29.10.18
4,Basic,Mercedes-Benz,10,26500,490.0,20.0,12,0.0408163265306122,355688985,3,28.08.18,08.09.18


# Linear Regression - Predicting expected stock days 


In [33]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

y = data.stock_days
X = data[['price', 'car_age']]
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2)
reg = LinearRegression().fit(X_train, y_train)

## Scoring
The R² score is a measure of how well the model's predictions match the actual values. It indicates the proportion of variance in the dependent variable that is predictable from the independent variables. 

$R^2 = 1 − \frac{SS_{tot}}{​SS_{res}}$

The R² score (coefficient of determination) ranges between negative values and 1:
- 1: The model explains 100% of the variance in the target variable. Perfect fit.
- 0: The model is no better than simply predicting the mean of the target variable.
- Negative Values: The model is worse than predicting the mean. <br>
It means that the model's predictions deviate significantly from the true values, to the point where simply predicting the average target value would give a better result.

In [31]:
print(f"Score R^2: {reg.score(X_test, y_test):.4f}")
print(f"Coefficients: {[f'{coef:.4f}' for coef in reg.coef_]}") 
print(f"Intercept: {reg.intercept_:.4f}")

Score R^2: 0.0107
Coefficients: ['0.0001', '-0.3033']
Intercept: 37.8922


## Polynomials
<detail>

A simple linear regression can be extended by constructing polynomial features from the coefficients. In the standard linear regression case, you might have a model that looks like this for two-dimensional data:

$$\hat{y}(w, x) = w_0 + w_1 x_1 + w_2 x_2$$

If we want to fit a paraboloid to the data instead of a plane, we can combine the features in second-order polynomials, so that the model looks like this:
$$\hat{y}(w, x) = w_0 + w_1 x_1 + w_2 x_2 + w_3 x_1 x_2 + w_4 x_1^2 + w_5 x_2^2$$

The (sometimes surprising) observation is that this is still a linear model: to see this, imagine creating a new set of features
$$z = [x_1, x_2, x_1 x_2, x_1^2, x_2^2]$$

With this re-labeling of the data, our problem can be written
$$\hat{y}(w, z) = w_0 + w_1 z_1 + w_2 z_2 + w_3 z_3 + w_4 z_4 + w_5 z_5$$

We see that the resulting polynomial regression is in the same class of linear models we considered above (i.e. the model is linear in $w$) and can be solved by the same techniques. By considering linear fits within a higher-dimensional space built with these basis functions, the model has the flexibility to fit a much broader range of data.
</detail>

In [32]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score

# Creating the polynomial linear regression pipeline
pipe = Pipeline([
    ('poly', PolynomialFeatures(degree=2)),
    ('linear', LinearRegression(fit_intercept=False))
])

# Fitting the model
pipe.fit(X_train, y_train)

# Making predictions
y_train_pred = pipe.predict(X_train)
y_test_pred = pipe.predict(X_test)

# Evaluating the model
train_mse = mean_squared_error(y_train, y_train_pred)
test_mse = mean_squared_error(y_test, y_test_pred)
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

print(f'Train MSE: {train_mse:.2f}, Train R^2: {train_r2:.4f}')
print(f'Test MSE: {test_mse:.2f}, Test R^2: {test_r2:.4f}')

Train MSE: 1018.97, Train R^2: 0.0166
Test MSE: 1031.01, Test R^2: 0.0123


## Standardizing

In [34]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Ridge (L2)
The ridge coefficients minimize a penalized residual sum of squares:
$$\min_{w} || X w - y||_2^2 + \alpha ||w||_{2}^2$$
The complexity parameter $\alpha \geq 0$ controls the amount of shrinkage: <br>
the larger the value of $\alpha$, the greater the amount of shrinkage and thus the coefficients become more robust to collinearity.

It is particularly useful to mitigate the problem of multicollinearity in linear regression, which commonly occurs in models with large numbers of parameters.

 In general, the method provides improved efficiency in parameter estimation problems in exchange for a tolerable amount of bias (see bias–variance tradeoff).

In [35]:
from sklearn.linear_model import Ridge

ridge_reg = Ridge(alpha=1.0)  # You can tune alpha via cross-validation
ridge_reg.fit(X_train_scaled, y_train)
print(f"R^2: {ridge_reg.score(X_test_scaled, y_test):.4f}")

R^2: 0.0141


## Gridsearch
Exhaustive search over specified parameter values for an estimator.

In [38]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge

param_grid = {'alpha': [0.1, 1.0, 10.0, 100.0]}
ridge = Ridge()
grid_search = GridSearchCV(ridge, param_grid, cv=5)
grid_search.fit(X_train_scaled, y_train)
print(f'Best alpha: {grid_search.best_params_}')

Best alpha: {'alpha': 100.0}


In [39]:
ridge_best = Ridge(alpha=100)  # You can tune alpha via cross-validation
ridge_best.fit(X_train_scaled, y_train)
print(f"{ridge_best.score(X_test_scaled, y_test):.4f}")

0.0141


## Cross validation

Resampling method that iteratively partitions data into mutually exclusive ‘train’ and ‘test’ subsets so model performance can be evaluated on unseen data. 

This conserves data as avoids the need to hold out a ‘validation’ dataset and accounts for variability as multiple rounds of cross validation are generally performed.

This helps in avoiding the randomness associated with the train-test split and gives a better estimate evaluates more robustly how well the model will generalize.

In [40]:
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(ridge_reg, X_test_scaled, y_test, cv=5)
print(f'Cross-validated score: {cv_scores.mean():.4f}')

Cross-validated score: 0.0139


## Lasso (L1)
The Lasso is a linear model that estimates sparse coefficients. It prefers solutions with fewer non-zero coefficients, effectively reducing the number of features upon which the given solution is dependent. .

Mathematically, it consists of a linear model with an added regularization term. The objective function to minimize is:
$$\min_{w} { \frac{1}{2n_{\text{samples}}} ||X w - y||_2 ^ 2 + \alpha ||w||_1}$$

The lasso estimate thus solves the minimization of the least-squares penalty with $(\alpha ||w||_1)$ added, where $(\alpha)$ is a constant and $(||w||_1)$ is the $(\ell_1)$-norm of the coefficient vector.

In [41]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

lasso_reg = Lasso(alpha=0.1)  
lasso_reg.fit(X_train_scaled, y_train)
lasso_reg.score(X_test_scaled, y_test)

# search for a good alpha
param_grid = {'alpha': [0.1, 1.0, 10.0, 100.0]}
grid_search = GridSearchCV(lasso_reg, param_grid, cv=5) 
grid_search.fit(X_train_scaled, y_train)

print(f'Best alpha: {grid_search.best_params_}')

Best alpha: {'alpha': 0.1}


In [44]:
# Train Lasso with best alpha
lasso_best = Ridge(alpha=0.1)  
lasso_best.fit(X_train_scaled, y_train)
print(f"{lasso_best.score(X_test_scaled, y_test):.6f}")

0.014086


In [46]:
cv_scores = cross_val_score(lasso_best, X_test_scaled, y_test, cv=5)
print(f'Cross-validated score: {cv_scores.mean():.6f}')

Cross-validated score: 0.013892


# Random Forest

A random forest fits a number of decision tree regressors on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting. 

Trees in the forest use the best split strategy, i.e. equivalent to passing _splitter="best"_ to the underlying DecisionTreeRegressor. 

The sub-sample size is controlled with the max_samples parameter if _bootstrap=True_ (default), otherwise the whole dataset is used to build each tree.

**n_estimators**: int, default=100, The number of trees in the forest.

**criterion**: {“squared_error”, “absolute_error”, “friedman_mse”, “poisson”}, default=”squared_error”

**max_dept**:  hint, default=None, The maximum depth of the tree.

**n_jobs**: int, default=None, The number of jobs to run in parallel. None means 1,  -1 means using all processors. 

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html#sklearn.ensemble.RandomForestRegressor


In [47]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=100, criterion="friedman_mse", n_jobs=-1)
rf.fit(X_train_scaled, y_train)
rf.score(X_test_scaled, y_test)

cv_scores = cross_val_score(rf, X_test_scaled, y_test, cv=5)
print(f'Cross-validated score: {cv_scores.mean():.4f}')

Cross-validated score: -0.2338


# Bad Fit and Good Fix

## Possible reasons for negative R² scores:

- **Overfitting**: The model may have memorized the training data well, but performs poorly on the unseen validation sets.  
- **Poor Feature Selection**: The input features (X_test_scaled) might not be informative enough for the model to predict the target variable accurately.
- **Data Scaling/Preprocessing**: Scaling or preprocessing issues might exist, or the features may need more transformation.
- **Data Issues**: The dataset might contain noise, or the relationship between the features and the target variable might not be easily captured by the Random Forest model.
- **Model Complexity**: Random Forest models are usually robust, but they can still struggle with certain data types or relationships, particularly if the problem is inherently complex or the data has high variance.

## How to address this issue:

- **Check for Data Quality**: Make sure there are no data entry errors, missing values, or highly imbalanced classes that could skew the model's learning.
- **Tune Hyperparameters**: Use hyperparameter tuning (e.g., GridSearchCV or RandomizedSearchCV) to search for the best hyperparameters for your Random Forest model (like n_estimators, max_depth, etc.).
- **Feature Engineering**: Experiment with feature selection, engineering, and transformation. Adding interaction terms, normalizing, or reducing dimensionality with methods like PCA could help.
- **Try Other Models**: Sometimes, the Random Forest might not be the best model for the task. You could try other models like Gradient Boosting, XGBoost, or even Linear Regression to see if they perform better.
- **Cross-validation strategy**: Make sure your cross-validation is set up correctly and that you are not leaking information between the folds.

# Categorical Features - Encoding

To integrate categorical features into your model, you need to encode them into a numerical format. 

The two most common encoding methods for categorical features are One-Hot Encoding and Label Encoding. 

One-Hot Encoding is typically preferred because it ensures that the model treats the categorical feature as non-ordinal (i.e., the model doesn't assume any inherent order among categories).

- StandardScaler() is applied to the numerical features ['price', 'car_age'].

- OneHotEncoder(drop='first') is used to encode the categorical features ['product_tier', 'make_name']. 

- The drop='first' option ensures we avoid the dummy variable trap by dropping one category from each categorical variable.

- ColumnTransformer allows you to apply these transformations to the specific columns you selected.


In [4]:
data.make_name.unique()

array(['Mitsubishi', 'Mercedes-Benz', 'Ford', 'Volkswagen', 'Fiat',
       'Renault', 'Mazda', 'Peugeot', 'Opel', 'Toyota', 'Jaguar', 'Volvo',
       'Dacia', 'MINI', 'Porsche', 'Nissan', 'BMW', 'Land Rover', 'Audi',
       'Citroen', 'Hyundai', 'Suzuki', 'Alfa Romeo', 'Chevrolet',
       'Daewoo', 'Kia', 'Maserati', 'Skoda', 'Caravans-Wohnm', 'SEAT',
       'Honda', 'Daihatsu', 'Chrysler', 'smart', 'Saab', 'Jeep',
       'Others ', 'Lexus', 'Aixam', 'Ligier', 'Lancia', 'Oldtimer',
       'Chatenet', 'Subaru', 'Triumph', 'Ferrari', 'Rolls-Royce', 'Dodge',
       'MG', 'Cadillac', 'DS Automobiles', 'Iveco', 'Bentley',
       'SsangYong', 'Tesla', 'Trucks-Lkw', 'TVR', 'Aston Martin',
       'Abarth', 'HUMMER', 'Lincoln', 'Isuzu', 'Microcar', 'Buick', 'AC',
       'Alpina', 'Corvette', 'McLaren', 'Rover', 'Austin', 'De Tomaso',
       'FISKER', 'Infiniti', 'Lotus', 'Morgan', 'GMC', 'Oldsmobile',
       'Donkervoort', 'Alpine', 'Daimler', 'Lamborghini', 'Grecav',
       'Casalini', 'Pontia

In [53]:
data.product_tier.unique()

array(['Basic', 'Premium', 'Plus'], dtype=object)

In [10]:
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

y = data['stock_days']
X = data[['price', 'car_age', 'product_tier', 'make_name']]

# Extract all possible categories from the entire dataset (for both training and test purposes)
possible_categories = {
    'product_tier': X['product_tier'].unique(),
    'make_name': X['make_name'].unique()
}

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['price', 'car_age']),  # Scale numerical features
        ('cat', OneHotEncoder(categories=[possible_categories['product_tier'], 
                                          possible_categories['make_name']],
                              handle_unknown='ignore'), ['product_tier', 'make_name'])  # One-Hot encode categorical features with predefined categories
    ]
)

# Pipeline that first applies the preprocessor and then the model
pipe = Pipeline([('preprocessor', preprocessor),
                 ('regressor', LinearRegression())])

# Fit the pipeline on the training data
pipe.fit(X_train, y_train)

# Make predictions and evaluate the model
y_pred = pipe.predict(X_test)

In [11]:
print(f"R² score: {pipe.score(X_test, y_test):.4f}")
print(f"Mean Squared Error: {mean_squared_error(y_test, y_pred):.4f}")

cv_scores = cross_val_score(pipe, X_test, y_test, cv=5)
print(f'Cross-validated score: {cv_scores.mean():.4f}')

R² score: 0.0109
Mean Squared Error: 1032.4841
Cross-validated score: 0.0067
