In [1]:
from src.embeddings.car_make import CarMakeEmbedding
from sklearn.preprocessing import OneHotEncoder
from datetime import date
import pandas as pd

In [2]:
cme = CarMakeEmbedding(
        label_encoder='/Users/ignasi/Documents/_03_MDS_/_01_ADSDB_/src/embeddings/encoder.pkl',
        pkl_path='/Users/ignasi/Documents/_03_MDS_/_01_ADSDB_/src/embeddings/embedding.pkl',
    )

In [26]:
X

Unnamed: 0,make_0,make_1,make_2,make_3,make_4,make_5,make_6,make_7,make_8,make_9,vehicle_age
0,1.079697,1.275756,-2.120022,0.443211,-0.032822,-1.873419,0.642528,1.523962,1.413312,0.522230,22.0
1,1.079697,1.275756,-2.120022,0.443211,-0.032822,-1.873419,0.642528,1.523962,1.413312,0.522230,22.0
2,1.079697,1.275756,-2.120022,0.443211,-0.032822,-1.873419,0.642528,1.523962,1.413312,0.522230,22.0
3,1.079697,1.275756,-2.120022,0.443211,-0.032822,-1.873419,0.642528,1.523962,1.413312,0.522230,22.0
4,1.079697,1.275756,-2.120022,0.443211,-0.032822,-1.873419,0.642528,1.523962,1.413312,0.522230,22.0
...,...,...,...,...,...,...,...,...,...,...,...
6561,1.079697,1.275756,-2.120022,0.443211,-0.032822,-1.873419,0.642528,1.523962,1.413312,0.522230,23.0
6562,1.079697,1.275756,-2.120022,0.443211,-0.032822,-1.873419,0.642528,1.523962,1.413312,0.522230,23.0
6563,-0.858480,0.283772,1.957316,0.434062,0.475781,0.007459,-0.194566,-0.355091,0.147343,1.034901,12.0
6564,0.898987,0.058487,1.387583,-0.509846,-0.590852,0.437885,-0.678264,1.134799,1.981396,-1.393860,9.0


## Analytical sandbox

We read the dataset with the following columns:
'person_id', 'person_age', 'person_sex', 'vehicle_make', 'vehicle_year', 'overall_rating'
From these columns and as we want to analyze the vehicle safety rating, we will delete the information related to the driver, so we will delete 'person_id', 'person_age', 'person_sex'. More specifically, we will try to find a relation between makes + years ~ safety. Looking for a pattern in this variables and computing clustering or regressions. 

In [3]:
df = pd.read_csv('/Users/ignasi/Documents/_03_MDS_/_01_ADSDB_/ui/sample_tables/mvc_safety_rating_by_accidents.csv')
df = df[['vehicle_make', 'vehicle_year', 'overall_rating']]

In [7]:
df['vehicle_year'].max()

2024.0

## Feature engineering

In [8]:
df['vehicle_age'] = date.today().year - df['vehicle_year']
df['norm_overall_rating'] = df['overall_rating'] / 5

In [9]:
df['vehicle_make_embedding'] = df['vehicle_make'].apply(lambda x: cme.execute(x)[0])
make_df = pd.DataFrame(df['vehicle_make_embedding'].tolist(), columns=[f'make_{i}' for i in range(len(df['vehicle_make_embedding'][0]))])
df = pd.concat([df.drop(columns=['vehicle_make_embedding']), make_df], axis=1)
df.dropna(inplace=True)

In [10]:
df = df[[f'make_{i}' for i in range(10)] + ['vehicle_age', 'overall_rating']]

In [11]:
X, y = df[[f'make_{i}' for i in range(10)] + ['vehicle_age']], df['overall_rating']

In [12]:
y_onehot = OneHotEncoder().fit_transform(list(map(lambda x: [round(x, 0)], y))).toarray()

In [13]:
y_class = list(map(lambda x: round(x, 0), y))

## Modeling

In [14]:
import numpy as np
from sklearn.linear_model import LinearRegression
reg = LinearRegression().fit(X, y)
reg.score(X, y)

0.14106601670008756

In [15]:
from sklearn.linear_model import Ridge
import numpy as np
clf = Ridge(alpha=1.0)
clf.fit(X, y)
clf.score(X, y)

0.14105732492932876

In [16]:
from sklearn import linear_model
clf = linear_model.Lasso(alpha=0.1)
clf.fit(X, y)
clf.score(X, y)

0.07102075624098958

In [17]:
from sklearn.ensemble import RandomForestRegressor
regr = RandomForestRegressor()
regr.fit(X, y)
regr.score(X, y)

0.21405112201610055

In [18]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [19]:
models = {
    'RandomForest': {
        'model': RandomForestClassifier(),
        'param_grid': {
            'n_estimators': [50, 100, 200],
            'max_depth': [None, 5, 10],
            'min_samples_split': [2, 5, 10]
        }
    },
    'KNN': {
        'model': KNeighborsClassifier(),
        'param_grid': {
            'n_neighbors': [3, 5, 7, 9],
            'weights': ['uniform', 'distance']
        }
    }
}

# Loop through the models and perform grid search
for model_name, model_data in models.items():
    print(f"Tuning {model_name}...")

    # Create a GridSearchCV object
    grid_search = GridSearchCV(estimator=model_data['model'],
                               param_grid=model_data['param_grid'],
                               cv=5)

    # Fit the grid search to the training data
    grid_search.fit(X, y_class)

    # Print the best parameters and best score
    print(f"Best parameters found for {model_name}: ", grid_search.best_params_)
    print(f"Best cross-validation score for {model_name}: ", grid_search.best_score_)

    # Evaluate the model on the test set
    score = grid_search.score(X, y_class)
    print(f"Test set score for {model_name}: ", score)
    print("-" * 50)


Tuning RandomForest...




Best parameters found for RandomForest:  {'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 200}
Best cross-validation score for RandomForest:  0.567896994042421
Test set score for RandomForest:  0.5898410504492052
--------------------------------------------------
Tuning KNN...




Best parameters found for KNN:  {'n_neighbors': 7, 'weights': 'uniform'}
Best cross-validation score for KNN:  0.5585684793171548
Test set score for KNN:  0.5345542501727713
--------------------------------------------------


In [20]:
rf = RandomForestClassifier()
rf.fit(X, y_class)
rf.score(X, y_class)

0.5898410504492052

In [21]:
import pickle

filename = 'rf_rands.pkl'
with open(filename, 'wb') as file:
    pickle.dump(rf, file)

In [28]:
df['vehicle_age'].max()

28.0