## Model training

In [1]:
import os
import joblib
import warnings
import numpy as np
import pandas as pd
from scipy.stats import norm, skew
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

warnings.filterwarnings('ignore')

In [2]:
def compute_mae(y_test: np.ndarray, y_pred: np.ndarray, precision: int = 2):
    mae = mean_absolute_error(y_test, y_pred)
    return {'mae': f"{round(mae, precision)}"}

def save_object(obj, filename):
    current_working_dir = os.path.abspath(os.getcwd())
    file_path = os.path.join(current_working_dir, 'models', filename)
    joblib.dump(obj, file_path)
    
def load_object(filename):
    current_working_dir = os.path.abspath(os.getcwd())
    file_path = os.path.join(current_working_dir, 'models', filename)
    obj = joblib.load(file_path)
    return obj

In [3]:
selected_features = ['powerPS', 'vehicleType', 'brand', 'fuelType', 'kilometer', 'price']
selected_predictors = ['powerPS', 'vehicleType', 'brand', 'fuelType', 'kilometer']
continuous_features = ['powerPS', 'kilometer', 'price']
continuous_predictors = ['powerPS', 'kilometer']
categorical_features = ['vehicleType', 'brand', 'fuelType']
target_feature = 'price'

imputer_filename = 'imputer.joblib'
pipeline_filename = 'pipeline.joblib'
skewed_predictors = 'skewed_predictors.joblib'

In [4]:
def fix_data_type(data):
    fixed_data = data.astype({
        'powerPS': object
    })
    return fixed_data

def update_data_features(data_raw, features):
    return data_raw[features]

def clean_data(data_raw):
    updated_data_raw = update_data_features(data_raw, selected_features)
    fixed_type_data_raw = fix_data_type(updated_data_raw)
    return fixed_type_data_raw

def clean_inference_data(data):
    updated_data_raw = update_data_features(data, selected_predictors)
    fixed_type_data_raw = fix_data_type(updated_data_raw)
    return fixed_type_data_raw

In [5]:
def fix_data(data):
    data.drop_duplicates(inplace=True)
    data.drop(data[data['powerPS'] == 0].index, inplace=True)
    data.reset_index(drop=True, inplace=True)
    return data

In [6]:
def handing_missing_values(data):
    si = SimpleImputer(strategy='constant', fill_value='any')
    si.fit(data[selected_predictors])
    save_object(si, imputer_filename)

def fix_missing_values(data):
    si = load_object(imputer_filename)
    data.loc[:, selected_predictors] = si.transform(data[selected_predictors])
    return data

In [7]:
def find_skewed_predictors(data):
    skewness = data[continuous_predictors].apply(lambda x: skew(x))
    skewness = skewness[abs(skewness) > 0.5]
    skewed_features = skewness.index
    save_object(skewed_features, skewed_predictors)

def fix_predictor_skewness(data):
    skewed_features = load_object(skewed_predictors)
    data[skewed_features] = np.log1p(data[skewed_features].astype(float))
    return data

def fix_target_skewness(data):
    data[target_feature] = np.log1p(data[target_feature].astype(float))
    return data

In [8]:
def preprocess_for_build(data):
    fixed_data = fix_data(data)
    handing_missing_values(fixed_data)
    no_missing_array = fix_missing_values(fixed_data)
    no_missing_data = pd.DataFrame(no_missing_array, columns=selected_features)
    find_skewed_predictors(no_missing_data)
    unskewed_predictors_data = fix_predictor_skewness(no_missing_data)
    unskewed_data = fix_target_skewness(unskewed_predictors_data)
    return unskewed_data

def preprocess_for_prediction(data):
    no_missing_data = fix_missing_values(data)
    unskewed_data = fix_predictor_skewness(no_missing_data)
    return unskewed_data

In [9]:
def split_train_data_raw(data, target_feature):
    X = data.drop(target_feature, axis=1)
    y = data[target_feature]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.25, random_state = 0)
    
    return X_train, X_val, y_train, y_val

In [10]:
def build_pipeline():
    categorical_transformer = OneHotEncoder(handle_unknown="ignore")
    numeric_transformer = StandardScaler()

    preprocessor = ColumnTransformer(
        transformers=[
            ("cat", categorical_transformer, categorical_features),
            ("num", numeric_transformer, continuous_predictors),
        ]
    )

    pipe = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', RandomForestRegressor(n_estimators=50, max_depth=10, max_features="sqrt"))
    ])
    return pipe
    
def train_model(X, y):
    # Train
    pipeline = build_pipeline()
    pipeline.fit(X, y)
    save_object(pipeline, pipeline_filename)
    
def predict(data):
    pipeline = load_object(pipeline_filename)
    y_pred = pipeline.predict(data)
    return y_pred

In [11]:
def build_model(data_raw: pd.DataFrame):
    cleaned_data = clean_data(data_raw)
    preprocessed_data = preprocess_for_build(cleaned_data)
    X_train, X_val, y_train, y_val = split_train_data_raw(preprocessed_data, target_feature)
    # Train model and predict
    train_model(X_train, y_train)
    y_pred = predict(X_val)
    result = mean_absolute_error(y_val, y_pred)
    return result

In [13]:
raw = pd.read_csv('./data/autos.csv', encoding='ISO-8859-1')
build_model(raw)

0.7708193113907246

In [47]:
raw = pd.read_csv('./data/autos.csv', encoding='ISO-8859-1')
cleaned_data = clean_data(raw)
preprocessed_data = preprocess_for_build(cleaned_data)
preprocessed_data.sample(n=5)
X_train, X_val, y_train, y_val = split_train_data_raw(preprocessed_data, target_feature)
# Train model and predict
train_model(X_train, y_train)
y_pred = predict(X_val)
result = mean_absolute_error(y_val, y_pred)

In [48]:
result

0.6665314754939098

## Model Inference

In [12]:
def make_predictions(input_data_raw: pd.DataFrame):
    # the model and all the data preparation objects (encoder, etc) should be loaded from the models folder
    data = clean_inference_data(input_data_raw)
    preprocessed_data = preprocess_for_prediction(data)
    y_pred = predict(preprocessed_data)
    return np.expm1(y_pred)

data_raw = pd.read_csv('./data/test.csv')
y_pred = make_predictions(data_raw)
y_pred

KeyError: 118