### Import libraries

In [None]:
import pandas as pd
import numpy as np

from fuzzywuzzy import process #https://github.com/seatgeek/fuzzywuzzy
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.ensemble import  GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.impute import KNNImputer
import warnings


### Functions

In [2]:
def correct_brand_name(df, col, brand_list, threshold=70):
    """
    Correcting the brand names of the cars that are not spelled correctly

    Every distinct value of ``df[col]`` is fuzzy-matched against each name of
    ``brand_list``; the values that score above ``threshold`` are overwritten
    with the correct name. The dataframe is modified in place.

    Parameters
    ----------
    df : Dataframe
        The dataframe of the data
    col : String
        The name of the column that we want to correct
    brand_list : List
        List of the correct names of the brands
    threshold : Int, optional
        Matching score a value must exceed to be replaced (default 70)

    Returns
    -------
    df : Dataframe
        The main dataframe with the names spelled correctly

    """
    for brand in brand_list:
        # Recomputed for every brand so that earlier corrections are taken
        # into account; scoring each distinct value once avoids re-scoring
        # the same spelling for every row it appears in.
        candidates = df[col].dropna().unique().tolist()
        if not candidates:
            break
        matches = process.extract(brand, candidates, limit=len(candidates))
        misspelled = [match[0] for match in matches if match[1] > threshold]
        if misspelled:
            # Select the rows through `col` itself, not a hard-coded column
            df.loc[df[col].isin(misspelled), col] = brand

    return df

def encode_string(df, list_with_features, encoder):
    """
    Add an encoded copy of every listed column to the dataframe

    Parameters
    ----------
    df : Dataframe
        The dataframe; the new columns are added to it in place
    list_with_features : List
        Names of the columns to encode
    encoder : Class, Encoder
        Object with a ``fit_transform`` method; it is fitted again on
        every column of the list

    Returns
    -------
    df : Dataframe
        The same dataframe with one extra ``encoded_<name>`` column
        per listed feature

    """
    for column_name in list_with_features:
        encoded_values = encoder.fit_transform(df[column_name])
        target_column = f'encoded_{column_name}'
        df[target_column] = encoded_values
    return df

def fill_na(df, imputer, exclude_columns=('encoded_brand_name', 'car name')):
    """
    Filling the NaN values of the dataframe with an imputer

    Parameters
    ----------
    df : Dataframe
        The main dataframe
    imputer : Class (KNNImputer)
        The imputer we will use; only its ``fit_transform`` method is called
    exclude_columns : Sequence of String, optional
        Columns that are left out of the imputation and of the returned
        dataframe. Every name must exist in ``df`` (a KeyError is raised
        otherwise).

    Returns
    -------
    df : Dataframe
        New dataframe holding the imputed columns, with the same row index
        as the input

    """
    features = df.drop(columns=list(exclude_columns)).columns.tolist()
    array = imputer.fit_transform(df[features])
    # Keep the original index so the rows stay aligned with the input even
    # when it does not have a default 0..n-1 index (e.g. after filtering).
    return pd.DataFrame(array, columns=features, index=df.index)

def drop_outliers(df, feature, threshold):
    """
    Function to drop the rows whose value is an upper outlier of a feature

    A row is dropped when its value is greater than or equal to
    Q3 + threshold * (Q3 - Q1), where Q1 and Q3 are the 0.25 and 0.75
    quantiles of the feature. Low values are never dropped.

    Parameters
    ----------
    df : Dataframe
        The main Dataframe
    feature : String
        Name of the column that is checked for outliers
    threshold : Number
        Multiplier of the interquartile range

    Returns
    -------
    df : Dataframe
        The main dataframe without the outliers

    """
    values = df[feature]
    upper_quartile = values.quantile(0.75)
    lower_quartile = values.quantile(0.25)
    upper_fence = upper_quartile + threshold*(upper_quartile - lower_quartile)
    is_outlier = values >= upper_fence
    return df[~is_outlier]

def train_predict(model, X_train, X_test, Y_train, Y_test):
    """
    Fit the model on the train set and report its errors on both sets

    Parameters
    ----------
    model : Class (GradientBoostingRegressor)
        The regressor we will use to predict the target; it is fitted here
    X_train : 2D numpy array
        Values of the features of the train set
    X_test : 2D numpy array
        Values of the features of the test set
    Y_train : 1D numpy array
        The target values of the train set
    Y_test : 1D numpy array
        The target values of the test set to compare the predictions

    Returns
    -------
    Nothing; the RMSE, MAE and R2 of the train and test predictions
    are printed

    """
    model.fit(X_train, Y_train)
    train_predictions = model.predict(X_train)
    test_predictions = model.predict(X_test)
    print(f'Train and predict using: {model}')

    # Root mean squared error
    train_rmse = np.sqrt(mean_squared_error(Y_train, train_predictions))
    print(f'The root mean squared error on the train set is: {train_rmse}')
    test_rmse = np.sqrt(mean_squared_error(Y_test, test_predictions))
    print(f'The root mean squared error on the test set is: {test_rmse}')

    # Mean absolute error
    train_mae = mean_absolute_error(Y_train, train_predictions)
    print(f'The mean absolute error on train set is: {train_mae}')
    test_mae = mean_absolute_error(Y_test, test_predictions)
    print(f'The mean absolute error on test set is: {test_mae}')

    # Coefficient of determination
    train_r2 = r2_score(Y_train, train_predictions)
    print(f'The R2 accuracy on train set is: {train_r2}')
    test_r2 = r2_score(Y_test, test_predictions)
    print(f'The R2 accuracy on test set is: {test_r2}')

def sanity_check(X_train, Y_train, X_test, Y_test):
    """
    Print the shape of every train and test set
    Input:
        The four sets; each one must expose a ``shape`` attribute
    Output:
        Nothing is returned; one line per set is printed, in the order
        X_train, X_test, Y_train, Y_test
    """
    labelled_sets = (
        ('X_train', X_train),
        ('X_test', X_test),
        ('Y_train', Y_train),
        ('Y_test', Y_test),
    )
    for label, data in labelled_sets:
        print(f'Shape of {label}=>', data.shape)

# Main

### Preprocessing

In [3]:
# Reading Data
# NOTE(review): this is an absolute path on one developer's machine; it has to
# be edited before the notebook can run anywhere else
path = 'C:/Users/Maverick/Documents/Development/Regeneration/Project/Data/mpg.data.xlsx'
df = pd.read_excel(path)

In [4]:
# Removing the four 'Unnamed' columns, which are not useful
df = df.drop(columns=['Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12'])

In [5]:
# Giving the nine remaining columns their names, then checking each column
# for NaN values
df.columns = [
    'mpg',
    'cylinders',
    'displacements',
    'horsepower',
    'weight',
    'acceleration',
    'model year',
    'origin',
    'car name',
]
print(df.isna().any())

mpg               True
cylinders        False
displacements    False
horsepower        True
weight           False
acceleration     False
model year       False
origin           False
car name         False
dtype: bool


In [6]:
# Separating the brand name (the first space-delimited word) from the car name
df['brand_name'] = df['car name'].str.split(" ").str[0]

In [7]:
# Correcting the brand names
# brand_list holds the correctly spelled brands; correct_brand_name overwrites
# every brand_name entry whose fuzzy-match score against one of them is above 70
brand_list = ['chevrolet', 'mazda', 'mercedes-benz', 'toyota', 'vw']
df = correct_brand_name(df, 'brand_name', brand_list)


In [8]:
# Encoding the listed features to integer labels; each result is stored in a
# new encoded_<feature> column
list_of_features_to_encode = [
    'cylinders',
    'origin',
    'brand_name',
    'model year',
]
df = encode_string(df, list_of_features_to_encode, encoder=LabelEncoder())

In [9]:
# Dropping the original columns now that their encoded copies exist
# (the dataframe is modified in place)
drop = ['cylinders', 'origin', 'brand_name', 'model year']
df.drop(columns=drop, inplace=True)

In [10]:
# Filling NaN values with the KNNImputer
# fill_na leaves out 'encoded_brand_name' and 'car name', so the returned
# dataframe no longer contains those two columns
df = fill_na(df, KNNImputer(n_neighbors=2, weights='uniform'))

In [11]:
# From EDA we know there are outliers in horsepower and in acceleration
# The rows at or above each feature's upper fence (Q3 + threshold * IQR)
# are removed from the dataframe
df = drop_outliers(df, feature='horsepower', threshold=1.8)
df = drop_outliers(df, feature='acceleration', threshold=2)

# Training

In [12]:
# Splitting the data into train (80%) and test (20%) sets; the target is
# the 'mpg' column and every other column is a feature
seed = 21
X_train, X_test, Y_train, Y_test = train_test_split(
    df.drop(columns='mpg'),
    df['mpg'],
    test_size=0.2,
    random_state=seed,
)


In [13]:
# Printing the shapes of the four sets as a sanity check
sanity_check(X_train=X_train, Y_train=Y_train, X_test=X_test, Y_test=Y_test)

Shape of X_train=> (322, 7)
Shape of X_test=> (81, 7)
Shape of Y_train=> (322,)
Shape of Y_test=> (81,)


In [14]:
# Scaling the input data with StandardScaler; the scaler is fitted on the
# train set only and then applied to both sets
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [15]:
# Training the model on train set and predicting the target on test set
# NOTE(review): the regressor is created with its default parameters and no
# random_state, so the printed metrics may not be exactly reproducible
train_predict(GradientBoostingRegressor(), X_train, X_test, Y_train, Y_test)

Train and predict using: GradientBoostingRegressor()
The root mean squared error on the train set is: 1.359596997389613
The root mean squared error on the test set is: 2.0953893155085828
The mean absolute error on train set is: 1.0504774508948949
The mean absolute error on test set is: 1.619263056210526
The R2 accuracy on train set is: 0.969870195401348
The R2 accuracy on test set is: 0.9242876751396395


In [None]:
# Finding the best parameters for the model with grid search
search_grid = {
    'n_estimators': [500, 1000, 2000],
    'learning_rate': [0.001, 0.01, 0.1],
    'max_depth': [1, 2, 4],
    'subsample': [0.5, 0.75, 1],
    'random_state': [1],
}
# 81 parameter combinations x 5 folds are fitted; n_jobs=-1 runs those fits in
# parallel on all available cores. The selected parameters are unaffected
# because random_state is fixed in the grid.
gsv = GridSearchCV(
    estimator=GradientBoostingRegressor(),
    param_grid=search_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
gsv.fit(X_train, Y_train)
# best_score_ is the negated mean squared error, so values closer to 0 are better
print(f'The best score from the grid search is: {gsv.best_score_}')
print(f'The best parameters from the grid search is: {gsv.best_params_}')

In [None]:
# Training and predicting with the tuned parameters (typed in by hand, not
# read from the grid search object)
model = GradientBoostingRegressor(
    n_estimators=1000,
    learning_rate=0.01,
    max_depth=2,
    subsample=0.5,
    random_state=1,
)
train_predict(model, X_train, X_test, Y_train, Y_test)