# Data engineering and Model fitting

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# Scikit learn packages
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.preprocessing import StandardScaler #for scaling of mileage

# Statsmodel packages
import statsmodels.api as sm
import statsmodels.tools

from statsmodels.stats.outliers_influence import variance_inflation_factor

#for normality test
from scipy import stats

## Defining useful functions

In [None]:
def dummies(columns_to_dummy, df):
    """One-hot encode selected columns of a DataFrame.

    Args:
        columns_to_dummy: List of column names to replace with indicator
            columns. The same names are used as the prefixes of the
            generated columns.
        df: DataFrame holding the columns to encode.

    Returns:
        The frame produced by ``pd.get_dummies``. ``drop_first=True`` means
        the first level of every encoded column gets no indicator column.
    """
    return pd.get_dummies(
        df,
        prefix=columns_to_dummy,
        columns=columns_to_dummy,
        drop_first=True,
    )

In [None]:
def apr(y_pred, y_real):
    """Print and return accuracy, precision, recall and F1 for predictions.

    Args:
        y_pred: Predicted labels.
        y_real: True labels.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)`` as computed by the
        corresponding ``sklearn.metrics`` scorers.
    """
    # Every scorer takes (y_true, y_pred); evaluate them in reporting order.
    scorers = (
        metrics.accuracy_score,
        metrics.precision_score,
        metrics.recall_score,
        metrics.f1_score,
    )
    accuracy, precision, recall, f1 = (scorer(y_real, y_pred) for scorer in scorers)

    print(f"Accuracy:{accuracy}")
    print(f"Precision:{precision}")
    print(f"Recall:{recall}")
    print(f"F1:{f1}")
    return accuracy, precision, recall, f1


In [None]:
def produce_confusion(positive_label, negative_label, cut_off, df, y_pred_name, y_real_name):
    """Plot a confusion-matrix heatmap and return the accuracy.

    Args:
        positive_label: Tick label shown for the second row/column.
        negative_label: Tick label shown for the first row/column.
        cut_off: Threshold applied to the prediction column; values strictly
            greater than it become 1, all others 0. Pass the string
            ``'binary'`` when the predictions are already class labels.
        df: DataFrame holding both the prediction and the real-label columns.
            A ``'pred_binary'`` column is added to (or overwritten in) it.
        y_pred_name: Name of the prediction column in ``df``.
        y_real_name: Name of the real-label column in ``df``.

    Returns:
        The accuracy of ``df['pred_binary']`` against ``df[y_real_name]``.
    """
    # Set pred to 0 or 1 depending on whether it's higher than the cut_off point.
    if cut_off != 'binary':
        df['pred_binary'] = np.where(df[y_pred_name] > cut_off, 1, 0)
    else:
        df['pred_binary'] = df[y_pred_name]

    # Build the CM. The file only imports the `metrics` module, so the scorers
    # must be reached through it (the bare names raised NameError).
    cm = metrics.confusion_matrix(df[y_real_name], df['pred_binary'])

    ax = plt.subplot()
    sns.heatmap(cm, annot=True, ax=ax, fmt='g')

    # labels, title, ticks
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('Real labels')
    ax.set_title('Confusion Matrix')
    # NOTE(review): the tick order assumes the negative class sorts before the
    # positive one (e.g. 0 / 1) -- confirm against the label encoding in use.
    ax.xaxis.set_ticklabels([negative_label, positive_label])
    ax.yaxis.set_ticklabels([negative_label, positive_label])

    # Compute once and reuse for both the printout and the return value.
    accuracy = metrics.accuracy_score(df[y_real_name], df['pred_binary'])
    print('Test accuracy = ', accuracy)

    return accuracy

In [None]:
# Replace the values of a column with their 1-based rank among its sorted distinct values
def get_index(df, x):
    """Encode column ``x`` of ``df`` as 1-based ranks of its distinct values.

    The distinct non-missing values of the column are sorted, and every value
    is replaced by its position (starting at 1) in that sorted list.

    Args:
        df: DataFrame to modify. The column is overwritten in place.
        x: Name of the column to encode.

    Returns:
        The same ``df`` object, with column ``x`` replaced by the ranks.
        Missing values are left as NaN instead of raising.
    """
    # Rank only real values: NaN has no stable sort position and never
    # compares equal to itself, so it cannot be looked up reliably.
    distinct_sorted = sorted(df[x].dropna().unique())

    # One dict lookup per row instead of a linear list.index() scan per row.
    rank_of = {value: rank for rank, value in enumerate(distinct_sorted, start=1)}

    df[x] = df[x].map(rank_of)
    return df

## Test Train Split

In [None]:
# Names of the columns used as model features (placeholders to be replaced).
feature_cols = [
    'column1',
    'column2',
    'column3',
]

In [None]:
# Create X, and y.
# NOTE(review): `df` is not defined in this section -- presumably it is loaded
# elsewhere in the notebook; confirm it exists before running this cell.

X = df[feature_cols]  # feature subset: only the columns listed in feature_cols
y = df[['target']]  # list selector; assuming df is a pandas DataFrame, y stays a one-column frame

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X.copy(),  # features
    y.copy(),  # target
    random_state=42,  # random seed
    test_size=0.2,  # 20% of the rows go to the test set
)