# List of Defined Functions

## Pre-Modelling Tests

In [3]:
# Creates an annotated heatmap of the absolute pairwise correlations in a dataframe

def heatmap(df):
    """Plot the absolute pairwise correlations of ``df`` as an annotated heatmap.

    A new matplotlib figure with ``figsize=(10, 8)`` is opened first, then
    ``df.corr().abs()`` is drawn with seaborn, with each coefficient written
    inside its cell (``annot=True``).

    Parameters
    ----------
    df : DataFrame
        Data whose column-to-column correlations are plotted.

    Returns
    -------
    None
    """
    plt.figure(figsize=(10, 8))

    # Absolute values: only the strength of each correlation is shown,
    # not its sign.
    abs_corr = df.corr().abs()
    sns.heatmap(abs_corr, annot=True)

In [2]:
# Evaluates the Variance Inflation Factor (VIF) of each X_train variable

def vif(X_train):
    """Print a table of Variance Inflation Factor scores for ``X_train``.

    Each column of ``X_train`` is scored with statsmodels'
    ``variance_inflation_factor`` against the full value matrix, and the
    scores are printed next to the column labels.

    Parameters
    ----------
    X_train : DataFrame
        Feature matrix; ``X_train.values`` is handed to statsmodels.

    Returns
    -------
    None
        The table is printed, not returned.
    """
    from statsmodels.stats.outliers_influence import variance_inflation_factor

    design = X_train.values
    n_features = len(X_train.columns)

    # One VIF score per column position, in column order.
    scores = pd.DataFrame()
    scores["VIF"] = [
        variance_inflation_factor(design, position)
        for position in range(n_features)
    ]

    # Attach the column labels so each score can be read against its feature.
    scores["features"] = X_train.columns

    print(scores)

## Post-Modelling Tests

In [4]:
def qq(x_test, x_train, y_test, y_train):
    """Draw a Q-Q plot of the test-set residuals of a linear regression.

    A ``LinearRegression`` is fitted on the training split, the test split is
    predicted, and the residuals (``y_test`` minus predictions) are passed to
    ``statsmodels.api.qqplot`` with ``line='r'``. Q-Q plots are a common
    visual check for normality.

    Parameters
    ----------
    x_test, x_train : array-like
        Feature matrices for the test and training splits.
    y_test, y_train : array-like
        Targets for the test and training splits.

    Returns
    -------
    None
    """
    import statsmodels.api as sm
    from sklearn.linear_model import LinearRegression

    # Fit on the training data only; residuals are measured on held-out data.
    fitted = LinearRegression().fit(x_train, y_train)
    residuals = y_test - fitted.predict(x_test)

    sm.qqplot(residuals, line='r')

In [6]:
def sk_metrics(y, model):
    """Print R2, MAE, MSE and RMSE for a set of predictions (scikit-learn).

    Parameters
    ----------
    y : array-like
        True target values.
    model : array-like
        Predicted target values. Despite the name this is not an estimator:
        it is passed straight to the metric functions as ``y_pred``.

    Returns
    -------
    None
        The metrics are printed, not returned.
    """
    from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

    # ``mean_squared_error(..., squared=False)`` was deprecated in
    # scikit-learn 1.4 in favour of ``root_mean_squared_error`` and later
    # removed. Prefer the dedicated function and fall back to the old keyword
    # only on versions that do not ship it.
    try:
        from sklearn.metrics import root_mean_squared_error
    except ImportError:
        def root_mean_squared_error(y_true, y_pred):
            return mean_squared_error(y_true, y_pred, squared=False)

    print("Metrics:")
    # R2
    print(f"R2: {r2_score(y, model):.3f}")
    # MAE
    print(f"Mean Absolute Error: {mean_absolute_error(y, model):.3f}")
    # MSE
    print(f"Mean Squared Error: {mean_squared_error(y, model):.3f}")
    # RMSE
    print(f"Root Mean Squared Error: {root_mean_squared_error(y, model):.3f}")

In [1]:
def sm_metrics(model, y, x):
    """Print MAE, MSE and RMSE for a fitted model's predictions on ``x``.

    Parameters
    ----------
    model : object
        Fitted model exposing ``predict(x)``.
    y : array-like
        True target values to compare the predictions against.
    x : array-like
        Feature data handed to ``model.predict``.

    Returns
    -------
    None
        The metrics are printed, not returned.
    """
    from statsmodels.tools.eval_measures import meanabs, mse, rmse

    # Predict from the ``x`` argument. The previous code referenced ``X``,
    # which is not a parameter, so it either raised NameError or silently
    # used an unrelated notebook-level variable.
    ypred = model.predict(x)

    print('Metrics:')
    # MAE
    print(f"Mean Absolute Error: {meanabs(y, ypred):.3f}")
    # MSE
    print(f"Mean Squared Error: {mse(y, ypred):.3f}")
    # RMSE
    print(f"Root Mean Squared Error: {rmse(y, ypred):.3f}")

In [2]:
def lr_model_metrics(x_test, x_train, y_test, y_train, cat_vars):
    """Fit a linear regression on scaled and one-hot-encoded features and print metrics.

    Columns listed in ``cat_vars`` are one-hot encoded; all remaining columns
    are standardised. Both transformers are fitted on the training split only
    and then applied to the test split. A ``LinearRegression`` is fitted on
    the combined training features and ``sk_metrics`` is printed for the
    train and test predictions.

    Parameters
    ----------
    x_test, x_train : DataFrame
        Feature frames for the test and training splits.
    y_test, y_train : array-like
        Targets for the test and training splits.
    cat_vars : list
        Labels of the categorical columns in ``x_train`` / ``x_test``.

    Returns
    -------
    None
        The metrics are printed, not returned.
    """
    from sklearn.linear_model import LinearRegression
    from sklearn.preprocessing import OneHotEncoder, StandardScaler

    # Scale the non-categorical columns (scaler fitted on train only).
    x_train_num = x_train.drop(cat_vars, axis=1)
    x_test_num = x_test.drop(cat_vars, axis=1)
    ss = StandardScaler()
    ss.fit(x_train_num)
    x_train_scale = pd.DataFrame(ss.transform(x_train_num))
    x_test_scale = pd.DataFrame(ss.transform(x_test_num))

    # One-hot encode the categorical columns. The dense-output keyword was
    # renamed from ``sparse`` to ``sparse_output`` in scikit-learn 1.2 and the
    # old name was later removed, so try the new spelling first.
    try:
        onehot = OneHotEncoder(sparse_output=False)
    except TypeError:
        onehot = OneHotEncoder(sparse=False)
    x_train_cat = pd.DataFrame(onehot.fit_transform(x_train[cat_vars]))
    x_test_cat = pd.DataFrame(onehot.transform(x_test[cat_vars]))

    # ``get_feature_names`` was replaced by ``get_feature_names_out``; use
    # whichever the installed version provides.
    if hasattr(onehot, "get_feature_names_out"):
        cat_names = onehot.get_feature_names_out(cat_vars)
    else:
        cat_names = onehot.get_feature_names(cat_vars)
    x_train_cat.columns = cat_names
    x_test_cat.columns = cat_names

    # Combine dummied cat vars with scaled vars. Both frames carry a fresh
    # default index, so the join lines rows up by position.
    x_train_df = x_train_cat.join(x_train_scale)
    x_test_df = x_test_cat.join(x_test_scale)

    # Fit on the raw values, matching how predictions are made below. The
    # combined frame mixes string and integer column labels, which newer
    # scikit-learn rejects when a DataFrame is passed to ``fit``.
    lr = LinearRegression()
    model = lr.fit(x_train_df.values, y_train)

    print('Train Data')
    sk_metrics(y_train, model.predict(x_train_df.values))

    print('Test Data')
    sk_metrics(y_test, model.predict(x_test_df.values))