In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.color_palette("tab10")
from scipy import stats
from sklearn.model_selection import train_test_split
import os
from sklearn.feature_selection import SelectKBest, RFE, f_regression, SequentialFeatureSelector
from sklearn.preprocessing import MinMaxScaler, StandardScaler, PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score
from sklearn.linear_model import LinearRegression, TweedieRegressor, LassoLars
from sklearn.metrics import explained_variance_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.feature_selection import SelectKBest, RFE, f_regression, SequentialFeatureSelector
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd
from sklearn.feature_selection import SelectKBest, RFE, f_regression
from sklearn.linear_model import LinearRegression
seed = 1349
target = 'quality'

In [2]:
def acquire():
    '''
    Obtains the vanilla version of both the red and white wine dataframe
    INPUT:
    NONE
    OUTPUT:
    red = pandas dataframe with red wine data
    white = pandas dataframe with white wine data
    '''
    red = pd.read_csv('https://query.data.world/s/k6viyg23e4usmgc2joiodhf2pvcvao?dws=00000')
    white = pd.read_csv('https://query.data.world/s/d5jg7efmkn3kq7cmrvvfkx2ww7epq7?dws=00000')
    return red, white

In [3]:
def prepare_mvp():
    '''
    Takes in the vanilla red and white wine dataframes and returns a cleaned version that is ready 
    for exploration and further analysis
    INPUT:
    NONE
    OUTPUT:
    wines = pandas dataframe with both red and white wine prepped for exploration
    '''
    red, white = acquire()
    red['is_red'] = 1
    white['is_red'] = 0
    wines = pd.concat([red, white], ignore_index = True)
    return wines

In [4]:
def split_wines(df):
    '''
    This function takes in a dataframe and splits it into 3 data sets
    Test is 20% of the original dataset, validate is .30*.80= 24% of the 
    original dataset, and train is .70*.80= 56% of the original dataset. 
    The function returns, in this order, train, validate and test dataframes. 
    '''
    #split_db class verision with random seed
    train_validate, test = train_test_split(df, test_size=0.2, 
                                            random_state=seed)
    train, validate = train_test_split(train_validate, test_size=0.3, 
                                       random_state=seed)
    return train, validate, test

In [5]:
def wrangle():
    '''
    Function that acquires, prepares, and splits the wines dataframe for use as well as 
    creating a csv.
    INPUT:
    NONE
    OUTPUT:
    .csv = ONLY IF FILE NONEXISTANT
    wines = pandas dataframe with both red and white wine prepped for exploration
    '''
    if os.path.exists('wines.csv'):
        wines = pd.read_csv('wines.csv', index_col=0)
        train, validate, test = split_wines(wines)
        return train, validate, test
    else:
        red, white = acquire()
        wines = pd.concat([red, white], ignore_index = True)
        wines.to_csv('wines.csv')
        train, validate, test = split_wines(wines)
        return train, validate, test
    

In [6]:
wines = prepare_mvp()
df = wines
wines

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,is_red
0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,1
1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,1
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,1
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,1
4,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6,0
6493,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5,0
6494,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6,0
6495,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7,0


In [7]:
wines

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,is_red
0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,1
1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,1
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,1
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,1
4,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6,0
6493,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5,0
6494,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6,0
6495,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7,0


In [8]:
wines.to_csv("wines.csv", index=False)

In [9]:
train, validate, test = wrangle()

In [10]:
scores = pd.DataFrame(columns=['model_name', 'features', 'scaling',
                               'RMSE_train', 'R2_train', 'RMSE_validate', 'R2_validate', 'RMSE_difference', 'R2_difference'])

In [11]:
def run_model(X_train, X_validate, scaling):
    
    '''
    general function to run models with X_train and X_validate that were scaled
    '''

    for f in features:
        for key in models:
            # create a model
            model = models[key]
            # fit the model
            model.fit(X_train[features[f]], y_train)
            # predictions of the train set
            y_hat_train = model.predict(X_train[features[f]])
            # predictions of the validate set
            y_hat_validate = model.predict(X_validate[features[f]])
            # add train set predictions to the data frame
            predictions_train[key] = y_hat_train
            # add validate set predictions to the data frame
            predictions_validate[key] = y_hat_validate

            # calculate scores train set
            RMSE, R2 = regression_errors(y_train, y_hat_train)
            # calculate scores validation set
            RMSE_val, R2_val = regression_errors(y_validate, y_hat_validate)
            diff = np.abs(RMSE - RMSE_val)
            # calculate R2 difference
            R2_diff = R2 - R2_val
            
            # add the score results to the scores Data Frame
            scores.loc[len(scores.index)] = [key, f, scaling, RMSE, R2, RMSE_val, R2_val, diff, R2_diff]
            

In [12]:
def select_kbest(X, y, k):
    '''
    the function accepts the X_train data set, y_train array and k-number of features to select
    runs the SelectKBest algorithm and returns the list of features to be selected for the modeling
    !KBest doesn't depend on the model
    '''
    kbest = SelectKBest(f_regression, k=k)
    kbest.fit(X, y)
    return X.columns[kbest.get_support()].tolist()

In [13]:
models = {
    'Linear Regression': LinearRegression(),
    'Generalized Linear Model': TweedieRegressor(power=2, alpha = 0.5),
    'Gradient Boosting Regression': GradientBoostingRegressor(random_state=seed),
    'Decision Tree Regression': DecisionTreeRegressor(max_depth=4, random_state=seed),
    'Random Forest Regression':RandomForestRegressor(max_depth=4, random_state=seed),
    'LassoLars Regression':LassoLars(alpha=0.5)
    }

In [14]:
def full_split_wines(train, validate, test, target):
    '''
    accepts train, validate, test data sets and the name of the target variable as a parameter
    splits the data frame into:
    X_train, X_validate, X_test, y_train, y_validate, y_test
    '''
    #train, validate, test = train_validate_test_split(df, target)

    #save target column
    y_train = train[target]
    y_validate = validate[target]
    y_test = test[target]

    #remove target column from the sets
    train.drop(columns = target, inplace=True)
    validate.drop(columns = target, inplace=True)
    test.drop(columns = target, inplace=True)

    return train, validate, test, y_train, y_validate, y_test

In [15]:
X_train, X_validate, X_test, y_train, y_validate, y_test = full_split_wines(train, validate, test, target)

In [16]:
def standard_scale_wines(train, validate, test):
    '''
    accepts train, validate, test data sets
    scales the data in each of them
    returns transformed data sets
    '''

    col = ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol']
    
    # create scalers
    scaler = StandardScaler()    
    #qt = QuantileTransformer(output_distribution='normal')
    scaler.fit(train[col])
    train[col] = scaler.transform(train[col])
    validate[col] = scaler.transform(validate[col])
    test[col] = scaler.transform(test[col])
    
    return train, validate, test

In [17]:
X1, X2, X3 = standard_scale_wines(X_train, X_validate, X_test)

In [18]:
def run_model_standard():
    # runs regression models on the X_train scaled with StandardScaler()
    X1, X2, _ = standard_scale_wines(X_train, X_validate, X_test)
    run_model(X1, X2, 'standard')

In [19]:
X_train.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
2112,0.146846,0.104621,0.092269,-0.844283,-0.870769,-0.196004,-1.077971,-1.066327,-0.054803,-1.351899,0.608591
802,-1.644581,1.457879,-2.185447,-0.781804,-0.352976,-0.951496,-0.525263,-0.691239,2.111036,2.739836,2.033866
3146,-0.398371,-0.256248,1.196617,-0.136192,-0.625499,0.32703,0.758446,-0.079254,-0.611733,-0.546968,-0.481326
5286,-0.943588,-1.098276,-0.321861,1.363294,-0.734508,0.501374,-0.311312,-0.237186,-0.42609,-1.284822,0.273232
1884,0.068958,-0.135958,-0.459904,0.363637,-0.025948,0.966292,1.150691,0.447184,-0.859258,-0.412812,-1.403563


In [20]:
f1 = ['volatile acidity', 'chlorides', 'density']
f2 = ['volatile acidity', 'chlorides']
f3 = ['volatile acidity', 'chlorides', 'density', 'alcohol']
f4 = ['volatile acidity', 'chlorides', 'density', 'alcohol', 'residual sugar']
f5 = ['volatile acidity', 'chlorides', 'density', 'residual sugar', 'density', 'fixed acidity']
f6 = ['volatile acidity', 'chlorides', 'density', 'residual sugar', 'density', 'fixed acidity', 'citric acid', 'residual sugar', 'free sulfur dioxide', 'total sulfur dioxide', 'pH', 'sulphates']

# create a dictionary with features
features = {
    'f1':f1,
    'f2':f2,
    'f3':f3,
    'f4':f4,
    'f5':f5,
    'f6':f6,
}

In [21]:
baseline = y_train.median()


predictions_train = pd.DataFrame(y_train)
predictions_validate = pd.DataFrame(y_validate)
predictions_train['baseline'] = baseline
predictions_validate['baseline'] = baseline

In [22]:
def regression_errors(y_actual, y_predicted):
    '''
    Calculates RMSE and R2 for regression models
    '''
    RMSE = np.sqrt(mean_squared_error(y_actual, y_predicted))
    R2 = r2_score(y_actual, y_predicted)
    return RMSE, R2

In [23]:
import warnings
warnings.filterwarnings("ignore")

In [24]:
run_model_standard()

In [25]:
scores

Unnamed: 0,model_name,features,scaling,RMSE_train,R2_train,RMSE_validate,R2_validate,RMSE_difference,R2_difference
0,Linear Regression,f1,standard,0.808903,0.127267,0.828247,0.12086,0.019344,0.006407
1,Generalized Linear Model,f1,standard,0.812672,0.119115,0.833317,0.110064,0.020645,0.009051
2,Gradient Boosting Regression,f1,standard,0.724326,0.300228,0.798467,0.182945,0.074141,0.117284
3,Decision Tree Regression,f1,standard,0.771725,0.205646,0.808852,0.161552,0.037127,0.044094
4,Random Forest Regression,f1,standard,0.761008,0.227557,0.799305,0.181228,0.038297,0.046328
5,LassoLars Regression,f1,standard,0.865876,0.0,0.883536,-0.000429,0.01766,0.000429
6,Linear Regression,f2,standard,0.832119,0.076452,0.85014,0.073769,0.018021,0.002683
7,Generalized Linear Model,f2,standard,0.834598,0.070941,0.853326,0.066814,0.018728,0.004127
8,Gradient Boosting Regression,f2,standard,0.766527,0.216311,0.833307,0.110086,0.06678,0.106224
9,Decision Tree Regression,f2,standard,0.79598,0.15493,0.834578,0.10737,0.038598,0.04756


In [26]:
def run_best_model():
    '''
    the function runs the best model on the train, test and validate data sets 
    and returns scores in the data frame
    '''
    # create a data frame for test set results
    predictions_test = pd.DataFrame(y_test)
    predictions_test['baseline'] = baseline

    f = f6
    poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)
    poly.fit(X1[f])

    # create a df with transformed features of the train set
    X1_poly = pd.DataFrame(
                poly.transform(X1[f]),
                columns=poly.get_feature_names(X1[f].columns),
                index=X1.index)
    X1_poly = pd.concat([X1_poly, X1.iloc[:, 2:]], axis=1)

    # create a df with transformed features for the validate set
    X2_poly = pd.DataFrame(
                poly.transform(X2[f]),
                columns=poly.get_feature_names(X2[f].columns),
                index=X2.index)
    X2_poly = pd.concat([X2_poly, X2.iloc[:, 2:]], axis=1)

    # create a df with transformed features for the validate set
    X2_poly = pd.DataFrame(
                poly.transform(X2[f]),
                columns=poly.get_feature_names(X2[f].columns),
                index=X2.index)
    X2_poly = pd.concat([X2_poly, X2.iloc[:, 2:]], axis=1)

    # create. df with transformed features for the test set
    X3_poly = pd.DataFrame(
                poly.transform(X3[f]),
                columns=poly.get_feature_names(X3[f].columns),
                index=X3.index)
    X3_poly = pd.concat([X3_poly, X3.iloc[:, 2:]], axis=1)

    # create a Gradient Boosting Regression model
    model = GradientBoostingRegressor()
    # fit the model
    model.fit(X1_poly, y_train)
    # predictions of the train set
    y_hat_train = model.predict(X1_poly)
    # predictions of the validate set
    y_hat_validate = model.predict(X2_poly)
    # add train set predictions to the data frame
    y_hat_test = model.predict(X3_poly)
    predictions_test['predictions'] = y_hat_test

    # calculate scores train set
    RMSE_train, R2_train = regression_errors(y_train, y_hat_train)
    # calculate scores validation set
    RMSE_val, R2_val = regression_errors(y_validate, y_hat_validate)
    # calculate scores test set
    RMSE_test, R2_test = regression_errors(y_test, y_hat_test)
    RMSE_bl, _ = regression_errors(y_test, predictions_test.baseline)
    
    # save final score into a dictionary
    res = {
        'Features': str(f),
        'RMSE Train Set': RMSE_train,
        'RMSE Validation Set':RMSE_val,
        'RMSE Test Set':RMSE_test,
        'R2 Train Set':R2_train,
        'R2 Validation Set':R2_val,
        'R2 Test':R2_test,
        'Beats a basline by:':str(f'{round((RMSE_bl - RMSE_test) / RMSE_bl * 100, 1)}%')
    }

    # add the score results to the scores Data Frame
    final_test = pd.DataFrame({'Gradient Bosting Regression': list(res.keys()), 'Scores': list(res.values())})

    return final_test

In [27]:
run_best_model()

Unnamed: 0,Gradient Bosting Regression,Scores
0,Features,"['volatile acidity', 'chlorides', 'density', '..."
1,RMSE Train Set,0.580315
2,RMSE Validation Set,0.683297
3,RMSE Test Set,0.690911
4,R2 Train Set,0.550824
5,R2 Validation Set,0.401648
6,R2 Test,0.384832
7,Beats a basline by:,22.9%


In [28]:
def run_single():
    # create a list ['bedrooms', 'bathrooms', 'sq_feet', 'lot_sqft', 'house_age']
    single_corr = X1.iloc[:, :-3].columns.tolist()

    # for every single feature in the list
    for f in single_corr:
        # create a linear regression model
        model = LinearRegression()
        # fit the model
        model.fit(X1[[f]], y_train)
        # predictions of the train set
        y_hat_train = model.predict(X1[[f]])
        # predictions of the validate set
        y_hat_validate = model.predict(X2[[f]])
        # add train set predictions to the data frame
        predictions_train['single'] = y_hat_train
        # add validate set predictions to the data frame
        predictions_validate['single'] = y_hat_validate

        # calculate scores train set
        RMSE, R2 = regression_errors(y_train, y_hat_train)
        # calculate scores validation set
        RMSE_val, R2_val = regression_errors(y_validate, y_hat_validate)
        diff = np.abs(RMSE - RMSE_val)
        # calculate R2 difference
        R2_diff = R2 - R2_val
            
        # add the score results to the scores Data Frame
        scores.loc[len(scores.index)] = ['Single Linear Regression', f, 'standard', RMSE, R2, RMSE_val, R2_val, diff, R2_diff]


In [29]:
def run_polynomial():

    
    for i in range(1,5):
        # features[f] gives an access to the list of features in the dictionary
        #length = len(features[f])
        # create a Polynomial feature transformer
        poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)
        poly.fit(X1.iloc[:, :i])
        # create a df with transformed features of the train set
        X1_poly = pd.DataFrame(
            poly.transform(X1.iloc[:, :i]),
            columns=poly.get_feature_names(X1.iloc[:, :i].columns),
            index=X1.index)
        X1_poly = pd.concat([X1_poly, X1.iloc[:, i:]], axis=1)
        #X1_poly = pd.concat([X1_poly, X1], axis=1)
        
        #display(X1_poly.head(1)) #testing the columns
        
        # create a df with transformed features for the validate set
        X2_poly = pd.DataFrame(
            poly.transform(X2.iloc[:, :i]),
            columns=poly.get_feature_names(X2.iloc[:, :i].columns),
            index=X2.index)
        X2_poly = pd.concat([X2_poly, X2.iloc[:, i:]], axis=1)
        #X2_poly = pd.concat([X2_poly, X2], axis=1)
                             
        feature_name = 'poly'+str(i)
        
        for key in models:
            # create a model
            model = models[key]
            # fit the model
            model.fit(X1_poly, y_train)
            # predictions of the train set
            y_hat_train = model.predict(X1_poly)
            # predictions of the validate set
            y_hat_validate = model.predict(X2_poly)
            # add train set predictions to the data frame
            predictions_train[key] = y_hat_train
            # add validate set predictions to the data frame
            predictions_validate[key] = y_hat_validate

            # calculate scores train set
            RMSE, R2 = regression_errors(y_train, y_hat_train)
            # calculate scores validation set
            RMSE_val, R2_val = regression_errors(y_validate, y_hat_validate)
            diff = np.abs(RMSE - RMSE_val)
            # calculate R2 difference
            R2_diff = R2 - R2_val
            # add the score results to the scores Data Frame
            scores.loc[len(scores.index)] = [key, feature_name, 'standard', RMSE, R2, RMSE_val, R2_val, diff, R2_diff]

In [30]:
def run_rfe():
    '''
    The function accepts the X_train data set, y_train array and k-number of features to select
    runs the RFE algorithm and returns the list of features to be selected for the modeling
    !RFE depends on the model.
    This function uses Linear regression
    '''
    # scale the data
    #X1, X2, _ = wr.standard_scale_zillow(X_train, X_validate, X_test)
    
    for key in models:
        # create a model
        model = models[key]
        
        # create a RFE feature selector
        rfe = RFE(model, n_features_to_select=4)
        rfe.fit(X1, y_train)
        
        # get the optimal features for every particular model
        f = X1.columns[rfe.get_support()].tolist()
        
        # fit the model with RFE features
        model.fit(X1[f], y_train)
        # predictions of the train set
        y_hat_train = model.predict(X1[f])
        # predictions of the validate set
        y_hat_validate = model.predict(X2[f])
        # add train set predictions to the data frame
        col_name = str(key)+'_rfe'
        predictions_train[col_name] = y_hat_train
        # add validate set predictions to the data frame
        predictions_validate[col_name] = y_hat_validate

        # calculate scores train set
        RMSE, R2 = regression_errors(y_train, y_hat_train)
        # calculate scores validation set
        RMSE_val, R2_val = regression_errors(y_validate, y_hat_validate)
        diff = np.abs(RMSE - RMSE_val)
        # calculate R2 difference
        R2_diff = R2 - R2_val
        # add the score results to the scores Data Frame
        scores.loc[len(scores.index)] = [key, 'rfe', 'standard', RMSE, R2, RMSE_val, R2_val, diff, R2_diff]


In [31]:
def scale_wines_quantile(train, validate, test):
    '''
    accepts train, validate, test data sets
    scales the data in each of them
    returns transformed data sets
    '''
    #count_columns = ['bedroomcnt', 'bathroomcnt']
    
    #col = train.columns[1:-1]
    col = ['volatile acidity', 'chlorides', 'density', 'residual sugar', 'density', 'fixed acidity']
    
    # create scalers
    #min_max_scaler = MinMaxScaler()    
    qt = QuantileTransformer(output_distribution='normal')
    qt.fit(train[col])
    train[col] = qt.transform(train[col])
    validate[col] = qt.transform(validate[col])
    test[col] = qt.transform(test[col])
    
    return train, validate, test

In [32]:
def run_model_quantile():
    XQ1, XQ2, _ = scale_wines_quantile(X_train, X_validate, X_test)
    run_model(XQ1, XQ2, 'quantile')

In [33]:
def run_all_models():
    '''
    the function runs all models and saves the results to csv file
    '''
    run_model_standard()
    run_model_quantile()
    run_rfe()
    run_polynomial()
    run_single()
    scores.to_csv('regression_results.csv')


In [34]:
scores

Unnamed: 0,model_name,features,scaling,RMSE_train,R2_train,RMSE_validate,R2_validate,RMSE_difference,R2_difference
0,Linear Regression,f1,standard,0.808903,0.127267,0.828247,0.12086,0.019344,0.006407
1,Generalized Linear Model,f1,standard,0.812672,0.119115,0.833317,0.110064,0.020645,0.009051
2,Gradient Boosting Regression,f1,standard,0.724326,0.300228,0.798467,0.182945,0.074141,0.117284
3,Decision Tree Regression,f1,standard,0.771725,0.205646,0.808852,0.161552,0.037127,0.044094
4,Random Forest Regression,f1,standard,0.761008,0.227557,0.799305,0.181228,0.038297,0.046328
5,LassoLars Regression,f1,standard,0.865876,0.0,0.883536,-0.000429,0.01766,0.000429
6,Linear Regression,f2,standard,0.832119,0.076452,0.85014,0.073769,0.018021,0.002683
7,Generalized Linear Model,f2,standard,0.834598,0.070941,0.853326,0.066814,0.018728,0.004127
8,Gradient Boosting Regression,f2,standard,0.766527,0.216311,0.833307,0.110086,0.06678,0.106224
9,Decision Tree Regression,f2,standard,0.79598,0.15493,0.834578,0.10737,0.038598,0.04756


In [35]:
run_all_models()

In [36]:
scores.sort_values(by='model_name')

Unnamed: 0,model_name,features,scaling,RMSE_train,R2_train,RMSE_validate,R2_validate,RMSE_difference,R2_difference
27,Decision Tree Regression,f5,standard,0.759399,0.230818,0.799444,0.180943,0.040045,0.049875
69,Decision Tree Regression,f6,standard,0.737690,0.274168,0.778681,0.222935,0.040992,0.051233
123,Decision Tree Regression,poly2,standard,0.709980,0.327672,0.752327,0.274644,0.042346,0.053027
75,Decision Tree Regression,f1,quantile,0.771725,0.205646,0.808852,0.161552,0.037127,0.044094
39,Decision Tree Regression,f1,standard,0.771725,0.205646,0.808852,0.161552,0.037127,0.044094
...,...,...,...,...,...,...,...,...,...
140,Single Linear Regression,citric acid,standard,0.863552,0.005361,0.879839,0.007926,0.016287,-0.002565
141,Single Linear Regression,residual sugar,standard,0.865738,0.000318,0.884020,-0.001525,0.018282,0.001843
142,Single Linear Regression,chlorides,standard,0.833100,0.074273,0.850957,0.071989,0.017857,0.002284
143,Single Linear Regression,free sulfur dioxide,standard,0.865218,0.001519,0.881946,0.003168,0.016728,-0.001649


In [37]:
scores

Unnamed: 0,model_name,features,scaling,RMSE_train,R2_train,RMSE_validate,R2_validate,RMSE_difference,R2_difference
0,Linear Regression,f1,standard,0.808903,0.127267,0.828247,0.120860,0.019344,0.006407
1,Generalized Linear Model,f1,standard,0.812672,0.119115,0.833317,0.110064,0.020645,0.009051
2,Gradient Boosting Regression,f1,standard,0.724326,0.300228,0.798467,0.182945,0.074141,0.117284
3,Decision Tree Regression,f1,standard,0.771725,0.205646,0.808852,0.161552,0.037127,0.044094
4,Random Forest Regression,f1,standard,0.761008,0.227557,0.799305,0.181228,0.038297,0.046328
...,...,...,...,...,...,...,...,...,...
141,Single Linear Regression,residual sugar,standard,0.865738,0.000318,0.884020,-0.001525,0.018282,0.001843
142,Single Linear Regression,chlorides,standard,0.833100,0.074273,0.850957,0.071989,0.017857,0.002284
143,Single Linear Regression,free sulfur dioxide,standard,0.865218,0.001519,0.881946,0.003168,0.016728,-0.001649
144,Single Linear Regression,total sulfur dioxide,standard,0.864734,0.002635,0.883630,-0.000641,0.018895,0.003276


In [38]:
scores.to_csv("my_scores_garrett.csv", index=False)

In [39]:
def select_best_model_R2(scores):
    # select top 20 models based on the RMSE score of the train set
    top_20 = scores.sort_values(by='R2_train').head(20)
    # select top 5 models based on the RMSE score of the validate set
    top_5 = top_20.sort_values(by=['R2_validate']).head(5)
    # display top 5 models
    display(top_5)
    # select the best model with the smallest difference in the RMSE scores
    best_model = top_5.sort_values(by='R2_difference').head(1)
    return best_model


In [40]:
select_best_model_R2(scores)

Unnamed: 0,model_name,features,scaling,RMSE_train,R2_train,RMSE_validate,R2_validate,RMSE_difference,R2_difference
101,LassoLars Regression,f5,quantile,0.865876,0.0,0.883536,-0.000429,0.01766,0.000429
125,LassoLars Regression,poly2,standard,0.865876,0.0,0.883536,-0.000429,0.01766,0.000429
41,LassoLars Regression,f1,standard,0.865876,0.0,0.883536,-0.000429,0.01766,0.000429
113,LassoLars Regression,rfe,standard,0.865876,0.0,0.883536,-0.000429,0.01766,0.000429
17,LassoLars Regression,f3,standard,0.865876,0.0,0.883536,-0.000429,0.01766,0.000429


Unnamed: 0,model_name,features,scaling,RMSE_train,R2_train,RMSE_validate,R2_validate,RMSE_difference,R2_difference
101,LassoLars Regression,f5,quantile,0.865876,0.0,0.883536,-0.000429,0.01766,0.000429


In [41]:
def select_best_model_RMSE(scores):
    # select top 20 models based on the RMSE score of the train set
    top_20 = scores.sort_values(by='RMSE_train').head(20)
    # select top 5 models based on the RMSE score of the validate set
    top_5 = top_20.sort_values(by=['RMSE_validate']).head(5)
    # display top 5 models
    display(top_5)
    # select the best model with the smallest difference in the RMSE scores
    best_model = top_5.sort_values(by='RMSE_difference').head(1)
    return best_model


In [42]:
select_best_model_RMSE(scores)

Unnamed: 0,model_name,features,scaling,RMSE_train,R2_train,RMSE_validate,R2_validate,RMSE_difference,R2_difference
134,Gradient Boosting Regression,poly4,standard,0.605767,0.510559,0.691042,0.388006,0.085275,0.122554
128,Gradient Boosting Regression,poly3,standard,0.609096,0.505166,0.692394,0.38561,0.083298,0.119556
116,Gradient Boosting Regression,poly1,standard,0.614517,0.496318,0.693864,0.382997,0.079347,0.113321
122,Gradient Boosting Regression,poly2,standard,0.611938,0.500536,0.694786,0.381357,0.082848,0.119179
104,Gradient Boosting Regression,f6,quantile,0.621335,0.485079,0.704842,0.36332,0.083506,0.121759


Unnamed: 0,model_name,features,scaling,RMSE_train,R2_train,RMSE_validate,R2_validate,RMSE_difference,R2_difference
116,Gradient Boosting Regression,poly1,standard,0.614517,0.496318,0.693864,0.382997,0.079347,0.113321


In [43]:
def run_best_model():
    '''
    the function runs the best model on the train, test and validate data sets 
    and returns scores in the data frame
    '''
    # create a data frame for test set results
    predictions_test = pd.DataFrame(y_test)
    predictions_test['baseline'] = baseline

    f = f6
    poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)
    poly.fit(X1[f])

    # create a df with transformed features of the train set
    X1_poly = pd.DataFrame(
                poly.transform(X1[f]),
                columns=poly.get_feature_names(X1[f].columns),
                index=X1.index)
    X1_poly = pd.concat([X1_poly, X1.iloc[:, 2:]], axis=1)

    # create a df with transformed features for the validate set
    X2_poly = pd.DataFrame(
                poly.transform(X2[f]),
                columns=poly.get_feature_names(X2[f].columns),
                index=X2.index)
    X2_poly = pd.concat([X2_poly, X2.iloc[:, 2:]], axis=1)

    # create a df with transformed features for the validate set
    X2_poly = pd.DataFrame(
                poly.transform(X2[f]),
                columns=poly.get_feature_names(X2[f].columns),
                index=X2.index)
    X2_poly = pd.concat([X2_poly, X2.iloc[:, 2:]], axis=1)

    # create. df with transformed features for the test set
    X3_poly = pd.DataFrame(
                poly.transform(X3[f]),
                columns=poly.get_feature_names(X3[f].columns),
                index=X3.index)
    X3_poly = pd.concat([X3_poly, X3.iloc[:, 2:]], axis=1)

    # create a Gradient Boosting Regression model
    model = GradientBoostingRegressor()
    # fit the model
    model.fit(X1_poly, y_train)
    # predictions of the train set
    y_hat_train = model.predict(X1_poly)
    # predictions of the validate set
    y_hat_validate = model.predict(X2_poly)
    # add train set predictions to the data frame
    y_hat_test = model.predict(X3_poly)
    predictions_test['predictions'] = y_hat_test

    # calculate scores train set
    RMSE_train, R2_train = regression_errors(y_train, y_hat_train)
    # calculate scores validation set
    RMSE_val, R2_val = regression_errors(y_validate, y_hat_validate)
    # calculate scores test set
    RMSE_test, R2_test = regression_errors(y_test, y_hat_test)
    RMSE_bl, _ = regression_errors(y_test, predictions_test.baseline)
    
    # save final score into a dictionary
    res = {
        'Features': str(f),
        'RMSE Train Set': RMSE_train,
        'RMSE Validation Set':RMSE_val,
        'RMSE Test Set':RMSE_test,
        'R2 Train Set':R2_train,
        'R2 Validation Set':R2_val,
        'R2 Test':R2_test,
        'Beats a basline by:':str(f'{round((RMSE_bl - RMSE_test) / RMSE_bl * 100, 1)}%')
    }

    # add the score results to the scores Data Frame
    final_test = pd.DataFrame({'Gradient Bosting Regression': list(res.keys()), 'Scores': list(res.values())})

    return final_test

In [44]:
run_best_model()

Unnamed: 0,Gradient Bosting Regression,Scores
0,Features,"['volatile acidity', 'chlorides', 'density', '..."
1,RMSE Train Set,0.580226
2,RMSE Validation Set,0.683442
3,RMSE Test Set,0.693532
4,R2 Train Set,0.550961
5,R2 Validation Set,0.401393
6,R2 Test,0.380155
7,Beats a basline by:,22.6%


In [45]:
train.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
2112,0.441766,0.443149,0.092269,-1.169401,-1.976943,-0.196004,-1.077971,-0.969427,-0.054803,-1.351899,0.608591
802,-2.307607,1.236652,-2.185447,-0.817902,-0.243255,-0.951496,-0.525263,-0.603946,2.111036,2.739836,2.033866
3146,-0.328223,0.03388,1.196617,0.22263,-1.027247,0.32703,0.758446,-0.109365,-0.611733,-0.546968,-0.481326
5286,-1.225943,-1.606755,-0.321861,1.125776,-1.425011,0.501374,-0.311312,-0.202098,-0.42609,-1.284822,0.273232
1884,0.341493,0.18931,-0.459904,0.533636,0.440383,0.966292,1.150691,0.374936,-0.859258,-0.412812,-1.403563


In [48]:
def general_regressor_model():
    '''
    the function runs the best model on the train, test and validate data sets 
    and returns scores in the data frame
    '''
    # create a data frame for test set results
    predictions_test = pd.DataFrame(y_test)
    predictions_test['baseline'] = baseline

    f = f6
    poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)
    poly.fit(X1[f])

    # create a df with transformed features of the train set
    X1_poly = pd.DataFrame(
                poly.transform(X1[f]),
                columns=poly.get_feature_names(X1[f].columns),
                index=X1.index)
    X1_poly = pd.concat([X1_poly, X1.iloc[:, 2:]], axis=1)

    # create a df with transformed features for the validate set
    X2_poly = pd.DataFrame(
                poly.transform(X2[f]),
                columns=poly.get_feature_names(X2[f].columns),
                index=X2.index)
    X2_poly = pd.concat([X2_poly, X2.iloc[:, 2:]], axis=1)

    # create a df with transformed features for the validate set
    X2_poly = pd.DataFrame(
                poly.transform(X2[f]),
                columns=poly.get_feature_names(X2[f].columns),
                index=X2.index)
    X2_poly = pd.concat([X2_poly, X2.iloc[:, 2:]], axis=1)

    # create. df with transformed features for the test set
    X3_poly = pd.DataFrame(
                poly.transform(X3[f]),
                columns=poly.get_feature_names(X3[f].columns),
                index=X3.index)
    X3_poly = pd.concat([X3_poly, X3.iloc[:, 2:]], axis=1)

    # create a Gradient Boosting Regression model
    model = TweedieRegressor()
    # fit the model
    model.fit(X1_poly, y_train)
    # predictions of the train set
    y_hat_train = model.predict(X1_poly)
    # predictions of the validate set
    y_hat_validate = model.predict(X2_poly)
    # add train set predictions to the data frame
    y_hat_test = model.predict(X3_poly)
    predictions_test['predictions'] = y_hat_test

    # calculate scores train set
    RMSE_train, R2_train = regression_errors(y_train, y_hat_train)
    # calculate scores validation set
    RMSE_val, R2_val = regression_errors(y_validate, y_hat_validate)
    # calculate scores test set
    RMSE_test, R2_test = regression_errors(y_test, y_hat_test)
    RMSE_bl, _ = regression_errors(y_test, predictions_test.baseline)
    
    # save final score into a dictionary
    res = {
        'Features': str(f),
        'RMSE Train Set': RMSE_train,
        'RMSE Validation Set':RMSE_val,
        'RMSE Test Set':RMSE_test,
        'R2 Train Set':R2_train,
        'R2 Validation Set':R2_val,
        'R2 Test':R2_test,
        'Beats a basline by:':str(f'{round((RMSE_bl - RMSE_test) / RMSE_bl * 100, 1)}%')
    }

    # add the score results to the scores Data Frame
    final_test = pd.DataFrame({'General Regressor Model': list(res.keys()), 'Scores': list(res.values())})

    return final_test

In [49]:
general_regressor_model()

Unnamed: 0,General Regressor Model,Scores
0,Features,"['volatile acidity', 'chlorides', 'density', '..."
1,RMSE Train Set,0.726816
2,RMSE Validation Set,0.751841
3,RMSE Test Set,0.740572
4,R2 Train Set,0.295409
5,R2 Validation Set,0.275581
6,R2 Test,0.293219
7,Beats a basline by:,17.4%
