<a href="https://colab.research.google.com/github/Laura-Neff/MultipleTypesOfRegression/blob/main/MultipleTypesOfRegression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [None]:
# Load the preprocessed diamonds data; index_col=0 makes the first CSV column the index.
# NOTE(review): in the preview output that first column is `clarity`, so it becomes the
# index and is not one of the feature columns used for training later — confirm intended.
data = pd.read_csv('diamonds_processed.csv', index_col=0)

# Preview the first rows; as the last expression of the cell it is rendered as output.
data.head()

Unnamed: 0_level_0,price,cut_Fair,cut_Good,cut_Ideal,cut_Premium,cut_Very Good,color_D,color_E,color_F,color_G,color_H,color_I,color_J,carat,depth,table,x,y,z
clarity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
5,2347,0,0,1,0,0,0,0,0,1,0,0,0,-0.426762,0.183806,-0.215603,-0.297549,-0.252463,-0.237666
3,17108,0,0,1,0,0,0,0,1,0,0,0,0,1.588656,0.183806,-1.102293,1.480844,1.486613,1.420743
5,1838,0,0,1,0,0,0,0,0,1,0,0,0,-0.636701,0.183806,-0.658948,-0.582092,-0.540852,-0.505151
3,3625,0,0,1,0,0,1,0,0,0,0,0,0,-0.111853,-0.020553,-1.102293,0.067022,0.088361,0.069942
3,5729,1,0,0,0,0,0,1,0,0,0,0,0,0.412995,3.589792,-1.545638,0.431592,0.376751,0.818901


In [None]:
from sklearn.linear_model import LinearRegression
def linear_model(x_train, y_train):
    """Fit a plain (unregularized) linear regression on the training data.

    Args:
        x_train: Feature matrix handed to ``LinearRegression.fit``.
        y_train: Target values handed to ``LinearRegression.fit``.

    Returns:
        The fitted ``LinearRegression`` estimator.
    """
    print("Linear Regression ")

    # ``fit`` hands back the estimator itself, so the fitted model can be
    # returned straight from the call.
    return LinearRegression().fit(x_train, y_train)

In [None]:
from sklearn.linear_model import Lasso

def lasso_model(x_train, y_train, alpha=0.8, max_iter=10000):
    """Fit a Lasso regression on the training data.

    The regularization settings are keyword parameters so that callers can
    tune them (e.g. via ``functools.partial``) without editing this function;
    the defaults reproduce the previously hard-coded values.

    Args:
        x_train: Feature matrix handed to ``Lasso.fit``.
        y_train: Target values handed to ``Lasso.fit``.
        alpha: Regularization strength, i.e. the constant that multiplies the
            regularization term. Defaults to 0.8.
        max_iter: Iteration cap passed to ``Lasso``. Defaults to 10000.

    Returns:
        The fitted ``Lasso`` estimator.
    """
    print("Lasso Regression")
    lasso_regression = Lasso(alpha=alpha, max_iter=max_iter)

    lasso_regression.fit(x_train, y_train)

    return lasso_regression

# alpha sets the strength of the regularization: it is the constant that multiplies the regularization (penalty) term.

In [None]:
from sklearn.linear_model import Ridge

def ridge_model(x_train, y_train, alpha=0.9):
    """Fit a Ridge regression on the training data.

    In ridge regression the regularization penalty is the sum of the squares
    of the regression coefficients. ``alpha`` is a keyword parameter so that
    callers can tune it (e.g. via ``functools.partial``); the default
    reproduces the previously hard-coded value.

    Args:
        x_train: Feature matrix handed to ``Ridge.fit``.
        y_train: Target values handed to ``Ridge.fit``.
        alpha: Regularization strength passed to ``Ridge``. Defaults to 0.9.

    Returns:
        The fitted ``Ridge`` estimator.
    """
    print("Ridge Regression")
    ridge_regression = Ridge(alpha=alpha)

    ridge_regression.fit(x_train, y_train)

    return ridge_regression

In [None]:
def build_and_train_model(data, target_name, reg_fn, test_size=0.2, random_state=0):
    """Split ``data``, train a model with ``reg_fn`` and report its scores.

    The split settings are keyword parameters so that other hold-out sizes or
    seeds can be tried without editing this function; the defaults reproduce
    the previously hard-coded values.

    Args:
        data: DataFrame holding the feature columns and the target column.
        target_name: Name of the column to predict; every other column is
            used as a feature.
        reg_fn: Callable ``reg_fn(x_train, y_train)`` returning a fitted model
            that provides ``score`` and ``predict``.
        test_size: Forwarded to ``train_test_split``. Defaults to 0.2.
        random_state: Seed forwarded to ``train_test_split``. Defaults to 0.

    Returns:
        dict with the fitted ``model``, the four split parts (``x_train``,
        ``x_test``, ``y_train``, ``y_test``) and the test predictions
        ``y_pred``.
    """
    X = data.drop(target_name, axis=1)
    Y = data[target_name]

    x_train, x_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state)

    model = reg_fn(x_train, y_train)

    # Training score comes from the model's own ``score`` method.
    score = model.score(x_train, y_train)
    print("Training Score : ", score)

    # Testing score is R^2 of the predictions on the held-out split.
    y_pred = model.predict(x_test)
    r_score = r2_score(y_test, y_pred)
    print("Testing Score : ", r_score)

    return {'model' : model,
            'x_train' : x_train, 'x_test' : x_test,
            'y_train' : y_train, 'y_test' : y_test,
            'y_pred' : y_pred
           }

In [None]:
# Baseline: train plain linear regression to predict "price"; prints the train/test scores.
linear_reg = build_and_train_model(data, "price", linear_model)

Linear Regression 
Training Score :  0.8769773439203701
Testing Score :  0.858678497083373


In [None]:
# Train the Lasso model on the same "price" target; prints the train/test scores.
lasso_reg = build_and_train_model(data, "price", lasso_model)

Lasso Regression
Training Score :  0.8769660880729163
Testing Score :  0.8588299001123165


In [None]:
# Train the Ridge model on the same "price" target; prints the train/test scores.
ridge_reg = build_and_train_model(data, "price", ridge_model)

# The regularized models score almost the same as plain linear regression here.
# Because the data we are working with is fairly simple, the original linear
# regression was not really overfitting, so regularization did not help much.

Ridge Regression
Training Score :  0.8769767455315431
Testing Score :  0.8587808694573578


In [None]:
from sklearn.linear_model import SGDRegressor

def sgd_model(x_train, y_train, max_iter=2000, random_state=0):
    """Fit a linear model with stochastic gradient descent on the training data.

    SGD is a stochastic optimizer, so without a seed the fitted coefficients
    and the reported scores differ slightly from run to run. The seed is fixed
    by default to keep the notebook reproducible; pass ``random_state=None``
    to get unseeded behavior.

    Args:
        x_train: Feature matrix handed to ``SGDRegressor.fit``.
        y_train: Target values handed to ``SGDRegressor.fit``.
        max_iter: Iteration cap passed to ``SGDRegressor``. Defaults to 2000.
        random_state: Seed passed to ``SGDRegressor``. Defaults to 0.

    Returns:
        The fitted ``SGDRegressor`` estimator.
    """
    print("SGD Regression")
    sgd_regression = SGDRegressor(max_iter=max_iter, random_state=random_state)

    sgd_regression.fit(x_train, y_train)

    return sgd_regression

In [None]:
# Train the SGD-based regressor on the same "price" target; prints the train/test scores.
sgd_reg = build_and_train_model(data, "price", sgd_model)

SGD Regression
Training Score :  0.8767292955773758
Testing Score :  0.8583285667274763
