# Diamonds

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import linear_model, metrics
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.linear_model import Lasso, Ridge, LassoCV, RidgeCV, LinearRegression

from models import *

%matplotlib inline

In [2]:
# Load the diamonds data; the CSV column named 'Unnamed: 0' becomes the row index.
df = pd.read_csv('diamonds.csv', index_col = 'Unnamed: 0')

In [3]:
# Start the model-comparison table that is passed to every regression() call below.
# NOTE(review): at this point eval_df presumably comes from the star-import of `models` — confirm.
evaluation = eval_df()

In [4]:
# Baseline: regtype 'Multivariate' on the six numeric columns with 'price' as target.
# regression() is not defined in this notebook (presumably from `models`); the recorded
# output shows `evaluation` gaining one row for this call.
regression(df, ['carat','x','y','z','depth','table'], 'price', evaluation, regtype = 'Multivariate')

Unnamed: 0,Model,Power,Hyper Parameters,Root Mean Squared Error (RMSE),R-squared (training),Adjusted R-squared (training),R-squared (test),Adjusted R-squared (test),5-Fold Cross Validation
0,Multivariate Linear Regression,1.0,,1486.771,0.859,0.859,0.859,0.859,0.854


In [5]:
# Same features and target with regtype 'Lasso'.
# NOTE(review): the recorded output shows an interactive prompt for the iteration count,
# so this result depends on what is typed and is not reproducible from code alone.
regression(df, ['carat','x','y','z','depth','table'], 'price', evaluation, regtype = 'Lasso')

How many interations? 100000


Unnamed: 0,Model,Power,Hyper Parameters,Root Mean Squared Error (RMSE),R-squared (training),Adjusted R-squared (training),R-squared (test),Adjusted R-squared (test),5-Fold Cross Validation
0,Multivariate Linear Regression,1.0,,1486.771,0.859,0.859,0.859,0.859,0.854
1,Lasso Linear Regression,1.0,Iterations: 100000 and Alpha: 3.685019951033...,1488.173,0.859,0.859,0.859,0.859,0.859


In [6]:
# Same features and target with poly_deg = 2.
# The recorded output for this row shows a 5-fold CV score of -42.203 against a test
# R-squared of 0.774 — worth investigating before trusting this model.
regression(df, ['carat','x','y','z','depth','table'], 'price', evaluation, regtype = 'Multivariate', poly_deg = 2)

Unnamed: 0,Model,Power,Hyper Parameters,Root Mean Squared Error (RMSE),R-squared (training),Adjusted R-squared (training),R-squared (test),Adjusted R-squared (test),5-Fold Cross Validation
0,Multivariate Linear Regression,1.0,,1486.771,0.859,0.859,0.859,0.859,0.854
1,Lasso Linear Regression,1.0,Iterations: 100000 and Alpha: 3.685019951033...,1488.173,0.859,0.859,0.859,0.859,0.859
2,Multivariate Polynomial Regression,2.0,,1886.163,0.876,0.876,0.774,0.773,-42.203


In [7]:
# Same features and target with regtype 'Ridge'.
# NOTE(review): the recorded output shows the candidate alpha values being typed in at a
# prompt, so this result is not reproducible from code alone.
regression(df, ['carat','x','y','z','depth','table'], 'price', evaluation, regtype = 'Ridge')

Enter alpha values seperated by spaces 0.1 0.2 0.3 0.4 0.7 0.8 0.9 1.3 1.6


Unnamed: 0,Model,Power,Hyper Parameters,Root Mean Squared Error (RMSE),R-squared (training),Adjusted R-squared (training),R-squared (test),Adjusted R-squared (test),5-Fold Cross Validation
0,Multivariate Linear Regression,1.0,,1486.771,0.859,0.859,0.859,0.859,0.854
1,Lasso Linear Regression,1.0,Iterations: 100000 and Alpha: 3.685019951033...,1488.173,0.859,0.859,0.859,0.859,0.859
2,Multivariate Polynomial Regression,2.0,,1886.163,0.876,0.876,0.774,0.773,-42.203
3,Ridge Linear Regression,1.0,Alpha: 1.6,1486.8,0.859,0.859,0.859,0.859,0.855


In [8]:
# Display the accumulated comparison table (one row per model fitted above).
evaluation

Unnamed: 0,Model,Power,Hyper Parameters,Root Mean Squared Error (RMSE),R-squared (training),Adjusted R-squared (training),R-squared (test),Adjusted R-squared (test),5-Fold Cross Validation
0,Multivariate Linear Regression,1.0,,1486.771,0.859,0.859,0.859,0.859,0.854
1,Lasso Linear Regression,1.0,Iterations: 100000 and Alpha: 3.685019951033...,1488.173,0.859,0.859,0.859,0.859,0.859
2,Multivariate Polynomial Regression,2.0,,1886.163,0.876,0.876,0.774,0.773,-42.203
3,Ridge Linear Regression,1.0,Alpha: 1.6,1486.8,0.859,0.859,0.859,0.859,0.855


In [None]:
# Preview the first rows of df.
df.head()

In [None]:
# NOTE(review): the column names below (life_expectancy, gdp, bmi, ...) are not among the
# diamonds columns used earlier in this notebook; presumably `df` is expected to hold a
# different dataset here — confirm where it is loaded.
# Drops every row containing a missing value and rebinds `df`, so the original frame is lost.
df = df.dropna()
# Reduced feature set used by the "five feature" models below.
con_features = ['bmi','hiv','school', 'death_ratio', 'gdp']
# X (upper case): only the reduced feature set.
X = df.loc[:, con_features]
# x (lower case): the wider feature set; the two names differ only by case.
x = df.loc[:, ['alcohol', 'hepatitisB', 'measles', 'bmi',
       'polio', 'diphtheria', 'hiv', 'total_expenditure', 'gdp',
       'thinness_till19', 'inf_death', 'school', 'population', 'pop_size', 'death_ratio']]
# Regression target.
y = df.loc[:, 'life_expectancy']

In [None]:
# 80/20 split of the reduced feature set (upper-case X_*).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.20, random_state= 42)
# 80/20 split of the wider feature set (lower-case x_*). y_train / y_test are rebound here;
# both calls share test_size and random_state, so presumably the same rows land in each
# split — verify if the two feature frames ever differ in length.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size= 0.20, random_state= 42)

In [None]:
#set up the model comparison dataframe

def eval_df():
    """
    Build the empty model-comparison table.

    Returns a DataFrame with no rows and one column per reported value:
    model name, polynomial power, RMSE, plain and adjusted R-squared for the
    training and test sets, and the 5-fold cross-validation score.
    """
    metric_columns = [
        'Model',
        'Power',
        'Root Mean Squared Error (RMSE)',
        'R-squared (training)',
        'Adjusted R-squared (training)',
        'R-squared (test)',
        'Adjusted R-squared (test)',
        '5-Fold Cross Validation',
    ]
    # One empty list per column keeps the column order shown above.
    return pd.DataFrame({column: [] for column in metric_columns})




For the simple (deliberately weak) baseline linear regression model we used carat, as it had the highest Pearson correlation coefficient.

In [None]:
def adjustedR2(r2,n,p):
    """
    Return the adjusted R-squared for a fit.

    r2 is the plain R-squared, n the number of observations and p the number
    of predictors. The unexplained share (1 - r2) is scaled by
    (n - 1) / (n - p - 1) and subtracted from 1.
    """
    dof_ratio = (n - 1) / (n - p - 1)
    unexplained = 1 - r2
    return 1 - unexplained * dof_ratio

In [None]:
def train_test(data, features, target, testsize=0.25):
    """
    Split a dataframe into training and test features/target.

    Parameters
    ----------
    data : DataFrame
        Frame containing both the feature columns and the target column.
    features : list of column labels
        Columns selected as predictors; assumed to be continuous.
    target : column label
        Column selected as the response.
    testsize : float
        Passed to train_test_split as test_size (default 0.25).

    Returns
    -------
    X_train, X_test, y_train, y_test, in that order.
    """
    # Select from the frame the caller passed in, not from a notebook-global frame,
    # so the helper works for whatever dataset it is given.
    X = data.loc[:, features]
    y = data.loc[:, target]
    # The seed is fixed so repeated calls produce the same partition.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= testsize, random_state= 42)
    return X_train, X_test, y_train, y_test
    
    

In [None]:
def regress_df(data, features, target, regtype= ['Multiple', 'Polynomial', 'Lasso', 'Ridge'], testsize=0.25, poly_deg = 3, standard = True):
    """
    Fit an ordinary least-squares regression and return a one-row evaluation table.

    The data is split with train_test(), optionally standardised, optionally
    expanded with polynomial terms, and a LinearRegression is fitted on the
    training part. Summary values are printed as a side effect.

    Parameters
    ----------
    data : DataFrame
        Frame containing the feature columns and the target column.
    features : list of column labels
        Predictor columns; assumed to be continuous.
    target : column label
        Response column.
    regtype : container of str
        Only the presence of 'Polynomial' is acted on (it enables the
        polynomial expansion). 'Multiple', 'Lasso' and 'Ridge' are accepted
        but have no effect: the estimator is always LinearRegression.
        The default list is only read, never mutated.
    testsize : float
        Fraction of rows held out for testing (default 0.25).
    poly_deg : int
        Degree of the polynomial expansion; values below 2 disable it.
    standard : bool
        When truthy, features are scaled with a StandardScaler fitted on the
        training rows only and then applied to the test rows.

    Returns
    -------
    DataFrame
        A fresh eval_df() table holding a single row for this model.
    """

    def to_3dp(value):
        # Round-trip through a 3-decimal string, as the rest of the notebook does.
        return float(format(value, '.3f'))

    X_train, X_test, y_train, y_test = train_test(data, features, target, testsize=testsize)

    if standard:
        scale = StandardScaler()
        X_train = scale.fit_transform(X_train)
        X_test = scale.transform(X_test)

    if 'Polynomial' in regtype and poly_deg >= 2:
        # Expand once to the requested degree. Re-expanding the already expanded
        # matrix for every degree from 2 upwards would compound the degrees.
        # include_bias=False: LinearRegression fits its own intercept, and the
        # constant column would be counted as a predictor in the adjusted R-squared.
        poly = PolynomialFeatures(poly_deg, include_bias=False)
        X_train = poly.fit_transform(X_train)
        X_test = poly.transform(X_test)
        model_name, power = 'Multivariate Polynomial Regression', poly_deg
    else:
        model_name, power = 'Multivariate Linear Regression', 1

    complex_model_1 = linear_model.LinearRegression()
    complex_model_1.fit(X_train, y_train)

    # Number of predictors actually fitted (after any polynomial expansion).
    n_predictors = X_train.shape[1]

    pred = complex_model_1.predict(X_test)
    train_score = complex_model_1.score(X_train, y_train)
    test_score = complex_model_1.score(X_test, y_test)

    root_mean_square_error = to_3dp(np.sqrt(metrics.mean_squared_error(y_test, pred)))
    r2_train = to_3dp(train_score)
    r2_adj_train = to_3dp(adjustedR2(train_score, X_train.shape[0], n_predictors))
    r2_test = to_3dp(test_score)
    r2_adj_test = to_3dp(adjustedR2(test_score, X_test.shape[0], n_predictors))

    crossvalidation = KFold(n_splits = 5, shuffle = True, random_state=42)
    cv = to_3dp(cross_val_score(complex_model_1, X_train, y_train, cv=crossvalidation).mean())

    # Label the printed mean with the actual target column rather than a fixed word.
    print("Average {} for Test Data: {:.2f}".format(target, y_test.mean()))
    print('Intercept: {:.2f}'.format(complex_model_1.intercept_))
    print('Coefficient: {}'.format(complex_model_1.coef_))
    evaluation = eval_df()

    r = evaluation.shape[0]
    evaluation.loc[r] = [model_name, power, root_mean_square_error,
                         r2_train, r2_adj_train, r2_test, r2_adj_test, cv]
    return evaluation

    
    

In [None]:
# Try the helper on the reduced feature set without standardisation; regtype, testsize and
# poly_deg are left at their defaults, and the returned one-row table is displayed.
regress_df(df, con_features , 'life_expectancy', standard=False)

In [None]:
# Scratch check: list membership with `in` finds 'Polynomial' in this list, so 'yes' is printed.
regtype= ['Multiple', 'Polynomial', 'Lasso', 'Ridge']
if 'Polynomial' in regtype:
    print('yes')

In [None]:
# Simple baseline: regress the target on the single feature 'gdp'.
lr = linear_model.LinearRegression()

# LinearRegression needs a 2-D feature matrix, hence the reshape to one column.
gdp_train = X_train['gdp'].values.reshape(-1,1)
gdp_test = X_test['gdp'].values.reshape(-1,1)

lr.fit(gdp_train, y_train)
pred = lr.predict(gdp_test)

rmsesm = float(format(np.sqrt(metrics.mean_squared_error(y_test, pred)),'.3f'))
rtrsm = float(format(lr.score(gdp_train, y_train),'.3f'))
rtesm = float(format(lr.score(gdp_test, y_test),'.3f'))
crossvalidation = KFold(n_splits = 5, shuffle = True, random_state=1)
cv = float(format(cross_val_score(lr,X_train[['gdp']],y_train,cv=crossvalidation).mean(),'.3f'))

print ("Average Price for Test Data: {:.2f}".format(y_test.mean()))
print('Intercept: {:.2f}'.format(lr.intercept_))
print('Coefficient: {}'.format(lr.coef_))

evaluation = eval_df()
# The rows written from here on carry a free-text description in third position.
# The eight-column table defined in this notebook has no slot for it, so add one;
# a table that already has such a column is left alone.
if not {'Details', 'Hyper Parameters'} & set(evaluation.columns):
    evaluation.insert(2, 'Details', [])
r = evaluation.shape[0]
# Adjusted R-squared is not reported for the single-feature model ('-').
evaluation.loc[r] = ['Simple Linear Regression', 1,'gdp', rmsesm, rtrsm,'-', rtesm,'-', cv]
evaluation

In [None]:
# Multivariate linear regression on the wide feature set (lower-case x_train / x_test).
complex_model_1 = linear_model.LinearRegression()
complex_model_1.fit(x_train, y_train)

coefficients = complex_model_1.coef_
# Pair each coefficient with the column it was fitted on; con_features names only
# the reduced set and would mislabel the coefficients here.
coeff = dict(zip(x_train.columns, coefficients))

pred = complex_model_1.predict(x_test)

rmsesm = float(format(np.sqrt(metrics.mean_squared_error(y_test, pred)),'.3f'))

# Adjusted R-squared must use the number of predictors actually fitted.
n_predictors = x_train.shape[1]
rtrcm = float(format(complex_model_1.score(x_train, y_train),'.3f'))
artrcm = float(format(adjustedR2(complex_model_1.score(x_train, y_train),x_train.shape[0],n_predictors),'.3f'))
rtesm = float(format(complex_model_1.score(x_test, y_test),'.3f'))
artecm = float(format(adjustedR2(complex_model_1.score(x_test, y_test),x_test.shape[0],n_predictors),'.3f'))

crossvalidation = KFold(n_splits = 5, shuffle = True, random_state=42)
cv = float(format(cross_val_score(complex_model_1, x_train, y_train, cv=crossvalidation).mean(),'.3f'))

print ("Average Price for Test Data: {:.2f}".format(y_test.mean()))
print('Intercept: {:.2f}'.format(complex_model_1.intercept_))
print('Coefficient: {}'.format(complex_model_1.coef_))

r = evaluation.shape[0]
# Column order: RMSE, R2 (train), adjusted R2 (train), R2 (test), adjusted R2 (test), CV.
# All values come from this model (rtrcm, not the single-feature model's rtrsm).
evaluation.loc[r] = ['Multivariate Linear Regression', 1, 'all features', rmsesm, rtrcm, artrcm, rtesm, artecm, cv]
evaluation

In [None]:
# Multivariate linear regression on the reduced feature set (upper-case X_train / X_test).
complex_model_1 = linear_model.LinearRegression()
complex_model_1.fit(X_train, y_train)

coefficients = complex_model_1.coef_
coeff = dict(zip(con_features, coefficients))

pred = complex_model_1.predict(X_test)

rmsesm = float(format(np.sqrt(metrics.mean_squared_error(y_test, pred)),'.3f'))

rtrcm = float(format(complex_model_1.score(X_train, y_train),'.3f'))
artrcm = float(format(adjustedR2(complex_model_1.score(X_train, y_train),X_train.shape[0],len(con_features)),'.3f'))
rtesm = float(format(complex_model_1.score(X_test, y_test),'.3f'))
artecm = float(format(adjustedR2(complex_model_1.score(X_test, y_test),X_test.shape[0],len(con_features)),'.3f'))

crossvalidation = KFold(n_splits = 5, shuffle = True, random_state=42)
cv = float(format(cross_val_score(complex_model_1, X_train, y_train, cv=crossvalidation).mean(),'.3f'))

print ("Average Price for Test Data: {:.2f}".format(y_test.mean()))
print('Intercept: {:.2f}'.format(complex_model_1.intercept_))
print('Coefficient: {}'.format(complex_model_1.coef_))

r = evaluation.shape[0]
# Column order: RMSE, R2 (train), adjusted R2 (train), R2 (test), adjusted R2 (test), CV.
# All values come from this model (rtrcm, not the single-feature model's rtrsm).
evaluation.loc[r] = ['Multivariate Linear Regression', 1, 'bmi, hiv, school, death_ratio, gdp', rmsesm, rtrcm, artrcm, rtesm, artecm, cv]
evaluation

In [None]:
def mean_norm(col):
    """
    Mean-normalise a column: subtract its mean and divide by its range (max - min).
    """
    spread = np.max(col) - np.min(col)
    centred = col - col.mean()
    return centred / spread

In [None]:
# Mean-normalise every reduced-set feature of the training frame, column by column.
X_train[con_features] = X_train[con_features].apply(mean_norm)

In [None]:
# Scale the test features with the mean and range of the *training* rows, so both
# splits go through the same transformation (the same fit-on-train / apply-to-test
# pattern regress_df uses with its scaler). Raw values are read from X because
# X_train may already have been normalised in place.
# Assumes the row labels of X are unique — TODO confirm.
for col in con_features:
    train_raw = X.loc[X_train.index, col]
    test_raw = X.loc[X_test.index, col]
    X_test[col] = (test_raw - train_raw.mean()) / (train_raw.max() - train_raw.min())

In [None]:
#y_train, y_test = np.log(y_train), np.log(y_test)

In [None]:
# Five-feature linear model fitted on X_train / X_test as left by the cells above.
complex_model_1 = linear_model.LinearRegression()
complex_model_1.fit(X_train, y_train)
coefficients = complex_model_1.coef_
coeff = {name: weight for name, weight in zip(con_features, coefficients)}

pred = complex_model_1.predict(X_test)

# Score each split once and reuse the value for the plain and the adjusted figure.
n_predictors = len(con_features)
train_r2 = complex_model_1.score(X_train, y_train)
test_r2 = complex_model_1.score(X_test, y_test)

rmsesm = float(format(np.sqrt(metrics.mean_squared_error(y_test, pred)), '.3f'))
rtrcm = float(format(train_r2, '.3f'))
artrcm = float(format(adjustedR2(train_r2, X_train.shape[0], n_predictors), '.3f'))
rtesm = float(format(test_r2, '.3f'))
artecm = float(format(adjustedR2(test_r2, X_test.shape[0], n_predictors), '.3f'))

crossvalidation = KFold(n_splits=5, shuffle=True, random_state=42)
fold_scores = cross_val_score(complex_model_1, X_train, y_train, cv=crossvalidation)
cv = float(format(fold_scores.mean(), '.3f'))

print("Average Price for Test Data: {:.2f}".format(y_test.mean()))
print('Intercept: {:.2f}'.format(complex_model_1.intercept_))
print('Coefficient: {}'.format(coeff))

# Append this model as the next row of the comparison table and display it.
r = evaluation.shape[0]
evaluation.loc[r] = [
    'Multivariate Linear Regression',
    1,
    'bmi, hiv, school, death_ratio, gdp (standardised)',
    rmsesm, rtrcm, artrcm, rtesm, artecm, cv,
]
evaluation

In [None]:
# Degree-2 polynomial expansion, fitted on the training features and applied to the test features.
# include_bias=False: the estimators fitted on these matrices add their own intercept,
# so a constant column would only be a redundant predictor and would inflate the
# column count that is later used as the number of predictors for adjusted R-squared.
poly = PolynomialFeatures(2, include_bias=False)
X_train_2 = poly.fit_transform(X_train)
X_test_2 = poly.transform(X_test)

In [None]:
# Linear regression on the degree-2 polynomial features.
poly_model_2 = linear_model.LinearRegression()
poly_model_2.fit(X_train_2, y_train)

coefficients = poly_model_2.coef_

pred = poly_model_2.predict(X_test_2)

rmsesm = float(format(np.sqrt(metrics.mean_squared_error(y_test, pred)),'.3f'))

rtrcm = float(format(poly_model_2.score(X_train_2, y_train),'.3f'))
artrcm = float(format(adjustedR2(poly_model_2.score(X_train_2, y_train),X_train_2.shape[0],X_train_2.shape[1]),'.3f'))
rtesm = float(format(poly_model_2.score(X_test_2, y_test),'.3f'))
artecm = float(format(adjustedR2(poly_model_2.score(X_test_2, y_test),X_test_2.shape[0],X_test_2.shape[1]),'.3f'))
crossvalidation = KFold(n_splits = 5, shuffle = True, random_state=42)
cv = float(format(cross_val_score(poly_model_2, X_train_2, y_train, cv=crossvalidation).mean(),'.3f'))

print ("Average Price for Test Data: {:.2f}".format(y_test.mean()))
print('Intercept: {:.2f}'.format(poly_model_2.intercept_))
# The coefficients belong to polynomial terms, not to the raw feature names, so they
# are reported as a plain list of the non-zero values instead of a name -> value dict.
coeff = [coef for coef in coefficients if coef != 0]
print(coeff)

r = evaluation.shape[0]
evaluation.loc[r] = ['Multivariate Polynomial Regression', 2,'bmi, hiv, school, death_ratio, gdp (standardised)', rmsesm, rtrcm, artrcm, rtesm, artecm, cv]
evaluation

In [None]:
# Degree-3 polynomial expansion, fitted on the training features and applied to the test features.
# include_bias=False: the estimator fitted on these matrices adds its own intercept,
# so a constant column would only be a redundant predictor and would inflate the
# column count that is later used as the number of predictors for adjusted R-squared.
poly = PolynomialFeatures(3, include_bias=False)
X_train_3 = poly.fit_transform(X_train)
X_test_3 = poly.transform(X_test)

In [None]:
# Linear regression on the degree-3 polynomial features.
poly_model_3 = linear_model.LinearRegression()
poly_model_3.fit(X_train_3, y_train)

coefficients = poly_model_3.coef_

pred = poly_model_3.predict(X_test_3)

rmsesm = float(format(np.sqrt(metrics.mean_squared_error(y_test, pred)),'.3f'))

rtrcm = float(format(poly_model_3.score(X_train_3, y_train),'.3f'))
artrcm = float(format(adjustedR2(poly_model_3.score(X_train_3, y_train),X_train_3.shape[0], X_train_3.shape[1]),'.3f'))
rtesm = float(format(poly_model_3.score(X_test_3, y_test),'.3f'))
artecm = float(format(adjustedR2(poly_model_3.score(X_test_3, y_test),X_test_3.shape[0],X_test_3.shape[1]),'.3f'))
crossvalidation = KFold(n_splits = 5, shuffle = True, random_state=42)
cv = float(format(cross_val_score(poly_model_3, X_train_3, y_train, cv=crossvalidation).mean(),'.3f'))

print ("Average Price for Test Data: {:.2f}".format(y_test.mean()))
print('Intercept: {:.2f}'.format(poly_model_3.intercept_))
# The coefficients belong to polynomial terms, not to the raw feature names, so they
# are reported as a plain list of the non-zero values instead of a name -> value dict.
coeff = [coef for coef in coefficients if coef != 0]
print(coeff)

r = evaluation.shape[0]
evaluation.loc[r] = ['Multivariate Polynomial Regression', 3,'bmi, hiv, school, death_ratio, gdp (standardised)', rmsesm, rtrcm, artrcm, rtesm, artecm, cv]
evaluation

In [None]:
# Lasso on the untransformed-degree features; alpha is chosen by LassoCV's internal 5-fold search.
lasso_1 = LassoCV(max_iter=100000, cv=5)
lasso_1.fit(X_train, y_train)
coefficients = lasso_1.coef_
coeff = {name: weight for name, weight in zip(con_features, coefficients)}

pred = lasso_1.predict(X_test)

# Score each split once and reuse the value for the plain and the adjusted figure.
train_r2 = lasso_1.score(X_train, y_train)
test_r2 = lasso_1.score(X_test, y_test)

rmsesm = float(format(np.sqrt(metrics.mean_squared_error(y_test, pred)), '.3f'))
rtrcm = float(format(train_r2, '.3f'))
artrcm = float(format(adjustedR2(train_r2, X_train.shape[0], X_train.shape[1]), '.3f'))
rtesm = float(format(test_r2, '.3f'))
artecm = float(format(adjustedR2(test_r2, X_test.shape[0], X_test.shape[1]), '.3f'))

crossvalidation = KFold(n_splits=5, shuffle=True, random_state=42)
fold_scores = cross_val_score(lasso_1, X_train, y_train, cv=crossvalidation)
cv = float(format(fold_scores.mean(), '.3f'))

print("Average Price for Test Data: {:.2f}".format(y_test.mean()))
print('Intercept: {:.2f}'.format(lasso_1.intercept_))
print('Coefficient: {}'.format(coeff))

# Append this model, recording the selected alpha in the description, and display the table.
r = evaluation.shape[0]
evaluation.loc[r] = [
    'Multivariate Lasso Regression',
    1,
    'bmi, hiv, school, death_ratio, gdp (standardised) alpha = {}'.format(lasso_1.alpha_),
    rmsesm, rtrcm, artrcm, rtesm, artecm, cv,
]
evaluation

In [None]:
# Lasso on the degree-2 polynomial features; alpha is chosen by LassoCV's internal 5-fold search.
lasso_2 = LassoCV(max_iter = 100000, cv = 5)
lasso_2.fit(X_train_2, y_train)

coefficients = lasso_2.coef_

pred = lasso_2.predict(X_test_2)

rmsesm = float(format(np.sqrt(metrics.mean_squared_error(y_test, pred)),'.3f'))

rtrcm = float(format(lasso_2.score(X_train_2, y_train),'.3f'))
artrcm = float(format(adjustedR2(lasso_2.score(X_train_2, y_train), X_train_2.shape[0], X_train_2.shape[1]),'.3f'))
rtesm = float(format(lasso_2.score(X_test_2, y_test),'.3f'))
artecm = float(format(adjustedR2(lasso_2.score(X_test_2, y_test),X_test_2.shape[0],X_test_2.shape[1]),'.3f'))
crossvalidation = KFold(n_splits = 5, shuffle = True, random_state=42)
cv = float(format(cross_val_score(lasso_2, X_train_2, y_train, cv=crossvalidation).mean(),'.3f'))

print ("Average Price for Test Data: {:.2f}".format(y_test.mean()))
print('Intercept: {:.2f}'.format(lasso_2.intercept_))
# The coefficients belong to polynomial terms, not to the raw feature names, so only
# the values Lasso left non-zero are reported, as a plain list.
coeff = [coef for coef in coefficients if coef != 0]
print(coeff)

r = evaluation.shape[0]
evaluation.loc[r] = ['Multivariate Lasso Regression', 2,'bmi, hiv, school, death_ratio, gdp (standardised) alpha = {}'.format(lasso_2.alpha_), rmsesm, rtrcm, artrcm, rtesm, artecm, cv]
evaluation

In [None]:
# Candidate alpha grid: 0.000, 0.001, ..., 0.999.
itr = [step / 1000 for step in range(1000)]

In [None]:
# Ridge on the reduced feature set; alpha is picked from the `itr` grid by 5-fold CV.
ridge_1 = RidgeCV(alphas=itr, cv=5)
ridge_1.fit(X_train, y_train)
coefficients = ridge_1.coef_
coeff = {name: weight for name, weight in zip(con_features, coefficients)}

pred = ridge_1.predict(X_test)

# Score each split once and reuse the value for the plain and the adjusted figure.
train_r2 = ridge_1.score(X_train, y_train)
test_r2 = ridge_1.score(X_test, y_test)

rmsesm = float(format(np.sqrt(metrics.mean_squared_error(y_test, pred)), '.3f'))
rtrcm = float(format(train_r2, '.3f'))
artrcm = float(format(adjustedR2(train_r2, X_train.shape[0], X_train.shape[1]), '.3f'))
rtesm = float(format(test_r2, '.3f'))
artecm = float(format(adjustedR2(test_r2, X_test.shape[0], X_test.shape[1]), '.3f'))

crossvalidation = KFold(n_splits=5, shuffle=True, random_state=42)
fold_scores = cross_val_score(ridge_1, X_train, y_train, cv=crossvalidation)
cv = float(format(fold_scores.mean(), '.3f'))

print("Average Price for Test Data: {:.2f}".format(y_test.mean()))
print('Intercept: {:.2f}'.format(ridge_1.intercept_))
print('Coefficient: {}'.format(coeff))

# Append this model, recording the selected alpha in the description, and display the table.
r = evaluation.shape[0]
evaluation.loc[r] = [
    'Multivariate Ridge Regression',
    1,
    'bmi, hiv, school, death_ratio, gdp (standardised) alpha = {}'.format(ridge_1.alpha_),
    rmsesm, rtrcm, artrcm, rtesm, artecm, cv,
]
evaluation

Ridge regression performs best with alpha = 0.772.

In [None]:
# NOTE(review): this import sits mid-notebook; the other imports are in the first cell.
from statsmodels.formula.api import ols

# Put the wide-feature training rows and their target side by side so the formula
# below can refer to columns by name.
data = pd.concat([x_train, y_train], axis=1)
# OLS of life_expectancy on the same five predictors listed in con_features.
lr_model_init = ols(formula='life_expectancy~bmi+hiv+school+death_ratio+gdp', data=data).fit()
# Display the fitted model's summary.
lr_model_init.summary()