In [1]:
# This script is to build a linear regression model
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from functions import *

In [2]:
# Read the data
data_starbucks = pd.read_csv("out\processed_data_starbucks.csv", encoding = 'cp1252')
data_starbucks.head()

Unnamed: 0.1,Unnamed: 0,Beverage_category,Beverage,Beverage_prep,Calories,Total Fat (g),Trans Fat (g),Saturated Fat (g),Sodium (mg),Total Carbohydrates (g),Cholesterol (mg),Dietary Fibre (g),Sugars (g),Protein (g),Vitamin A (% DV),Vitamin C (% DV),Calcium (% DV),Iron (% DV),Caffeine (mg)
0,0,Coffee,Brewed Coffee,Short,3,0.1,0.0,0.0,0,5,0,0,0,0.3,0.0,0.0,0.0,0.0,175.0
1,1,Coffee,Brewed Coffee,Tall,4,0.1,0.0,0.0,0,10,0,0,0,0.5,0.0,0.0,0.0,0.0,260.0
2,2,Coffee,Brewed Coffee,Grande,5,0.1,0.0,0.0,0,10,0,0,0,1.0,0.0,0.0,0.0,0.0,330.0
3,3,Coffee,Brewed Coffee,Venti,5,0.1,0.0,0.0,0,10,0,0,0,1.0,0.0,0.0,0.02,0.0,410.0
4,4,Classic Espresso Drinks,CaffÃƒÂ¨ Latte,Short Nonfat Milk,70,0.1,0.1,0.0,5,75,10,0,9,6.0,0.1,0.0,0.2,0.0,75.0


In [3]:
# Data info
data_starbucks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 241 entries, 0 to 240
Data columns (total 19 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Unnamed: 0               241 non-null    int64  
 1   Beverage_category        241 non-null    object 
 2   Beverage                 241 non-null    object 
 3   Beverage_prep            241 non-null    object 
 4   Calories                 241 non-null    int64  
 5   Total Fat (g)            241 non-null    float64
 6   Trans Fat (g)            241 non-null    float64
 7   Saturated Fat (g)        241 non-null    float64
 8   Sodium (mg)              241 non-null    int64  
 9   Total Carbohydrates (g)  241 non-null    int64  
 10  Cholesterol (mg)         241 non-null    int64  
 11  Dietary Fibre (g)        241 non-null    int64  
 12  Sugars (g)               241 non-null    int64  
 13  Protein (g)              241 non-null    float64
 14  Vitamin A (% DV)         2

In [4]:
# Split in features and labels
X = data_starbucks.iloc[:,5:].values # Values important as a way to get ready the inputs for the model
y = data_starbucks["Calories"].values
y = y.reshape(-1, 1)

In [5]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

In [6]:
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

# Split Train between train and validate / 49 is the 20% of the balanced dataset
X_val = X_train[:49]
partial_x_train = X_train[49:]

y_val = y_train[:49]
partial_y_train = y_train[49:]

In [7]:
# Normalize the variables
standardization = StandardScaler()
X_ptrain_std = standardization.fit_transform(partial_x_train)
y_ptrain_std = standardization.fit_transform(partial_y_train)
X_test_std = standardization.fit_transform(X_test)
y_test_std = standardization.fit_transform(y_test)
X_val_std = standardization.fit_transform(X_val)
y_val_std = standardization.fit_transform(y_val)


In [8]:
# Fit PCA model
pca = PCA(n_components=4) 
pca.fit(X_ptrain_std)

In [9]:
# Number of information that contribute each component (4 components contribute with 80% of the total information)
pca.explained_variance_ratio_ 
print(round((sum(pca.explained_variance_ratio_ )*100),2),"%")

81.07 %


In [10]:
# Apply the PCA to all our X features
dt_train = pca.transform(X_ptrain_std)
dt_test = pca.transform(X_test_std)
dt_vali = pca.transform(X_val_std)

In [28]:
# Applying the linear regression model
from sklearn.linear_model import ElasticNet
modelLinear = LinearRegression().fit(dt_train, y_ptrain_std)
y_predict_linear =  modelLinear.predict(dt_test)

modelelastic = ElasticNet(alpha=0.5).fit(dt_train, y_ptrain_std)
y_predict_Elastic = modelLasso.predict(dt_test)

In [26]:
# Metrics for the model
import sklearn.metrics as metrics # importo metricas

# Linear
mse = metrics.mean_squared_error(y_test_std,y_predict_linear)
r2 = metrics.r2_score(y_test_std, y_predict_linear)

# Linear with ElastiNet regularization
mse_r = metrics.mean_squared_error(y_test_std,y_predict_Elastic)
r2_r = metrics.r2_score(y_test_std, y_predict_Elastic)

In [27]:
# Print results
print("The final result for LR without regularization is: ")
print("MSE = ", mse)
print("R2 = ", r2)

print("The final result for LR with regularization is: ")
print("MSE = ", mse_r)
print("R2 = ", r2_r)

The final result for LR without regularization is: 
MSE =  0.05136156600908458
R2 =  0.9486384339909154
The final result for LR with regularization is: 
MSE =  0.17598462378716953
R2 =  0.8240153762128305


In [29]:
# Cross validation
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score 
kfold_validacion=KFold(10)

results=cross_val_score(modelelastic, dt_train, y_ptrain_std, cv=kfold_validacion)
print(results)
print(results.mean())

[0.79151355 0.83400767 0.87567778 0.86123531 0.86551129 0.91459073
 0.8812498  0.71359884 0.92141011 0.85733303]
0.8516128089450309


In [None]:
# We could get a great linear regression model, which is able to represent the label (calories), 
# with the features and their respective PCA