In [1]:
import pandas as pd #For reading, and manipulating data
from sklearn.linear_model import LinearRegression #For initializing the regression model
from sklearn.metrics import mean_squared_error, mean_absolute_error #for checking the mean squared error of the model
from sklearn.model_selection import train_test_split #For spliting the data into the required sets for testing and training
from tabulate import tabulate #To create table, especially for data presentation
import plotly.graph_objects as go #To create tables

# Load the Data

In [4]:
# The file is semicolon-delimited, so the separator has to be passed explicitly.
df = pd.read_csv("regressiondata.csv", sep=";")

In [5]:
df  # bare last expression: the notebook renders the loaded DataFrame for inspection

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


# Part 1
# Univariate regression Model

In [7]:
# Univariate setup: one predictor column (swap the column name to try another feature).
X = df["alcohol"]
y = df["quality"]  # target

In [13]:
# Wrap both Series in single-column DataFrames so the estimator receives 2-D inputs.
X, y = pd.DataFrame(X), pd.DataFrame(y)

In [14]:
# 80% of the rows go to training and the remainder to testing;
# the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=7776,
)

# Initializing and Training the model

In [15]:
# Create the linear regression estimator, then fit it on the training split.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

LinearRegression()

In [17]:
model.intercept_  # fitted intercept of the model, i.e. the weight w0

array([1.72388777])

In [18]:
# The model was fitted on a single feature, so coef_ holds exactly one weight.
model.coef_  # w1: the slope applied to the feature

array([[0.37548257]])

Thus our model is given by
##    $$M_{w}(d) = w_0d_0 + w_1d_1$$
## $$\text{But}\quad d_0=1$$
##    $$\implies M_{w}(d) = 1.72388777 + 0.37548257(\text{alcohol})$$

In [None]:
# Predict the target for the held-out test rows.
Y_pred = model.predict(X=X_test)

In [None]:
# Mean absolute error on the test split, used to judge how good the model is.
mean_absolute_error(y_true=y_test, y_pred=Y_pred)

0.5875489306900061

# Next, we loop over every feature, fit a univariate model on each one, and compare the resulting errors to find the best single feature for a univariate model.


In [19]:
# Fit one single-feature linear regression per candidate column and record its
# test-set mean absolute error (MAE), so the features can be compared directly.
feature_names = ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol']
ERROR = []  # test-set MAE of each univariate model, in the same order as feature_names
y = pd.DataFrame(df.quality)  # the target is the same for every feature, so build it once
for feature in feature_names:
    X = df[[feature]]  # double brackets keep a single-column DataFrame (2-D input for the estimator)
    # Identical train_size and random_state on every iteration, so each model is
    # trained and scored with the same split settings.
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=7776)
    model = LinearRegression()  # fresh estimator for this feature
    model.fit(X_train, y_train)
    Y_pred = model.predict(X_test)  # predictions for the held-out rows
    mae = mean_absolute_error(y_test, Y_pred)  # the metric is MAE, not MSE
    ERROR.append(mae)

# Render a table pairing each feature with its mean absolute error.
fig = go.Figure(data=[go.Table(header=dict(values=['Feature', 'Mean Absolute Error']),
                 cells=dict(values=[feature_names, ERROR])) ])
fig.show()

From the table above we clearly see that the feature that produces the smallest mean absolute error is "alcohol". Hence, if we were to create a univariate regression model, this would be the best feature to use.

# Part 2
# Multi-variate linear regression model

In [20]:
# X holds every predictor column; y is the target column.
feature_names = [
    'fixed acidity',
    'volatile acidity',
    'citric acid',
    'residual sugar',
    'chlorides',
    'free sulfur dioxide',
    'total sulfur dioxide',
    'density',
    'pH',
    'sulphates',
    'alcohol',
]
X = df[feature_names]
y = df["quality"]

In [22]:
# Shape check before modelling: reports the dimensions of X and y so we can see
# whether either still needs reshaping.
print(
    "X shape = ", X.shape,
    "\n" "y shape = ", y.shape,
)

X shape =  (1599, 11) 
y shape =  (1599,)


In [23]:
# Wrap X and y in DataFrames, then print the shapes again to confirm the result.
X, y = pd.DataFrame(X), pd.DataFrame(y)
print(
    "X shape: ", X.shape,
    "\n" "y shape: ", y.shape,
)

X shape:  (1599, 11) 
y shape:  (1599, 1)


In [24]:
# The fixed random_state makes the split select the same rows on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=7776,
)

# Initializing and Training the model

In [25]:
# Multivariate model: same estimator as before, now fitted on all feature columns.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

LinearRegression()

In [26]:
model.intercept_  # fitted intercept of the multivariate model, i.e. the weight w0

array([19.37536289])

In [27]:
model.coef_  # fitted weights, one per feature column the model was trained on

array([[ 1.96335392e-02, -1.15977002e+00, -2.59125416e-01,
         1.14722591e-02, -2.21718786e+00,  2.69049432e-03,
        -2.46120925e-03, -1.52469513e+01, -4.30269363e-01,
         9.05999472e-01,  2.91850915e-01]])

In [28]:
# Predict the target for the held-out test rows with the multivariate model.
Y_pred = model.predict(X=X_test)

In [29]:
# Mean absolute error of the multivariate model on the test split.
mean_absolute_error(y_true=y_test, y_pred=Y_pred)

0.5192344119565516

Comparing the mean absolute error (MAE) of the multivariate model (about 0.519) with those of the univariate models (about 0.588 for the best single feature, "alcohol"), we can conclude that the multivariate model performs better, as it has the smallest MAE.