<a href="https://colab.research.google.com/github/Ismael-Barajas/somliere/blob/main/soMLiere.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from numpy.linalg import inv
from IPython.display import Markdown as md

import sklearn as sk
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import CategoricalNB
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score

from google.colab import drive
# Mount Google Drive so the dataset stored there is reachable from the Colab VM.
MOUNT_POINT = '/content/drive'
drive.mount(MOUNT_POINT)

# Build the CSV location from the mount point so the read does not depend on
# the notebook's current working directory happening to be /content.
DATA_CSV_PATH = MOUNT_POINT + '/My Drive/Colab Notebooks/data.csv'

# The file is comma separated, which is already read_csv's default delimiter.
data = pd.read_csv(DATA_CSV_PATH)
print("We examine the first 5 rows of our dataset.\nData:\n")
# Last expression of the cell: rendered as a rich table by the notebook.
data.head(5)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
We examine the first 5 rows of our dataset.
Data:



Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,white,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,white,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,white,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [None]:
# Samples per wine type before any cleaning.
counts = data.type.value_counts().to_dict()
print("Number of red and white values from all data originally:\n", counts)

# Drop every row that has at least one missing value.
data = data.dropna(axis=0, how='any')
counts = data.type.value_counts().to_dict()
print("Number of red and white values from all data after drop NA:\n", counts)

# Encode the wine type as 1 (white) / 0 (red).
# Only the `type` column is mapped: a frame-wide replace would scan every
# column and depends on pandas implicitly downcasting the result to integers.
TYPE_ENCODING = {'white': 1, 'red': 0}
binary_data = data.assign(type=data['type'].map(TYPE_ENCODING))
# Series.map yields NaN for any label missing from the mapping; fail loudly
# here instead of letting an unencoded row slip into the per-type split.
assert binary_data['type'].notna().all(), "unexpected wine type label; expected only 'white' or 'red'"
binary_data['type'] = binary_data['type'].astype(int)
counts = binary_data.type.value_counts().to_dict()
#print("Number of wine type 1 and 0 values from all data after binary encoding:\n", counts)

Number of red and white values from all data originally:
 {'white': 4898, 'red': 1599}
Number of red and white values from all data after drop NA:
 {'white': 4870, 'red': 1593}


In [None]:
# Boolean selector over the encoded frame: True where type == 1 (white),
# False otherwise (red was encoded as 0).
white_mask = binary_data['type'].eq(1)

# Partition the rows into one frame per wine type.
w_data = binary_data.loc[white_mask]
r_data = binary_data.loc[~white_mask]

# Per-frame type counts, kept for the optional debugging prints below.
counts = w_data['type'].value_counts().to_dict()
#print("Number of red/0 and white/1 values from w_data after type split:\n", counts)
counts = r_data['type'].value_counts().to_dict()
#print("Number of red/0 and white/1 values from r_data after type split:\n", counts)

# `type` is constant within each frame after the split, so it carries no
# information any more; drop it from both frames.
w_data = w_data.drop('type', axis='columns')
r_data = r_data.drop('type', axis='columns')

In [None]:
def _split_features_target(frame, target='quality'):
    """Split `frame` 70/30 into train/test features and targets.

    The `target` column is removed from both feature frames and returned
    separately, and a constant column named 'intercept' (all ones) is inserted
    as the first feature column so a bias weight can be fitted.

    Returns (train_features, test_features, train_target, test_target).
    """
    train_part, test_part = train_test_split(
        frame,
        test_size=0.3,
        train_size=0.7,
        # NOTE: this is integer arithmetic, not a date: 2021 - 10 - 25 == 1986.
        random_state=(2021-10-25),
        shuffle=True,
        stratify=None,
    )

    # pop() removes the target column from the feature frame in place.
    train_target = train_part.pop(target)
    test_target = test_part.pop(target)

    # Stack a leading column of ones onto both feature frames.
    for part in (train_part, test_part):
        part.insert(loc=0, column='intercept', value=1)

    return train_part, test_part, train_target, test_target


w_train, w_test, y_w_train, y_w_test = _split_features_target(w_data)
r_train, r_test, y_r_train, y_r_test = _split_features_target(r_data)

In [None]:
def _normal_equation_weights(features, target):
    """Return least-squares weights via the closed form (X^T X)^{-1} X^T y."""
    features_t = features.transpose()
    gram_inverse = np.linalg.inv(features_t.dot(features))
    return gram_inverse.dot(features_t).dot(target)


def _mean_squared_error(predicted, actual):
    """Return the mean of the squared differences between the two inputs."""
    residuals = predicted - actual
    return (residuals**2).mean()


# Fit one linear model per wine type on its training split.
w_weights = _normal_equation_weights(w_train, y_w_train)
r_weights = _normal_equation_weights(r_train, y_r_train)

# Predict quality for the held-out test rows.
w_pred = np.dot(w_test, w_weights)
r_pred = np.dot(r_test, r_weights)

# Score each model on its own test split.
w_loss = _mean_squared_error(w_pred, y_w_test)
r_loss = _mean_squared_error(r_pred, y_r_test)

print("Mean Squared (Error) loss for white wine model: ", w_loss)
print("Mean Squared (Error) loss for red wine model: ", r_loss)
print("\n(The leading coefficient is the y-intercept.)")
print("Our white wine model's coefficients are: \n", w_weights)
print("Our red wine model's coefficients are: \n", r_weights)

Mean Squared (Error) loss for white wine model:  0.5586548954775865
Mean Squared (Error) loss for red wine model:  0.39240922640444303

(The leading coefficient is the y-intercept.)
Our white wine model's coefficients are: 
 [ 1.26023199e+02  5.02701076e-02 -1.87311617e+00  7.65173345e-02
  7.27226297e-02 -4.72027388e-01  4.13859072e-03 -4.88422025e-04
 -1.26407695e+02  7.82470944e-01  7.03096887e-01  2.17893087e-01]
Our red wine model's coefficients are: 
 [ 2.03893873e+01  3.86322179e-02 -1.14476171e+00 -1.84513410e-01
  2.00136271e-02 -1.24526465e+00  3.78575830e-03 -3.13486396e-03
 -1.68581858e+01 -3.74355702e-01  9.04494581e-01  3.03783116e-01]


In [None]:
# Per-feature averages for each wine type, for use when we don't have a
# particular value during invocation.
_FEATURE_COLUMNS = [
    'fixed acidity',
    'volatile acidity',
    'citric acid',
    'residual sugar',
    'chlorides',
    'free sulfur dioxide',
    'total sulfur dioxide',
    'density',
    'pH',
    'sulphates',
    'alcohol',
]


def _feature_means(frame):
    """Return the mean of each column in _FEATURE_COLUMNS, in that order."""
    return [frame[column].mean() for column in _FEATURE_COLUMNS]


# White wine averages (unpacked in the same order as _FEATURE_COLUMNS).
(
    w_fixed_acidity,
    w_volatile_acidity,
    w_citric_acid,
    w_residual_sugar,
    w_chlorides,
    w_free_sulfur_dioxide,
    w_total_sulfur_dioxide,
    w_density,
    w_pH,
    w_sulphates,
    w_alcohol,
) = _feature_means(w_data)

# Red wine averages (unpacked in the same order as _FEATURE_COLUMNS).
(
    r_fixed_acidity,
    r_volatile_acidity,
    r_citric_acid,
    r_residual_sugar,
    r_chlorides,
    r_free_sulfur_dioxide,
    r_total_sulfur_dioxide,
    r_density,
    r_pH,
    r_sulphates,
    r_alcohol,
) = _feature_means(r_data)

In [None]:
# NOTE: Work in progress - the commented-out k-fold cross-validation draft below is disabled and might not be necessary for us.

# #remove 1's
# w_train = w_train.drop(columns='intercept')
# w_test = w_test.drop(columns='intercept')
# r_train = r_train.drop(columns='intercept')
# r_test = r_test.drop(columns='intercept')



# max_order = 3
# K = 10
# cv_loss = np.zeros((K,max_order+1))
# ind_loss = np.zeros((K,max_order+1))
# train_loss = np.zeros((K,max_order+1))
# fold_weights = np.zeros((K,max_order+1,max_order*2+1))

# #x_train = xtrain
# #x_ktest = xtest 
# # w 4870
# # r 1593

# for k in range(max_order+1)[1:]:
#     for fold in range(K):
#         y_w_fold = y_w_train[fold*487 : (fold+1)*487]
#         y_r_fold = y_r_train[fold*159 : (fold+1)*159]
#         y_w_ktrain = np.delete(y_w_train, (range(fold*497, (fold+1)*487)))
#         y_r_ktrain = np.delete(y_r_train, (range(fold*159, (fold+1)*159)))
# #stopped here
#         X_fold = x_train[fold*180 : (fold+1)*180, : ]
#         X_fold = np.hstack((np.ones_like(y_fold)[:, np.newaxis], X_fold))
        
#         X_train = np.delete(x_train, (range(fold*180, (fold+1)*180)), 0)
#         X_train = np.hstack((np.ones_like(y_train)[:, np.newaxis], X_train))
        
#         x_test = np.hstack((np.ones_like(ytest)[:, np.newaxis], x_ktest))
        
#         wfold = np.linalg.solve(np.dot(X_train.T,X_train),np.dot(X_train.T,y_train))
        
#         indpred = np.dot(x_test,wfold)
#         foldpred = np.dot(X_fold,wfold)
#         trainpred = np.dot(X_train,wfold)
        
#         cv_loss[fold,k] = ((foldpred - y_fold)**2).mean()
#         ind_loss[fold,k] = ((indpred - ytest)**2).mean()
#         train_loss[fold,k] = ((trainpred - y_train)**2).mean()
#         #fold_weights[(k-1)*(max_order*2+1):(k*max_order*2+1),k,fold] = wfold
#         fold_weights[fold,k,:k*2+1]= wfold
#     x_train = np.hstack((x_train,xtrain**(k+1)))
#     x_ktest = np.hstack((x_ktest,xtest**(k+1)))    

# order = np.arange(max_order+1)
# plt.plot(order[1:],train_loss.mean(axis=0)[1:],'b-',label="Training loss")
# plt.plot(order[1:],cv_loss.mean(axis=0)[1:],'r-',label="CV loss")
# plt.plot(order[1:],ind_loss.mean(axis=0)[1:],'k',label="Independent test loss")
# plt.legend()
# plt.xlabel('Model order')
# plt.ylabel('Mean squared loss')
# print ("The MSE's of the various models when applied to training and validation sets are as follows: ")
# for i in range(1,9):
#     print (i, " order model:")
#     print ("\t training data: \t", train_loss[:,i].mean())
#     print ("\t validation data: \t", cv_loss[:,i].mean())
# print("MSE by Model Order")

Given $x_{1}$, $x_{2}$, ... $x_{n}$, and $y$<br><br>
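>$X=\begin{bmatrix}1&x_{1,1}&x_{1,2}&\dots&x_{1,n}\\1&x_{2,1}&x_{2,2}&\dots&x_{2,n}\\1&x_{3,1}&x_{3,2}&\dots&x_{3,n}\\\vdots&\vdots&\vdots&\ddots&\vdots\\1&x_{m,1}&x_{m,2}&\dots&x_{m,n}\end{bmatrix}$<br>
$y=\begin{bmatrix}y_{1}\\y_{2}\\y_{3}\\\vdots\\y_{m}\end{bmatrix}$<br>
where each of the $m$ rows is one wine sample and each of the $n$ columns after the leading 1 is one feature.<br>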

<b>Recall: </b>Our target, or y, is the wine's quality score.<br>

Find the weight values $w$ below:<br>
> $w=\begin{bmatrix}w_{0}\\w_{1}\\w_{2}\\\vdots\\w_{n}\end{bmatrix}$<br>

The equation of our model will be:<br>
>$y = \begin{bmatrix}  X \end{bmatrix} \begin{bmatrix} w \end{bmatrix} $<br>

In the code, <code>w_weights</code> and <code>r_weights</code> play the role of $w$, <code>w_train</code> and <code>r_train</code> play the role of $X$, and <code>y_w_train</code> and <code>y_r_train</code> play the role of $y$. Each model is the dot product of its $X$ with its $w$: <br>

>(white wine model) <code>y_w_train</code> $=$ <code>w_train</code> $\cdot$ <code>w_weights</code><br>

>(red wine model) <code>y_r_train</code> $=$ <code>r_train</code> $\cdot$ <code>r_weights</code><br>

Expanding this dot product for a single sample gives the equation below:<br>
> $y = w_{0} + w_{1} x_{1} + w_{2} x_{2} + \dots + w_{n} x_{n}$<br><br>

And when solving for $w$ we get:<br>
> <h1> $(X^T X)^{-1} X^T y = w$ </h1> <br>

We implement this equation in Python to get all the weights.<br>
In our implementation we solve for the weights in this fashion: <br>
<code>w_weights = np.linalg.inv(w_train.transpose().dot(w_train)).dot(w_train.transpose()).dot(y_w_train)</code><br>
