In [1]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np
# Load the red-wine quality data set; the file is semicolon-delimited rather
# than comma-delimited, so the delimiter has to be given explicitly.
wine = pd.read_csv(filepath_or_buffer="winequality-red.csv", delimiter=";")
# Preview the first five rows to sanity-check the parse.
wine.head(n=5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column.
wine.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [8]:
# Inspect each column's dtype to confirm that every column is numeric
# (the scaling and PCA steps further down operate on numeric features).
wine.dtypes

fixed acidity           float64
volatile acidity        float64
citric acid             float64
residual sugar          float64
chlorides               float64
free sulfur dioxide     float64
total sulfur dioxide    float64
density                 float64
pH                      float64
sulphates               float64
alcohol                 float64
quality                   int64
dtype: object

In [7]:
# Count missing values per column; a non-zero count would mean the column
# needs imputing or dropping before modelling.
wine.isnull().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

In [9]:
# Target vector: the quality score recorded for each wine.
y = wine.loc[:, "quality"]

# Feature matrix: every column from "fixed acidity" through "alcohol".
# Label-based slices are inclusive at both ends, so "alcohol" is kept and the
# trailing "quality" column is left out.
X = wine.loc[:, slice("fixed acidity", "alcohol")]


In [10]:
# Hold out 20% of the rows for testing. train_test_split shuffles the rows
# before splitting; random_state pins the shuffle so the split is reproducible.
trainx, testx, trainy, testy = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=0,
)

# Standardise the features. The scaler is fitted on the training features
# only, so no information from the test set leaks into the preprocessing.
scaler = StandardScaler().fit(trainx)

# Apply the scaling learned from the training set to both partitions.
trainx, testx = scaler.transform(trainx), scaler.transform(testx)


In [11]:
# Fit PCA on the (scaled) training features only. A float n_components between
# 0 and 1 asks for the smallest number of principal components whose cumulative
# explained variance reaches that fraction -- here 95%.
# (Passing an integer instead, e.g. n_components=4, would fix the count directly.)
pca = PCA(n_components=0.95).fit(trainx)

# Number of components actually selected; only available after fitting.
k = pca.n_components_

# Project both partitions onto the retained components.
# NOTE: this overwrites trainx/testx, so re-running this cell without first
# re-running the split/scaling cell would fit PCA on already-projected data.
trainx, testx = pca.transform(trainx), pca.transform(testx)


In [13]:
# Fit a logistic-regression classifier on the PCA-reduced features and report
# its accuracy on the held-out test set.

# The solver is named explicitly.
# NOTE(review): recent scikit-learn releases already default to "lbfgs" --
# confirm against the installed version before relying on this being a change.
logisticRegr = LogisticRegression(solver="lbfgs")

# Train the model: it learns the relationship between the reduced features
# and the quality labels.
logisticRegr.fit(trainx, trainy)

# Predict quality labels for the unseen test features.
predicy = logisticRegr.predict(testx)

# Fraction of test samples whose predicted label matches the true label.
accuracy = accuracy_score(testy, predicy)

# Adjacent string literals inside the parentheses are joined by the parser.
# Unlike a backslash continuation *inside* a literal (which silently glued
# "Logistic" and "Regression" together), the separating spaces are explicit.
# The "0.95" in the text mirrors the variance threshold passed to PCA; keep
# the two in sync if that threshold is changed.
message = (
    "When 0.95 of variance is retained, {} principal components are chosen, "
    "the accuracy of Logistic Regression after PCA is {:.2f} %."
)
print(message.format(k, accuracy * 100))


When 0.95 of variance is retained, 9 principle components are chosen, the accuracy of LogisticRegression after PCA is 61.88 %.
