<h1>Model training</h1>

In [1]:
import torch 
from torch import nn
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
import pathlib
import polars as pl


<h3>Dataset Analysis</h3>

In [2]:
df = pl.read_csv("./dataset/winequality-red.csv")
df.head()

fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64
7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [3]:
df.describe()

statistic,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""",1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
"""null_count""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""mean""",8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
"""std""",1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
"""min""",4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
"""25%""",7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
"""50%""",7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
"""75%""",9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.99784,3.4,0.73,11.1,6.0
"""max""",15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [4]:
df.plot()

<h3>Machine Learning Model Creation</h3>

In [5]:
from sklearn.model_selection import train_test_split
import numpy as np

In [6]:
class CustomDataset(Dataset):
    def __init__(self, df) -> None:
        super().__init__()
        self.data = df
    
    def __len__(self):
        return len(self.data)
        
    def __getitem__(self, index):
        
        y = df['quality'].to_numpy()
        
        x = df.drop(['quality'])
        
        x = x.to_numpy()
        
        y = np.float32(y)
        
        x = np.float32(x)
        
        return x[index], y[index]

In [7]:
df = pl.read_csv("./dataset/winequality-red.csv")

In [8]:
train_data, test_data = train_test_split(df, test_size=0.2)

train_data = CustomDataset(train_data)
test_data = CustomDataset(test_data)

In [9]:
train_data = DataLoader(train_data, shuffle=True, batch_size=32)
test_data = DataLoader(test_data, shuffle=True, batch_size=32)

In [10]:
class PredictionModel(nn.Module):
    def __init__(self):
        super().__init__()
        
        self.layers = nn.Sequential(
            nn.Linear(in_features=len(df.columns) -1, 
                      out_features=len(df.columns) -1),
            nn.LeakyReLU(),
            nn.Linear(in_features=len(df.columns) -1,
                      out_features=len(df.columns) -1),
            nn.LeakyReLU(),
            nn.Linear(in_features=len(df.columns) - 1,
                      out_features=1)
        )
        
    def forward(self, x):
        return self.layers(x)
        

In [11]:
model = PredictionModel()

In [12]:
loss_fn = nn.MSELoss()
opt_fn = torch.optim.Adam(params=model.parameters(),
                          lr=0.001)
epochs = 50

batch_size = 32

In [13]:
batch = next(iter(train_data))
pred = model(batch[0])
pred = torch.squeeze(pred, -1)
pred.shape

torch.Size([32])

In [14]:
for epoch in range(epochs):
    for _, batch in enumerate(train_data):
        
        model.train(True)
        
        X, y = batch
        
        opt_fn.zero_grad()
        
        y_pred = torch.squeeze(model(X), 1)
        
        loss = loss_fn(y_pred, y)
        
        loss.backward()
        
        opt_fn.step()
        
    if epoch % 10 == 0:
        
        model.eval()
        
        with torch.no_grad():
            
            accumulated_loss = 0
            
            for _, eval_batch in enumerate(test_data):
                
                X_test, y_test = eval_batch
            
                y_test_pred = torch.squeeze(model(X_test), 1)
                
                accumulated_loss += loss_fn(y_test_pred, y_test)
                
            test_loss = accumulated_loss / batch_size
                
            print(f"Epoch: {epoch} | Loss: {loss/batch_size} | Avg Test Lost: {test_loss}")


Epoch: 0 | Loss: 0.19305823743343353 | Avg Test Lost: 1.6634101867675781
Epoch: 10 | Loss: 0.016711775213479996 | Avg Test Lost: 0.1261260062456131
Epoch: 20 | Loss: 0.01663926988840103 | Avg Test Lost: 0.115607850253582
Epoch: 30 | Loss: 0.024627910926938057 | Avg Test Lost: 0.12856273353099823
Epoch: 40 | Loss: 0.009962933138012886 | Avg Test Lost: 0.11860146373510361


In [15]:

torch.save(model.state_dict(), "./model/trained_model.pth")