In [2]:
# TODO: da fare ancora
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import itertools
import torch
from torch import nn
torch.backends.cudnn.benchmark = False
from torch.utils.data.sampler import WeightedRandomSampler
from torch.utils.data import Dataset, DataLoader, Subset, TensorDataset
from torch.utils.tensorboard import SummaryWriter


import os

In [3]:
class MoviesDataset(Dataset):
    def __init__(self):
        df = pd.read_csv("script_slurm/df.csv")
        df = self.cleaning(df)

        X, y, weights = self.split_XYweights(df)
        
        y = self.discretization(y)
        X.drop('rating_mean', inplace=True, axis=1)

        self.num_classes = y.nunique()
        self.X = torch.FloatTensor(X.values)
        self.y = torch.LongTensor(y)
        self.weights = torch.FloatTensor(weights)


    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        return self.X[idx, :], self.y[idx], self.weights[idx]

    def split_XYweights(self, df):
        y = df['rating_mean']
        weights = df['ratings_count']
        X = df.drop(columns=['ratings_count'], axis=1)
        return X, y, weights

    def cleaning(self, df):
        df.dropna(subset=['rating_mean'], inplace=True)
        df_without_tags = df[df.iloc[:, 23:-2].isna().all(axis = 1)]
        df_without_tags_nor_genres = df_without_tags[
            df_without_tags['(no genres listed)'] == 1]
        rows_to_be_deleted = df.loc[df["movieId"].isin(
            df_without_tags_nor_genres["movieId"])].index
        df.drop(rows_to_be_deleted, axis=0, inplace=True)
        df.iloc[:, 23:-2] = df.iloc[:, 23:-2].fillna(0)
        df.drop(['(no genres listed)'], inplace=True, axis=1)
        df_year_without_na = df.year[-pd.isna(df.year)]
        df.year = df.loc[:, 'year'].fillna(
            np.median(df_year_without_na)).astype('int')
        df.drop('movieId', inplace=True, axis=1)
        df.drop_duplicates(inplace=True)
        return df

    def discretization(self, series):
        series = pd.cut(series, bins=5, labels=False)
        return series


In [4]:
class Feedforward(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes, af_first_layer, af_hidden_layers, af_output_layer, num_hidden_layers, dropout):
        super(Feedforward, self).__init__()
    
        model = [nn.Linear(input_size, hidden_size), af_first_layer]

        for i in range(num_hidden_layers):
            model.append(nn.Linear(hidden_size, hidden_size))

            if batch_norm:
                model.append(nn.BatchNorm1d(hidden_size))
            
            model.append(af_hidden_layers)
            
            if dropout != 0:
                model.append(nn.Dropout(dropout))
    

        model.append(nn.Linear(hidden_size, num_classes))

        if af_output_layer :
            model.append(af_output_layer)

        self.model = nn.Sequential(*model)
        

    def forward(self, x):
        return self.model(x)


In [79]:
def train_model(model, criterion, optimizer, epochs, data_loader, device):
    model.train()
    loss_values = []
    n_epochs_stop = 3
    patience = 0
    early_stop = False
    min_loss = np.Inf
    for epoch in range(epochs):
        print(epoch)
        for batch_idx, samples in enumerate(data_loader):
            data, targets = samples[0].to(device), samples[1].to(device)
            optimizer.zero_grad()

            # Forward pass
            y_pred = model(data)
            # y_predd = torch.argmax(y_pred, dim=1)

            # Compute Loss
            loss = criterion(y_pred, targets)
            loss_values.append(loss.item())

            # Backward pass
            loss.backward()
            optimizer.step()

            loss_current_batch = loss.item()

            # If the validation loss is at a minimum
            if loss_current_batch < min_loss:
                # Save the model
                # torch.save(model)
                if min_loss - loss_current_batch >= 1:
                    patience = 0
                else:
                    patience += 1
                min_loss = loss_current_batch
            else:
                patience += 1

            if epoch > 1 and patience >= n_epochs_stop:
                print(f"Epoca: {epoch}, patience: {patience}, n stop: {n_epochs_stop}")
                early_stop = True
                break


        if early_stop:
            print(f'Early stopping at epoch number {epoch}!')
            break

    return model, loss_values


In [6]:
def evaluate_model(model, data_loader, device):
    model.eval()
    y_pred = []
    y_val = []

    for batch_idx, samples in enumerate(data_loader):
        data, targets = samples[0].to(device), samples[1].to(device)
        y_pred.append(model(data))
        y_val.append(targets)
    y_pred = torch.stack(y_pred).squeeze()
    y_val = torch.stack(y_val).squeeze()
    y_pred = y_pred.argmax(dim=1, keepdim=True).squeeze()
    print(classification_report(y_val.cpu(), y_pred.cpu(), zero_division=0))


def test_model(model, data_loader, device):
    model.eval()
    y_pred = []
    y_test = []

    for batch_idx, samples in enumerate(data_loader):
        data, targets = samples[0].to(device), samples[1].to(device)
        y_pred.append(model(data))
        y_test.append(targets)
    y_pred = torch.stack(y_pred).squeeze()
    y_test = torch.stack(y_test).squeeze()
    y_pred = y_pred.argmax(dim=1, keepdim=True).squeeze()
    print(classification_report(y_test.cpu(), y_pred.cpu(), zero_division=0))

In [7]:
def set_reproducibility(seed = 42):
	torch.manual_seed(seed)
	np.random.seed(seed)
	os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
	torch.use_deterministic_algorithms(True)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Device: {}".format(device))

hidden_size = 8
num_epochs = 10
batch_size = 32
learning_rate = 0.001
dropout = 0
batch_norm = False
af_first_layer = nn.LeakyReLU()
af_hidden_layers = nn.LeakyReLU()
af_output_layer = None
num_hidden_layers = 4

Device: cuda


In [26]:
hyperparams = {
	'nums_hidden_layers' : [1, 3, 5, 7, 10],
	'hidden_sizes' : [8, 16, 32, 64, 128],
	'batch_sizes' : [32, 64, 128, 256, 512],
	'af_first_layer' : [nn.Tanh(), nn.LeakyReLU()],
	'af_hidden_layers' : [nn.LeakyReLU()],
	'af_output_layer' : [None], # [None, nn.Softmax(dim=1)],
	'loss_function' : [nn.CrossEntropyLoss()], #[nn.CrossEntropyLoss(), nn.KLDivLoss(reduction = 'batchmean')],
	'dropout' : [0, 0.2, 0.4],
	'batch_norm' : [False, True],
	'learning_rates' : [0.01]
}

num_epochs = 500

In [9]:
#if __name__ == "__main__":
dataset = MoviesDataset()
train_idx, test_idx = train_test_split(np.arange(len(dataset)), test_size=0.2, stratify=dataset.y, random_state=42)
train_idx, val_idx = train_test_split(train_idx, test_size=0.1, stratify=dataset.y[train_idx], random_state=42)

# MinMaxScale training, validation and testing set su year e title_length
X_train = dataset.X[train_idx]
X_val = dataset.X[val_idx]
X_test = dataset.X[test_idx]

train_year_max = torch.max(X_train[:,1])
train_year_min = torch.min(X_train[:,1])
dataset.X[train_idx, 1] = (X_train[:,1] - train_year_min)/(train_year_max - train_year_min)
dataset.X[val_idx, 1] = (X_val[:,1] - train_year_min)/(train_year_max - train_year_min)
dataset.X[test_idx, 1] = (X_test[:,1] - train_year_min)/(train_year_max - train_year_min)

train_title_length_max = torch.max(X_train[:,2])
train_title_length_min = torch.min(X_train[:,2])
dataset.X[train_idx, 2] = (X_train[:,2] - train_title_length_min)/(train_title_length_max - train_title_length_min)
dataset.X[val_idx, 2] = (X_val[:,2] - train_title_length_min)/(train_title_length_max - train_title_length_min)
dataset.X[test_idx, 2] = (X_test[:,2] - train_title_length_min)/(train_title_length_max - train_title_length_min)

In [10]:
# Creating samplers to manage unbalancing classes
def class_weights(y):
    class_count = torch.bincount(y)
    class_weighting = 1. / class_count
    sample_weights = class_weighting[y]   # sarebbe np.array([weight[t] for t in y_train])
    return sample_weights

y_train = dataset.y[train_idx]

sample_weights = class_weights(y_train)
sampler_class_frequency = WeightedRandomSampler(sample_weights, len(train_idx))

# MinMaxScaling ratings_count
weights_train = dataset.weights[train_idx] 
weights_val = dataset.weights[val_idx]
weights_test = dataset.weights[test_idx] 

weights_train_max = torch.max(weights_train)
weights_train_min = torch.min(weights_train)
dataset.weights[train_idx]  = (weights_train - weights_train_min) / (weights_train_max - weights_train_min)
dataset.weights[val_idx] = (weights_val - weights_train_min) / (weights_train_max - weights_train_min)
dataset.weights[test_idx] = (weights_test - weights_train_min) / (weights_train_max - weights_train_min)

sampler_ratings_count = WeightedRandomSampler(dataset.weights[train_idx], len(train_idx))

In [11]:
set_reproducibility()

train_subset = Subset(dataset, train_idx)
val_subset = Subset(dataset, val_idx)
test_subset = Subset(dataset, test_idx)

train_loader = DataLoader(train_subset, batch_size=batch_size, shuffle=False, sampler=sampler_class_frequency, drop_last=True)
val_loader = DataLoader(val_subset, batch_size=1, shuffle=False, drop_last=True)
test_loader = DataLoader(test_subset, batch_size=1, shuffle=False, drop_last=True)



In [12]:
for i, samples in enumerate(train_loader):
    print(len(np.where(samples[1].numpy() == 0)[0]),
        len(np.where(samples[1].numpy() == 1)[0]),
        len(np.where(samples[1].numpy() == 2)[0]),
        len(np.where(samples[1].numpy() == 3)[0]),
        len(np.where(samples[1].numpy() == 4)[0]), sep = "\t"
    )

7	7	5	4	9
5	3	9	10	5
4	3	11	6	8
6	7	7	4	8
2	10	4	8	8
7	4	6	10	5
4	3	8	8	9
7	6	6	8	5
8	5	8	6	5
9	7	5	6	5
4	8	6	7	7
6	5	10	7	4
8	6	5	8	5
10	6	6	4	6
4	5	6	9	8
9	4	6	5	8
7	7	6	9	3
7	5	5	10	5
5	9	6	8	4
6	3	2	9	12
5	5	5	5	12
7	7	6	4	8
4	6	7	8	7
6	7	6	8	5
9	5	5	6	7
5	7	7	8	5
5	6	9	5	7
4	6	8	6	8
2	11	4	9	6
3	5	9	9	6
5	7	9	5	6
5	11	4	8	4
7	7	7	6	5
7	8	6	4	7
6	3	11	5	7
6	5	7	8	6
6	7	7	7	5
9	7	7	4	5
6	7	7	5	7
4	6	10	3	9
4	8	6	10	4
5	9	7	8	3
4	6	8	8	6
4	6	12	7	3
4	9	5	6	8
6	5	7	8	6
10	6	6	7	3
7	11	7	2	5
6	5	5	5	11
5	5	8	8	6
10	5	7	4	6
8	7	3	3	11
4	5	7	10	6
6	6	5	9	6
4	4	10	7	7
6	6	6	9	5
4	9	5	7	7
7	10	3	4	8
10	8	8	4	2
3	7	8	4	10
5	1	7	8	11
5	8	7	8	4
5	7	6	7	7
11	10	6	3	2
4	5	6	13	4
9	4	8	7	4
6	10	4	6	6
7	5	8	4	8
5	6	5	8	8
5	7	4	9	7
3	9	6	6	8
6	3	8	4	11
5	6	6	4	11
8	3	6	5	10
8	5	8	5	6
6	4	10	4	8
7	6	9	3	7
8	5	3	6	10
10	7	5	4	6
8	9	6	3	6
5	10	6	5	6
2	7	10	9	4
6	10	5	7	4
7	9	5	5	6
10	4	3	7	8
3	9	9	8	3
4	8	9	6	5
4	8	8	4	8
7	6	8	4	7
8	7	7	3	7
5	8	5	7	7
8	10	3	4	7
5	9	4	5	9
4	11	6	7	4
6	8	7	6	5
3	8	8	6	

In [19]:
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
torch.use_deterministic_algorithms(True)
torch.manual_seed(42)
np.random.seed(42)


model = Feedforward(dataset.X.shape[1], hidden_size, dataset.num_classes, af_first_layer, af_hidden_layers, af_output_layer, num_hidden_layers, dropout)
criterion = torch.nn.CrossEntropyLoss() #(weight = torch.Tensor([1,0.8,0.3,0.2,0.9]).cuda())
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
model.to(device)
evaluate_model(model, val_loader, device)
model, loss_values = train_model(model, criterion, optimizer, num_epochs, train_loader, device)
evaluate_model(model, val_loader, device)
plt.plot(loss_values)
plt.title(f"Number of epochs: {num_epochs}")
plt.show()

In [16]:
results = pd.DataFrame(columns = names)

In [80]:
set_reproducibility()


for config_params in itertools.product(*hyperparams.values()):
	*names, = hyperparams
	for param in range(len(hyperparams)):
		globals()[names[param]] = config_params[param]
	model = Feedforward(dataset.X.shape[1], hidden_size, dataset.num_classes, af_first_layer, af_hidden_layers, af_output_layer, num_hidden_layers, dropout)
	criterion = loss_function #(weight = torch.Tensor([1,0.8,0.3,0.2,0.9]).cuda())
	optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
	model.to(device)
	#test_model(model, test_loader, device)
	model, loss_values = train_model(model, criterion, optimizer, num_epochs, train_loader, device)
	test_model(model, test_loader, device)


0
1
2
Epoca: 2, patience: 2424, n stop: 3
Early stopping at epoch number 2!
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       311
           1       0.10      1.00      0.19      1117
           2       0.00      0.00      0.00      4265
           3       0.00      0.00      0.00      4691
           4       0.00      0.00      0.00       394

    accuracy                           0.10     10778
   macro avg       0.02      0.20      0.04     10778
weighted avg       0.01      0.10      0.02     10778

0
1


KeyboardInterrupt: 

In [None]:
test_model(model, test_loader, device)

In [None]:
def predict(row, model):
    row = torch.Tensor([row])
    yhat = model(row)
    yhat = yhat.detach().numpy()
    return yhat