# Regression using neural network

## 1. Useful import

In [63]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

## 2. Import dataset // Training set

In [64]:
df = pd.read_csv(r"C:\Users\Khéo\OneDrive\IMT\A2\Ue-Journey\Projet\Code\MovieHype\data\arrange.csv")
df = df.drop(columns = ["compagnies_production", "realisateur", "casting", "Mystery", "Unnamed: 0"])

#Date
df['date'] = df['date'].fillna('pas de date')
index_names = df[(df['date'] == 'pas de date')].index
df = df.drop(index=index_names)
df ["date"] = pd.to_datetime(df["date"])
df["year"] = df["date"].dt.year
df["month"] = df["date"].dt.month
df["day"] = df["date"].dt.day


#Import well
df = df.reindex(columns=["titre","year", "month", "day", "date", "duree", "budget", "recette", "casting_score", "realisateur_score", "compagny_score", "suite", "Action", "Adventure", "Animation", "Comedy", "Crime", "Documentary", "Drama", "Family", "Fantasy", "History", "Horror", "Music", "Romance", "Science Fiction", "TV Movie", "Thriller", "War", "Western"])



#Training datatset
bins = 15

recette_price_bins = pd.qcut(df['recette'], q=bins, labels=list(range(bins)))

X_train, X_test, Y_train, Y_test = train_test_split(df.drop(columns=['titre','recette', "date"]), df['recette'], train_size=0.9, random_state=12, stratify=recette_price_bins)

In [65]:
"""
    Constant indicating the proportion of the dataset to use as training set.
"""

TRAINING_SET_RATIO = 0.9


# Let's split the dataset
train_X = X[: int(TRAINING_SET_RATIO * X.shape[0])][:]
train_Y = Y[: int(TRAINING_SET_RATIO * Y.shape[0])][:]
val_X = X[int(TRAINING_SET_RATIO * X.shape[0]) :][:]
val_Y = Y[int(TRAINING_SET_RATIO * Y.shape[0]) :][:]

# Info
print("train_X", train_X.shape)
print("train_Y", train_Y.shape)
print("val_X", val_X.shape)
print("val_Y", val_Y.shape)

train_X (3303, 27)
train_Y (3303,)
val_X (368, 27)
val_Y (368,)


## 3. Setup for pytorch

In [66]:
class Dataset(torch.utils.data.Dataset):
  '''
  Prepare the dataset for regression
  We transform the dataset into tensor and we scaled them too
  '''

  def __init__(self, X_train, Y_train, X_test, Y_test, scale_data=True):
    if not torch.is_tensor(X_train) and not torch.is_tensor(Y_train) and not torch.is_tensor(X_test) and not torch.is_tensor(Y_test):
      # Apply scaling if necessary
      if scale_data:
          X_train = StandardScaler().fit_transform(X_train)
          X_test = StandardScaler().fit_transform(X_test)

      
      self.X_train = torch.FloatTensor(X_train)
      self.Y_train = torch.FloatTensor(Y_train.to_numpy())

      self.X_test = torch.FloatTensor(X_test)
      self.Y_test = torch.FloatTensor(Y_test.to_numpy())

  def __len__(self):
      return len(self.X_train)

  def __getitemtrain__(self, i):
      return self.X_train[i], self.Y_train[i]

  def __getitemtest__(self, i):
      return self.X_test[i], self.Y_test[i]

### Neural network archicture

In [67]:
class MLP(nn.Module):
  '''
    Multilayer Perceptron for regression.
  '''
  def __init__(self):
    super().__init__()
    self.layers = nn.Sequential(
      nn.Linear(27, 64),
      nn.ReLU(),
      nn.Linear(64, 32),
      nn.ReLU(),
      nn.Linear(32, 1)
    )


  def forward(self, x):
    '''
      Forward pass
    '''
    return self.layers(x)

## 4. Instanciating the model

### Device setup

In [68]:
# Check if GPU is available
is_cuda = torch.cuda.is_available()

# Select it as default, or CPU otherwise
print(is_cuda)
if is_cuda:
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

False


### Dataset setup

In [72]:
dataset = torch.utils.data.TensorDataset(X_train,Y_train, X_test, Y_test)
print(dataset.X_train.shape)
print(dataset.Y_train.shape)
trainloader = torch.utils.data.DataLoader(dataset, batch_size=10, shuffle=True, num_workers=0)


TypeError: 'numpy.int32' object is not callable

### Model, Loss function and optimizer

In [70]:
# Initialize the MLP
mlp = MLP()

# Define the loss function and optimizer
loss_function = torch.nn.MSELoss()
optimizer = torch.optim.Adam(mlp.parameters(), lr=1e-4)

### Training

In [71]:
train_losses, test_losses = [], []
n_epochs = 10
# Run the training loop
for epoch in range(0, n_epochs): # 5 epochs at maximum

    # Print epoch
    print(f'Starting epoch {epoch+1}')

    # Set current loss value
    train_loss, test_loss = 0.0, 0.0

    # Iterate over the DataLoader for training data
    for i, data in enumerate(trainloader, 0):
        
        # Get and prepare inputs
        inputs, targets, inputs_test, targets_test =  data
        inputs, targets, inputs_test, targets_test = inputs.float(), targets.float(), inputs_test.float(), targets_test.float()
        targets = targets.reshape((targets.shape[0], 1))
        targets_test = targets_test.reshape((targets_test.shape[0], 1))
        
        # Zero the gradients
        optimizer.zero_grad()
        
        # Perform forward pass
        outputs = mlp(inputs)
        
        # Compute loss
        loss = loss_function(outputs, targets)
        
        # Perform backward pass
        loss.backward()
        
        # Perform optimization
        optimizer.step()
        
        # Print statistics
        train_loss += loss.item()
        train_loss = 0.0
        
        with torch.no_grad():
            mlp.eval()
            pred_ytest = mlp.forward(inputs_test)
            test_loss += torch.sqrt(loss_function(inputs_test, targets_test).item())

        train_losses.append(train_loss / len(trainloader))
        test_losses.append(test_loss / len(trainloader))
# Process is complete.
print('Training process has finished.')

Starting epoch 1


NotImplementedError: 

In [None]:
plt.plot(
    np.array(train_losses).reshape((n_epochs, -1)).mean(axis=1),
    label='Training loss'
)
plt.plot(
    np.array(test_losses).reshape((n_epochs, -1)).mean(axis=1),
    label='Validation loss'
)
plt.legend(frameon=False)
plt.xlabel('epochs')
plt.ylabel('MSE')