# **IMPORT**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch.functional as F

import json
import os
import torch

os.chdir('F:\\UNIVERSITY\\UNIVERSITY_DOCUMENTS\\CS231\\doan_v2')

from skimage import io, transform
from tqdm import tqdm
from torch import nn
from torch import optim

from sklearn.metrics import jaccard_score
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multioutput import ClassifierChain, MultiOutputClassifier, MultiOutputRegressor 
from sklearn.decomposition import PCA
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor

from torch.utils.data import Dataset, DataLoader
from torch.nn import Linear, ReLU, Sigmoid, Softmax, Dropout, Sequential

from torchvision import transforms, utils
from torchvision.transforms import ToTensor
from torchvision.models import vgg16, resnet50, densenet169
from torchvision.models.vgg import VGG16_Weights

  from .autonotebook import tqdm as notebook_tqdm


# **FUNCTION**

In [2]:
# Load json
def load_json(path):
    with open(path, 'r', encoding='utf-8') as file:
        film_dic = json.load(file)
        return film_dic

# **DATASET**

In [3]:
classes = [
    'action', 'adventure', 'comedy', 'crime', 'documentary', 'drama', 'fantasy', 'horror', 'mystery', 'thriller', 'romance', 'scifi', 'others'
]

config = {
    'img_size': (224, 224),
    'epochs': 10,
    'batches': 10,
}

In [4]:
class MovieGenreDataset():
    def __init__(
            self, 
            annot_path='data/train.json',
            features_dir='features_VGG',
            classes=[
                'action', 'adventure', 'comedy', 'crime', 'documentary', 'drama', 'fantasy', 'horror', 'mystery', 'thriller', 'romance', 'scifi', 'others'
            ],
        ) -> None:
        annotation = load_json(annot_path)
        set_name = os.path.basename(annot_path).split('.')[0]
        
        # Load X
        npy_features_name = os.listdir(features_dir)
        npy_file_names = [f'{id}.npy' for id in annotation.keys() if f'{id}.npy' in npy_features_name]
        npy_file_paths = [os.path.join(features_dir, npy_file_name) for npy_file_name in npy_file_names ]

        self.X = np.array([
            np.load(npy_file_path) 
            for npy_file_path in tqdm(npy_file_paths, desc=f"Loading X_{set_name}")
        ])
        
        # Load y
        mlb = MultiLabelBinarizer()
        mlb.fit([classes])

        genres = [
            val['genre'] 
            for val in tqdm(annotation.values(), desc="Loading y")
        ]
        self.y = mlb.transform(genres)

    def get_data(self):
        return self.X, self.y
        

## *VGG*

In [6]:
train_VGG_dataset = MovieGenreDataset(
    annot_path='data/train_new.json',
    features_dir='features_VGG',
)

val_VGG_dataset = MovieGenreDataset(
    annot_path='data/val.json',
    features_dir='features_VGG',
)

test_VGG_dataset = MovieGenreDataset(
    annot_path='data/test.json',
    features_dir='features_VGG',
)

Loading X_train_new: 100%|██████████| 4863/4863 [01:57<00:00, 41.38it/s]
Loading y: 100%|██████████| 4863/4863 [00:00<00:00, 480832.16it/s]
Loading X_val: 100%|██████████| 2359/2359 [01:02<00:00, 37.93it/s]
Loading y: 100%|██████████| 2359/2359 [00:00<00:00, 589863.07it/s]
Loading X_test: 100%|██████████| 2512/2512 [00:56<00:00, 44.39it/s]
Loading y: 100%|██████████| 2513/2513 [00:00<00:00, 1254795.95it/s]


In [41]:
X_train_old_VGG, y_train_old_VGG = train_old_VGG_dataset.get_data()
X_train_VGG, y_train_VGG = train_VGG_dataset.get_data()
X_val_VGG, y_val_VGG = val_VGG_dataset.get_data()
X_test_VGG, y_test_VGG = test_VGG_dataset.get_data()

# **PCA**

## *VGG*

In [10]:
pca_vgg = PCA(n_components=0.4)
X_train_VGG_new = pca_vgg.fit_transform(X_train_VGG)
X_val_VGG_new = pca_vgg.transform(X_val_VGG)
X_test_VGG_new = pca_vgg.transform(X_test_VGG)

In [12]:
X_val_VGG_new.shape

(2359, 155)

# **MODELING**

## *MultiOutputClassifier*

In [13]:
# Random Forest
class MultiOutputRegressorRandomForest():
    def __init__(self):
        self.model = None
        self.model_tuning = None
        self.params = {
            'estimator__bootstrap': [True, False],
            'estimator__max_depth': [30, 50, 70, 90, None],
            'estimator__min_samples_leaf': [1, 2, 4],
            'estimator__min_samples_split': [2, 5, 10],
            'estimator__n_estimators': [50, 100],
        }

    def fit(self, X, y):
        print('Fitting model')
        self.model = MultiOutputRegressor(RandomForestRegressor())
        self.model.fit(X, y)

    def fit_tuning(self, X, y):
        print('Fitting tuning model')
        estimator = MultiOutputRegressor(RandomForestRegressor())
        self.model_tuning = RandomizedSearchCV(
            estimator=estimator, 
            param_distributions=self.params, 
            random_state=0,
            n_iter=2,
            verbose=True,
            n_jobs=-1,
        )
        self.model_tuning.fit(X, y)

    def predict(self, X, mode='default'):
        print('Prediction')
        if mode=='default' and self.model != None:
            prediction = self.model.predict(X)
        elif mode=='tune' and self.model_tuning != None:
            prediction = self.model_tuning.predict(X)
        else:
            print('Cant predict the X set !!')
            prediction = np.array([])
        return prediction


In [18]:
# Random Forest
from sklearn.linear_model import LogisticRegression
class MultiOutputClassifierLogisticRegression():
    def __init__(self):
        self.model = None
        self.model_tuning = None

    def fit(self, X, y):
        print('Fitting model')
        self.model = MultiOutputClassifier(LogisticRegression())
        self.model.fit(X, y)

    def predict(self, X, mode='default'):
        print('Prediction')
        if mode=='default' and self.model != None:
            prediction = self.model.predict(X)
        elif mode=='tune' and self.model_tuning != None:
            prediction = self.model_tuning.predict(X)
        else:
            print('Cant predict the X set !!')
            prediction = np.array([])
        return prediction


#### *Random Forest*

In [None]:
# VGG169
rf_vgg = MultiOutputRegressorRandomForest()
rf_vgg.fit(X_train_VGG_new, y_train_VGG)

Fitting model
Prediction
Prediction


In [None]:
# Predict
prediction_rf_vgg_test = rf_vgg.predict(X_test_VGG_new)
prediction_rf_vgg_val = rf_vgg.predict(X_val_VGG_new)
np.save('evaluation/prediction_rf_vgg_test.npy', prediction_rf_vgg_test)
np.save('evaluation/prediction_rf_vgg_val.npy', prediction_rf_vgg_val)

#### *Logistic Regression*

In [19]:
# Densenet169
logistic_r_vgg = MultiOutputClassifierLogisticRegression()
logistic_r_vgg.fit(X_train_VGG_new, y_train_VGG)

Fitting model


In [42]:
# Densenet169
logistic_r_vgg_old = MultiOutputClassifierLogisticRegression()
logistic_r_vgg_old.fit(X_train_old_VGG, y_train_old_VGG)

Fitting model


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [43]:
# Predict
prediction_logistic_r_vgg_old_test = logistic_r_vgg_old.predict(X_test_VGG)
prediction_logistic_r_vgg_old_val = logistic_r_vgg_old.predict(X_val_VGG)
np.save('evaluation/prediction_logistic_r_vgg_old_test.npy', prediction_logistic_r_vgg_old_test)
np.save('evaluation/prediction_logistic_r_vgg_old_val.npy', prediction_logistic_r_vgg_old_val)

Prediction
Prediction


In [20]:
# Predict
prediction_logistic_r_vgg_test = logistic_r_vgg.predict(X_test_VGG_new)
prediction_logistic_r_vgg_val = logistic_r_vgg.predict(X_val_VGG_new)
np.save('evaluation/prediction_logistic_r_vgg_test.npy', prediction_logistic_r_vgg_test)
np.save('evaluation/prediction_logistic_r_vgg_val.npy', prediction_logistic_r_vgg_val)

Prediction
Prediction


#### *SVC*

In [None]:
# Random Forest
from sklearn.svm import SVC
class MultiOutputClassifierSVC():
    def __init__(self):
        self.model = None
        self.model_tuning = None

    def fit(self, X, y):
        print('Fitting model')
        self.model = MultiOutputClassifier(SVC())
        self.model.fit(X, y)

    def predict(self, X, mode='default'):
        print('Prediction')
        if mode=='default' and self.model != None:
            prediction = self.model.predict(X)
        elif mode=='tune' and self.model_tuning != None:
            prediction = self.model_tuning.predict(X)
        else:
            print('Cant predict the X set !!')
            prediction = np.array([])
        return prediction


In [31]:
# Densenet169
svc_vgg = MultiOutputClassifierSVC()
svc_vgg.fit(X_train_VGG_new, y_train_VGG)

Fitting model


In [32]:
# Predict
prediction_svc_vgg_test = svc_vgg.predict(X_test_VGG_new)
prediction_svc_vgg_val = svc_vgg.predict(X_val_VGG_new)
np.save('evaluation/prediction_svc_vgg_test.npy', prediction_svc_vgg_test)
np.save('evaluation/prediction_svc_vgg_val.npy', prediction_svc_vgg_val)

Prediction
Prediction
