# **IMPORT**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch.functional as F

import json
import os
import torch

os.chdir('F:\\UNIVERSITY\\UNIVERSITY_DOCUMENTS\\CS231\\doan_v2')

from skimage import io, transform
from tqdm import tqdm
from torch import nn
from torch import optim

from sklearn.metrics import jaccard_score, hamming_loss, precision_score, recall_score, f1_score
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multioutput import ClassifierChain, MultiOutputClassifier, MultiOutputRegressor 
from sklearn.decomposition import PCA
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression

from torch.utils.data import Dataset, DataLoader
from torch.nn import Linear, ReLU, Sigmoid, Softmax, Dropout, Sequential

from torchvision import transforms, utils
from torchvision.transforms import ToTensor
from torchvision.models import vgg16, resnet50, densenet169
from torchvision.models.vgg import VGG16_Weights

  from .autonotebook import tqdm as notebook_tqdm


# **FUNCTION**

In [2]:
# Load json
def load_json(path):
    with open(path, 'r', encoding='utf-8') as file:
        film_dic = json.load(file)
        return film_dic

# **DATASET**

In [3]:
classes = [
    'action', 'adventure', 'comedy', 'crime', 'documentary', 'drama', 'fantasy', 'horror', 'mystery', 'thriller', 'romance', 'scifi', 'others'
]

config = {
    'img_size': (224, 224),
    'epochs': 10,
    'batches': 10,
}

In [4]:
class MovieGenreDataset():
    def __init__(
            self, 
            annot_path='data/train_new.json',
            features_dir='features_VGG',
            classes=[
                'action', 'adventure', 'comedy', 'crime', 'documentary', 'drama', 'fantasy', 'horror', 'mystery', 'thriller', 'romance', 'scifi', 'others'
            ],
        ) -> None:
        annotation = load_json(annot_path)
        set_name = os.path.basename(annot_path).split('.')[0]
        
        # Load X
        npy_features_name = os.listdir(features_dir)
        npy_file_names = [f'{id}.npy' for id in annotation.keys() if f'{id}.npy' in npy_features_name]
        npy_file_paths = [os.path.join(features_dir, npy_file_name) for npy_file_name in npy_file_names ]

        self.X = np.array([
            np.load(npy_file_path) 
            for npy_file_path in tqdm(npy_file_paths, desc=f"Loading X_{set_name}")
        ])
        
        # Load y
        mlb = MultiLabelBinarizer()
        mlb.fit([classes])

        genres = [
            val['genre'] 
            for val in tqdm(annotation.values(), desc="Loading y")
        ]
        self.y = mlb.transform(genres)

    def get_data(self):
        return self.X, self.y
        

## *DENSENET*

In [5]:
train_Resnet50_dataset = MovieGenreDataset(
    annot_path='data/train_new.json',
    features_dir='features_Resnet50',
)

val_Resnet50_dataset = MovieGenreDataset(
    annot_path='data/val.json',
    features_dir='features_Resnet50',
)

test_Resnet50_dataset = MovieGenreDataset(
    annot_path='data/test.json',
    features_dir='features_Resnet50',
)

Loading X_train_new: 100%|██████████| 4863/4863 [00:38<00:00, 125.53it/s]
Loading y: 100%|██████████| 4863/4863 [00:00<00:00, 2388955.30it/s]
Loading X_val: 100%|██████████| 2359/2359 [00:11<00:00, 197.57it/s]
Loading y: 100%|██████████| 2359/2359 [00:00<00:00, 786765.52it/s]
Loading X_test: 100%|██████████| 2511/2511 [00:07<00:00, 350.68it/s] 
Loading y: 100%|██████████| 2513/2513 [00:00<00:00, 2508994.51it/s]


In [6]:
X_train_Resnet, y_train_Resnet = train_Resnet50_dataset.get_data()
X_val_Resnet, y_val_Resnet = val_Resnet50_dataset.get_data()
X_test_Resnet, y_test_Resnet = test_Resnet50_dataset.get_data()

# **PCA**

## *DENSENET*

In [7]:
pca_Resnet = PCA(n_components=0.4)
X_train_Resnet_new = pca_Resnet.fit_transform(X_train_Resnet)
X_val_Resnet_new = pca_Resnet.transform(X_val_Resnet)
X_test_Resnet_new = pca_Resnet.transform(X_test_Resnet)

In [11]:
cs = np.cumsum(pca_Resnet.explained_variance_ratio_)
np.argwhere(cs >= 0.4)

array([[66]], dtype=int64)

# **MODELING**

## *MultiOutputClassifier*

In [16]:
# Random Forest
class MultiOutputRegressorRandomForest():
    def __init__(self):
        self.model = None
        self.model_tuning = None
        self.params = {
            'estimator__bootstrap': [True, False],
            'estimator__max_depth': [30, 50, 70, 90, None],
            'estimator__min_samples_leaf': [1, 2, 4],
            'estimator__min_samples_split': [2, 5, 10],
            'estimator__n_estimators': [50, 100],
        }

    def fit(self, X, y):
        print('Fitting model')
        self.model = MultiOutputRegressor(RandomForestRegressor())
        self.model.fit(X, y)

    def fit_tuning(self, X, y):
        print('Fitting tuning model')
        estimator = MultiOutputRegressor(RandomForestRegressor())
        self.model_tuning = RandomizedSearchCV(
            estimator=estimator, 
            param_distributions=self.params, 
            random_state=0,
            n_iter=2,
            verbose=True,
            n_jobs=-1,
        )
        self.model_tuning.fit(X, y)

    def predict(self, X, mode='default'):
        print('Prediction')
        if mode=='default' and self.model != None:
            prediction = self.model.predict(X)
        elif mode=='tune' and self.model_tuning != None:
            prediction = self.model_tuning.predict(X)
        else:
            print('Cant predict the X set !!')
            prediction = np.array([])
        return prediction


In [17]:
# Random Forest
class MultiOutputClassifierLogisticRegression():
    def __init__(self):
        self.model = None
        self.model_tuning = None

    def fit(self, X, y):
        print('Fitting model')
        self.model = MultiOutputClassifier(LogisticRegression())
        self.model.fit(X, y)

    def predict(self, X, mode='default'):
        print('Prediction')
        if mode=='default' and self.model != None:
            prediction = self.model.predict(X)
        elif mode=='tune' and self.model_tuning != None:
            prediction = self.model_tuning.predict(X)
        else:
            print('Cant predict the X set !!')
            prediction = np.array([])
        return prediction


### *No Tuning*

#### *Random Forest*

In [21]:
# Resnet169
rf_resnet = MultiOutputRegressorRandomForest()
rf_resnet.fit(X_train_Resnet_new, y_train_Resnet)

Fitting model


In [22]:
# Predict
prediction_rf_resnet_test = rf_resnet.predict(X_test_Resnet_new)
prediction_rf_resnet_val = rf_resnet.predict(X_val_Resnet_new)
np.save('evaluation/prediction_rf_resnet_test.npy', prediction_rf_resnet_test)
np.save('evaluation/prediction_rf_resnet_val.npy', prediction_rf_resnet_val)

Prediction
Prediction


#### *Logistic Regression*

In [25]:
# Resnet169
logistic_r_resnet = MultiOutputClassifierLogisticRegression()
logistic_r_resnet.fit(X_train_Resnet_new, y_train_Resnet)

Fitting model


In [26]:
# Predict
prediction_logistic_r_resnet_test = logistic_r_resnet.predict(X_test_Resnet_new)
prediction_logistic_r_resnet_val = logistic_r_resnet.predict(X_val_Resnet_new)
np.save('evaluation/prediction_logistic_r_resnet_test.npy', prediction_logistic_r_resnet_test)
np.save('evaluation/prediction_logistic_r_resnet_val.npy', prediction_logistic_r_resnet_val)

Prediction
Prediction


#### *SVC*

In [8]:
# Random Forest
from sklearn.svm import SVC
class MultiOutputClassifierSVC():
    def __init__(self):
        self.model = None
        self.model_tuning = None

    def fit(self, X, y):
        print('Fitting model')
        self.model = MultiOutputClassifier(SVC())
        self.model.fit(X, y)

    def predict(self, X, mode='default'):
        print('Prediction')
        if mode=='default' and self.model != None:
            prediction = self.model.predict(X)
        elif mode=='tune' and self.model_tuning != None:
            prediction = self.model_tuning.predict(X)
        else:
            print('Cant predict the X set !!')
            prediction = np.array([])
        return prediction


In [9]:
svc_resnet = MultiOutputClassifierSVC()
svc_resnet.fit(X_train_Resnet_new, y_train_Resnet)

Fitting model


In [10]:
# # Predict
prediction_svc_resnet_test = svc_resnet.predict(X_test_Resnet_new)
prediction_svc_resnet_val = svc_resnet.predict(X_val_Resnet_new)
np.save('evaluation/prediction_svc_resnet_test.npy', prediction_svc_resnet_test)
np.save('evaluation/prediction_svc_resnet_val.npy', prediction_svc_resnet_val)

Prediction
Prediction
