In [2]:
from src.genre_classification.F_Basic_Models import Basic_Models
from src.genre_classification.F_Dataset_Downloader import Dataset_Downloader
from src.genre_classification.F_Pretrained_models import Pretrained 
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
dataset_downloader = Dataset_Downloader()
train_path, test_path = dataset_downloader(overwrite=True)

train_data = pd.read_csv(train_path.resolve())
test_data = pd.read_csv(test_path.resolve())

train_data.head()

Unnamed: 0,title,text,genre
0,Ayni - nan,Ayni - nan. Four Peruvian strangers are select...,action_adventure
1,His Kind of Woman - 1951,His Kind of Woman - 1951. A deported gangster'...,suspense_crime
2,Little Alice's Storytime: Through the Looking ...,Little Alice's Storytime: Through the Looking ...,comedy_family
3,Punchi Andare - 2018,Punchi Andare - 2018. A story about a kid who ...,comedy_family
4,He Has Nothing But Kung Fu - 1977,He Has Nothing But Kung Fu - 1977. He's lost h...,action_adventure


In [4]:
x_train, y_train = train_data.drop(columns=["genre"]), train_data["genre"]

x_test, y_test = test_data.drop(columns=["genre"]), test_data["genre"]

unique_labels = sorted(list(set(train_data["genre"])))


# Entrenamiento de los Modelos Basicos

In [None]:
basic_models_name = ['Naive_Bayes', 'LogReg', 'Linear_SVM', 'Random_Forest']

for model_name in basic_models_name:
    model = Basic_Models(model_type= model_name)

    model.fit(x_train, y_train)
    y_hat = model.predict(x_test)
    model.evaluate(y_true= y_test, y_hat=y_hat, labels = set(y_test), evaluate_type= "all_metrics")
    model.save_model(name= model_name)


# Pruebas de Modelos Transformer

In [5]:
train_texts = train_data["text"].tolist() 
test_texts = test_data["text"].tolist()   

model_transformer = Pretrained(model_type="roberta-base", labels =sorted(list(set(train_data["genre"])))) 

model_transformer.fit(train_texts, train_labels = list(train_data["genre"]), batch_size=16,epochs=3, learning_rate=2e-5, weight_decay=0.01)

predictions = model_transformer.transform(test_texts)

model_transformer.save_model_and_tokenizer()


Using device: cuda
Loading model: roberta-base...


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detectadas etiquetas de texto. Convirtiendo a IDs numéricos internamente...


Map: 100%|██████████| 221598/221598 [00:06<00:00, 31931.48 examples/s]



Epoch 1/3


Training: 100%|██████████| 13850/13850 [26:00<00:00,  8.88it/s, loss=1.56] 


Average Loss Epoch 1: 1.0338

Epoch 2/3


Training: 100%|██████████| 13850/13850 [25:51<00:00,  8.93it/s, loss=0.708]


Average Loss Epoch 2: 0.8674

Epoch 3/3


Training: 100%|██████████| 13850/13850 [25:51<00:00,  8.92it/s, loss=0.48] 


Average Loss Epoch 3: 0.7598


Map: 100%|██████████| 55400/55400 [00:01<00:00, 36824.18 examples/s]
Inference: 100%|██████████| 1732/1732 [00:58<00:00, 29.45it/s]


Modelo guardado en ./Models/Modelos_Transformer\roberta-base
Tokenizer guardado en ./Models/Modelos_Transformer\roberta-base_tokenizer


In [12]:
print(y_test )
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, predictions))


0              suspense_crime
1            action_adventure
2            action_adventure
3            action_adventure
4               drama_romance
                 ...         
55395           drama_romance
55396    scifi_horror_fantasy
55397           drama_romance
55398        action_adventure
55399          suspense_crime
Name: genre, Length: 55400, dtype: object
0.0


In [21]:

from src.genre_classification.F_Compute_Metrics import Compute_Metrics

pred_labels = [model_transformer.id2label[idx] for idx in predictions]

print(pred_labels)

metrics = Compute_Metrics(pred_labels, y_test, unique_labels)
resultados = metrics.compute_all()

print(resultados)

['action_adventure', 'scifi_horror_fantasy', 'action_adventure', 'comedy_family', 'action_adventure', 'scifi_horror_fantasy', 'scifi_horror_fantasy', 'drama_romance', 'scifi_horror_fantasy', 'action_adventure', 'suspense_crime', 'suspense_crime', 'drama_romance', 'scifi_horror_fantasy', 'comedy_family', 'drama_romance', 'comedy_family', 'scifi_horror_fantasy', 'comedy_family', 'action_adventure', 'suspense_crime', 'suspense_crime', 'action_adventure', 'scifi_horror_fantasy', 'action_adventure', 'scifi_horror_fantasy', 'scifi_horror_fantasy', 'suspense_crime', 'action_adventure', 'suspense_crime', 'scifi_horror_fantasy', 'scifi_horror_fantasy', 'drama_romance', 'scifi_horror_fantasy', 'action_adventure', 'scifi_horror_fantasy', 'action_adventure', 'comedy_family', 'suspense_crime', 'action_adventure', 'scifi_horror_fantasy', 'drama_romance', 'scifi_horror_fantasy', 'action_adventure', 'drama_romance', 'comedy_family', 'comedy_family', 'suspense_crime', 'suspense_crime', 'suspense_crime'