In [1]:
from models import ModelMgr
import kagglehub
import ast
import pandas as pd

from models.semantic_validation import LLaMAValidationModel

from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import train_test_split

from tqdm import tqdm
tqdm.pandas()

In [2]:
lvm = LLaMAValidationModel(ModelMgr())
lvm

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

<models.semantic_validation.LLaMA.LLaMAValidationModel at 0x7524106f1c10>

In [3]:
# Download latest version
path = kagglehub.dataset_download("rounakbanik/the-movies-dataset")

print("Path to dataset files:", path)

Path to dataset files: /home/nico/.cache/kagglehub/datasets/rounakbanik/the-movies-dataset/versions/7


In [4]:
# df_ratings_small = pd.read_csv(f"{path}/ratings_small.csv")
# df_links = pd.read_csv(f"{path}/links.csv")
# df_links_small = pd.read_csv(f"{path}/links_small.csv")
# df_credits = pd.read_csv(f"{path}/credits.csv")
# df_ratings = pd.read_csv(f"{path}/ratings.csv")

df_movies_metadata = pd.read_csv(f"{path}/movies_metadata.csv", low_memory=False, index_col="id").drop(index='1997-08-20').drop(index="2012-09-29")

df_keywords = pd.read_csv(f"{path}/keywords.csv", index_col="id")
df_keywords.index = df_keywords.index.astype(str)

In [5]:
df_keywords_map = df_keywords.copy()
df_keywords_map["keywords"] = df_keywords_map["keywords"].apply(lambda x: {y["name"] for y in ast.literal_eval(x)})
df_keywords_map

Unnamed: 0_level_0,keywords
id,Unnamed: 1_level_1
862,"{toy, friendship, boy next door, new toy, toy ..."
8844,"{disappearance, recluse, board game, new home,..."
15602,"{duringcreditsstinger, best friend, fishing, o..."
31357,"{based on novel, interracial relationship, div..."
11862,"{contraception, midlife crisis, mother daughte..."
...,...
439050,{tragic love}
111109,"{pinoy, play, artist}"
67758,{}
227506,{}


In [6]:
def assign_random_keywords(current_keywords: set, df_keywords_pool: pd.DataFrame):
    selected_keywords = df_keywords_pool.sample(1)["keywords"].values[0]
    while not selected_keywords.isdisjoint(current_keywords):
        selected_keywords: set = df_keywords_pool.sample(1)["keywords"].values[0]
    return selected_keywords

In [7]:
df_true, df_false = train_test_split(df_keywords_map, test_size=0.5)

df_true["class"] = True

df_false["keywords"] = df_false["keywords"].apply(lambda x: assign_random_keywords(x, df_true))
df_false["class"] = False

In [11]:
df_full = pd.concat([df_true, df_false])\
    .merge(df_movies_metadata, left_index=True, right_index=True)\
    [["keywords", "overview", "original_title", "class"]]

df_full = df_full[df_full["keywords"].apply(lambda x: len(x) > 0)].sample(frac=1)

df_full.head()

Unnamed: 0_level_0,keywords,overview,original_title,class
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
56338,{woman director},7 Khoon Maaf movie is a romantic misadventures...,7 Khoon Maaf,False
78265,"{gangster, criminal mastermind}","Elizabeth is losing her sight, so while her lo...",This Woman Is Dangerous,True
9540,"{voyeurism, prostitution, drug, spectacle, twi...","Elliot, a successful gynecologist, works at th...",Dead Ringers,True
132379,{silent film},A magical glowing white motorcar dismembers po...,The '?' Motorist,True
25518,{woman director},"As part of an intergalactic coalition, a well-...",La Belle verte,True


In [12]:
def eval(df, prompt_creation):
    df["prompt"] = df.apply(prompt_creation, axis=1)
    df["prediction"] = df["prompt"].progress_apply(lambda x: lvm(x))
    print(classification_report(df["class"], df["prediction"]))
    return f1_score(df["class"], df["prediction"])

In [14]:
score = eval(df_full.head(200).copy(), lambda x: f"Do these keywords: {', '.join(x["keywords"])} describe this movie: \"{x["original_title"]}\"?")
print(score)

100%|██████████| 200/200 [00:09<00:00, 20.05it/s]

              precision    recall  f1-score   support

       False       0.61      0.89      0.72       105
        True       0.74      0.37      0.49        95

    accuracy                           0.64       200
   macro avg       0.68      0.63      0.61       200
weighted avg       0.67      0.64      0.61       200

0.49295774647887325





In [15]:
score = eval(df_full.head(200).copy(), lambda x: f"Do these: {', '.join(x["keywords"])} describe this: \"{x["original_title"]}\"?")
print(score)

100%|██████████| 200/200 [00:10<00:00, 19.03it/s]

              precision    recall  f1-score   support

       False       0.59      0.90      0.71       105
        True       0.74      0.31      0.43        95

    accuracy                           0.62       200
   macro avg       0.67      0.61      0.57       200
weighted avg       0.66      0.62      0.58       200

0.43283582089552236





In [16]:
score = eval(df_full.head(200).copy(), lambda x: f"Do these keywords: {', '.join(x["keywords"])} describe a movie with this description: \"{x["overview"]}\"?")
print(score)

100%|██████████| 200/200 [00:11<00:00, 17.29it/s]

              precision    recall  f1-score   support

       False       0.60      0.90      0.72       105
        True       0.76      0.34      0.47        95

    accuracy                           0.64       200
   macro avg       0.68      0.62      0.59       200
weighted avg       0.68      0.64      0.60       200

0.46715328467153283





In [17]:
score = eval(df_full.head(200).copy(), lambda x: f"Do these: {', '.join(x["keywords"])} describe this: \"{x["overview"]}\"?")
print(score)

100%|██████████| 200/200 [00:11<00:00, 17.82it/s]

              precision    recall  f1-score   support

       False       0.60      0.75      0.67       105
        True       0.62      0.45      0.52        95

    accuracy                           0.61       200
   macro avg       0.61      0.60      0.60       200
weighted avg       0.61      0.61      0.60       200

0.524390243902439



