In [2]:
from transformers import pipeline
from nltk.tokenize import sent_tokenize,word_tokenize
import nltk
import torch
from glob import glob
import pandas as pd

In [32]:
nltk.download("punkt")
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HuyTinh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\HuyTinh\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

# Load Model

In [9]:
model_name = "facebook/bart-large-mnli"
device = 0 if torch.cuda.is_available() else "cpu"

In [10]:
def load_model(device):
    theme_classifier = pipeline(
        task="zero-shot-classification",
        model=model_name,
        device=device  # Set to 0 for GPU, -1 for CPU
    )
    return theme_classifier

In [11]:
theme_classifier = load_model(device)




In [12]:
theme_list = ["firendship","battle", "seft development", "love", "sacrifice", "hope"]

In [13]:
theme_classifier(
    "I gave him a right hook then a left jab",
    theme_list,
    multi_label=True
)

{'sequence': 'I gave him a right hook then a left jab',
 'labels': ['battle',
  'firendship',
  'seft development',
  'hope',
  'sacrifice',
  'love'],
 'scores': [0.9121252298355103,
  0.6396022439002991,
  0.5008586645126343,
  0.08781788498163223,
  0.04500041902065277,
  0.004292412661015987]}

# Load Dataset

In [14]:
files = glob('../data/subtitles/*.ass')

In [15]:
files[:5]

['../data/subtitles\\01.ass',
 '../data/subtitles\\02.ass',
 '../data/subtitles\\03.ass',
 '../data/subtitles\\04.ass',
 '../data/subtitles\\05.ass']

In [16]:
with open(files[0], 'r') as file:
    lines = file.readlines()
    lines = lines[26:]
    lines = [",".join(line.split(",")[9:]) for line in lines]
    lines = [line.replace("\\N","") for line in lines]

In [17]:
lines[:2]

['Wealth, fame, power...\n', 'Gold Roger,the King of the Pirates,\n']

In [18]:
" ".join(lines[:10])

"Wealth, fame, power...\n Gold Roger,the King of the Pirates,\n attained everythingthis world has to offer.\n The words he uttered just beforehis death drove people to the seas.\n My treasure?If you want it, you can have it!\n Find it! I left everythingthis world has to offer there!\n And so men head for theGrand Line in pursuit of their dreams!\n The world has trulyentered a Great Pirate Era!\n We're going to gather upall our dreams\n and set out in searchof something to find\n"

In [19]:
int(files[0].split("\\")[-1].split(".")[0])

1

In [20]:
def load_subtitles_dataset(dataset_path):
    subtitles_path = glob(dataset_path + '/*.ass')
    
    scripts = []
    episode_nums = []
    
    for path in subtitles_path:
        
        #Read lines
        with open(files[0], 'r') as file:
            lines = file.readlines()
            lines = lines[26:]
            lines = [",".join(line.split(",")[9:]) for line in lines]
            
        lines = [line.replace("\\N","") for line in lines]
        script = " ".join(lines)
        
        episode = int(path.split("\\")[-1].split(".")[0])
        
        scripts.append(script)
        episode_nums.append(episode)
        
    
    df = pd.DataFrame.from_dict({"episode": episode_nums, "script": scripts})
    
    return df

In [21]:
dataset_path = "../data/subtitles"
df = load_subtitles_dataset(dataset_path)

In [22]:
df.head()

Unnamed: 0,episode,script
0,1,"Wealth, fame, power...\n Gold Roger,the King o..."
1,2,"Wealth, fame, power...\n Gold Roger,the King o..."
2,3,"Wealth, fame, power...\n Gold Roger,the King o..."
3,4,"Wealth, fame, power...\n Gold Roger,the King o..."
4,5,"Wealth, fame, power...\n Gold Roger,the King o..."


# Run Model

In [23]:
script = df.iloc[0]["script"]

In [24]:
script

'Wealth, fame, power...\n Gold Roger,the King of the Pirates,\n attained everythingthis world has to offer.\n The words he uttered just beforehis death drove people to the seas.\n My treasure?If you want it, you can have it!\n Find it! I left everythingthis world has to offer there!\n And so men head for theGrand Line in pursuit of their dreams!\n The world has trulyentered a Great Pirate Era!\n We\'re going to gather upall our dreams\n and set out in searchof something to find\n ONE PIECE!\n Compasses only cause delays\n Delirious with fever,I take the helm\n If the dusty treasure map hasbeen verified, it\'s not a legend!\n When it comes to personal storms,\n simply ride aboardsomeone else\'s biorhythm\n and pretend it isn\'t there!\n We\'re going to gather upall our dreams\n and set out in searchof something to find\n A coin in my pocket,and do you wanna be my friend?\n We are, We are on the cruise!\n We are!\n Whoa...\n "I\'m Luffy! The Man Who\'sGonna be King of the Pirates!"\n No 

In [33]:
script_sentences = sent_tokenize(script)
script_sentences[:2]

['Wealth, fame, power...\n Gold Roger,the King of the Pirates,\n attained everythingthis world has to offer.',
 'The words he uttered just beforehis death drove people to the seas.']