In [1]:
from transformers import pipeline
from nltk import sent_tokenize
import nltk
import torch
import tensorflow as tf
from tensorflow import keras as tf_keras
from glob import glob
import re
import pandas as pd
import os
import numpy as np




  from .autonotebook import tqdm as notebook_tqdm


In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Kshitij\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# LOAD MODEL 

In [4]:
import torch.cuda


model_name = "facebook/bart-large-mnli"
device = 0 if torch.cuda.is_available() else "cpu" 

In [5]:
device

'cpu'

In [6]:
def load_model(device):
    theme_classifier  = pipeline(
        "zero-shot-classification",
        model = model_name,
        device=device
    )
    return theme_classifier 


In [7]:
theme_classifier = load_model(device)






In [8]:
theme_list = ["friendship","love","betrayal","sacrifice","battle","magic"]

In [9]:
theme_classifier(
    "I gave him a left hook and then a right jab",
    theme_list,
    multi_label = True
)

{'sequence': 'I gave him a left hook and then a right jab',
 'labels': ['battle', 'magic', 'sacrifice', 'betrayal', 'love', 'friendship'],
 'scores': [0.8919000625610352,
  0.45951583981513977,
  0.05627250298857689,
  0.014992475509643555,
  0.014075926505029202,
  0.0071584009565413]}

In [10]:
# load dataset

In [11]:
with open(r'C:\Users\Kshitij\Downloads\NLP\data\Subtitiles\1-Harry.Potter.and.the.Sorcerers.Stone.2001.1080p.BrRip.x264.YIFY ( FIRST TRY)-en.srt', 'r') as file:
    lines = file.readlines()

In [12]:
lines[:4]

['\n',
 '2\n',
 '00:01:22,277 --> 00:01:27,271\n',
 "I should've known that you would\n"]

In [13]:
def extract_dialogue(file_path):
    sub_paths = glob(os.path.join(file_path, '*.srt')) 
    movie_name = []
    scripts = []

    for path in sub_paths:
        dialogue_lines = [] 
        with open(path, 'r', encoding='utf-8') as file:
            for line in file:
                if not re.match(r'^\d+$', line) and not re.match(r'^\d{2}:\d{2}:\d{2}', line):
                    clean_line = line.strip()
                    if clean_line:
                        dialogue_lines.append(clean_line)
        
        script = " ".join(dialogue_lines)
        scripts.append(script)
        
        movie = movie = path.split("\\")[-1].split("1080p")[0].replace(".", " ").strip().replace("-"," ")
        movie_name.append(movie)

    df = pd.DataFrame.from_dict({"Movie": movie_name, "Script": scripts})
    return df



In [14]:
dataset_path =r'C:\Users\Kshitij\Downloads\NLP\data\Subtitiles'


In [15]:
dataset_path

'C:\\Users\\Kshitij\\Downloads\\NLP\\data\\Subtitiles'

In [16]:
df = extract_dialogue(dataset_path)

In [17]:
df

Unnamed: 0,Movie,Script
0,1 Harry Potter and the Sorcerers Stone 2001,"I should've known that you would be here, Prof..."
1,2 Harry Potter and the Chamber of Secrets 2002,"I can't let you out, Hedwig. I'm not allowed t..."
2,3 Harry Potter and the Prisoner of Azkaban 2004,Lumos Maxima. Lumos Maxima. Lumos Maxima. Lumo...
3,4 Harry Potter and the Goblet of Fire 2005,"Bloody kids. How fastidious you've become, Wor..."
4,5 Harry Potter and the Order of the Phoenix 2007,"I don't know about you, it's just too hot toda..."
5,6 Harry Potter and the Half Blood Prince 2009,I killed Sirius Black! He's back. The police a...
6,7 Harry Potter and the Deathly Hallows Part 1 ...,"These are dark times, there is no denying. Our..."
7,8 Harry Potter And The Deathly Hallows Part 2 ...,It's beautiful here. It was our aunt�s. We use...


RUN MODEL

In [18]:
script = df.iloc[0,1]
print(script[:100])
sentences = sent_tokenize(script)

I should've known that you would be here, Professor McGonagall. Good evening, Professor Dumbledore. 


In [19]:
sentences

["I should've known that you would be here, Professor McGonagall.",
 'Good evening, Professor Dumbledore.',
 'Are the rumors true, Albus?',
 "I'm afraid so, professor.",
 'The good and the bad.',
 '- And the boy?',
 '- Hagrid is bringing him.',
 'Is it wise to trust Hagrid with something so important?',
 'Professor, I would trust Hagrid with my life.',
 'Professor Dumbledore, sir.',
 'Professor McGonagall.',
 '- No problems, I trust, Hagrid?',
 '- No, sir.',
 'Little tyke fell asleep as we were flying over Bristol.',
 'Try not to wake him.',
 'There you go.',
 "Do you really think it's safe, leaving him with these people?",
 "I've watched them all day.",
 "They're the worst sort of Muggles.",
 '- They really are... - The only family he has.',
 "He'll be famous.",
 'Every child in our world will know his name.',
 'Exactly.',
 "He's far better off growing up away from all of that.",
 "Until he's ready.",
 'There, there, Hagrid.',
 "It's not really goodbye, after all.",
 'Good luck... ...

MAKING BATCHES FOR OUR MODEL

In [20]:
script_bathces = []
batch_size = 20
for i in range(0,len(sentences),batch_size):
    s20 = " ".join(sentences[i:i+batch_size])
    script_bathces.append(s20)

In [21]:
script_bathces[:3]

["I should've known that you would be here, Professor McGonagall. Good evening, Professor Dumbledore. Are the rumors true, Albus? I'm afraid so, professor. The good and the bad. - And the boy? - Hagrid is bringing him. Is it wise to trust Hagrid with something so important? Professor, I would trust Hagrid with my life. Professor Dumbledore, sir. Professor McGonagall. - No problems, I trust, Hagrid? - No, sir. Little tyke fell asleep as we were flying over Bristol. Try not to wake him. There you go. Do you really think it's safe, leaving him with these people? I've watched them all day. They're the worst sort of Muggles. - They really are... - The only family he has.",
 "He'll be famous. Every child in our world will know his name. Exactly. He's far better off growing up away from all of that. Until he's ready. There, there, Hagrid. It's not really goodbye, after all. Good luck... ...Harry Potter. Up. Get up! Now! Wake up, cousin! We're going to the zoo! - Here he comes, the birthday bo

In [22]:
theme_output = theme_classifier(
    script_bathces[:2],
    theme_list,
    multi_label = True
)

In [23]:
theme_output

[{'sequence': "I should've known that you would be here, Professor McGonagall. Good evening, Professor Dumbledore. Are the rumors true, Albus? I'm afraid so, professor. The good and the bad. - And the boy? - Hagrid is bringing him. Is it wise to trust Hagrid with something so important? Professor, I would trust Hagrid with my life. Professor Dumbledore, sir. Professor McGonagall. - No problems, I trust, Hagrid? - No, sir. Little tyke fell asleep as we were flying over Bristol. Try not to wake him. There you go. Do you really think it's safe, leaving him with these people? I've watched them all day. They're the worst sort of Muggles. - They really are... - The only family he has.",
  'labels': ['sacrifice', 'magic', 'friendship', 'betrayal', 'battle', 'love'],
  'scores': [0.9624725580215454,
   0.8704215884208679,
   0.8313580751419067,
   0.5827594995498657,
   0.5694814920425415,
   0.46515998244285583]},
 {'sequence': "He'll be famous. Every child in our world will know his name. Ex

In [24]:
themes={}
for i in theme_output:
    for label,score in zip(i["labels"],i["scores"]):
        if label not in themes:
            themes[label] = []
        themes[label].append(score)

In [25]:
themes

{'sacrifice': [0.9624725580215454, 0.8515483736991882],
 'magic': [0.8704215884208679, 0.7602370977401733],
 'friendship': [0.8313580751419067, 0.8649079203605652],
 'betrayal': [0.5827594995498657, 0.08585517853498459],
 'battle': [0.5694814920425415, 0.27196866273880005],
 'love': [0.46515998244285583, 0.6542882919311523]}

In [26]:
temp_df = pd.DataFrame.from_dict(themes)
temp_df.head()

Unnamed: 0,sacrifice,magic,friendship,betrayal,battle,love
0,0.962473,0.870422,0.831358,0.582759,0.569481,0.46516
1,0.851548,0.760237,0.864908,0.085855,0.271969,0.654288


MAKE A FUNCTION OF ALL THIS

In [27]:
def get_theme_inference(script):
    sentences = sent_tokenize(script)
    #batches
    script_bathces = []
    batch_size = 20
    for i in range(0,len(sentences),batch_size):
        s20 = " ".join(sentences[i:i+batch_size])
        script_bathces.append(s20)
    #model
    theme_output = theme_classifier(
    script_bathces[:2],
    theme_list,
    multi_label = True
    )
    #data wrangling
    themes={}
    for i in theme_output:
        for label,score in zip(i["labels"],i["scores"]):
            if label not in themes:
                themes[label] = []
            themes[label].append(score)
            
    themes = {key : np.mean(np.array(value)) for key,value in themes.items()}
    
    return themes

In [28]:
themes_new = {key : np.mean(np.array(value)) for key,value in themes.items()}

In [29]:
themes_new

{'sacrifice': 0.9070104658603668,
 'magic': 0.8153293430805206,
 'friendship': 0.848132997751236,
 'betrayal': 0.33430733904242516,
 'battle': 0.4207250773906708,
 'love': 0.5597241371870041}

In [30]:
df = df.head(2)

In [31]:
df

Unnamed: 0,Movie,Script
0,1 Harry Potter and the Sorcerers Stone 2001,"I should've known that you would be here, Prof..."
1,2 Harry Potter and the Chamber of Secrets 2002,"I can't let you out, Hedwig. I'm not allowed t..."


In [32]:
output_themes = df["Script"].apply(get_theme_inference)

In [33]:
output_themes

0    {'sacrifice': 0.9070104658603668, 'magic': 0.8...
1    {'betrayal': 0.5584453716874123, 'sacrifice': ...
Name: Script, dtype: object

In [38]:
theme_df = pd.DataFrame.from_dict(output_themes.tolist())
theme_df

[{'sacrifice': 0.9070104658603668,
  'magic': 0.8153293430805206,
  'friendship': 0.848132997751236,
  'betrayal': 0.33430733904242516,
  'battle': 0.4207250773906708,
  'love': 0.5597241371870041},
 {'betrayal': 0.5584453716874123,
  'sacrifice': 0.8836944997310638,
  'magic': 0.7357389330863953,
  'friendship': 0.751003086566925,
  'battle': 0.4187958389520645,
  'love': 0.3426327407360077}]

In [35]:
df[theme_df.columns] = theme_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[theme_df.columns] = theme_df


In [36]:
df

Unnamed: 0,Movie,Script
0,1 Harry Potter and the Sorcerers Stone 2001,"{'sacrifice': 0.9070104658603668, 'magic': 0.8..."
1,2 Harry Potter and the Chamber of Secrets 2002,"{'betrayal': 0.5584453716874123, 'sacrifice': ..."
