In [214]:
import os
import ast
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.preprocessing import MinMaxScaler

from nltk.tokenize import RegexpTokenizer
import re
import nltk

In [66]:
nltk.download('words')
words = set(nltk.corpus.words.words())

[nltk_data] Downloading package words to /Users/rajshah/nltk_data...
[nltk_data]   Package words is already up-to-date!


First, I want to gather my corpus of documents - that will be all of the script data.

In [128]:
df = [] # Initializing empty list to store scene data

# Iterate over all scene files in the directory

for f in os.listdir('Dataset/allScenes'):
    if not f.endswith('.csv'):
        continue
    filename = f.split('_scenes.csv')[0]
    scenes = pd.read_csv('Dataset/allScenes/' + f, index_col=False).total_tokens.values
    full_tokens = []
    full_scenes = []

    for scene in scenes:
        scene_x = ast.literal_eval(scene)
        full_scenes.append([token for token in scene_x if token in words])
        full_tokens += scene_x
    full_tokens = [token for token in full_tokens if token in words]
    df.append({
        'movie_filename': filename,
        'tokens': full_tokens,
        'text': (' ').join(full_tokens),
        'scenes': full_scenes
    })
df = pd.DataFrame(df)

In [129]:
df

Unnamed: 0,movie_filename,tokens,text,scenes
0,Easy-A,"[go, two, town, neighboring, every, video, con...",go two town neighboring every video confess lo...,"[[go, two, town, neighboring, every, video, co..."
1,Killers-Of-The-Flower-Moon-Read-The-Screenplay,"[sacred, teaching, white, bury, gave, pah, gra...",sacred teaching white bury gave pah grandfathe...,"[[sacred, teaching, white, bury, gave, pah, gr..."
2,Cast-Away,"[pretty, gas, way, get, cut, mountain, filter,...",pretty gas way get cut mountain filter engine ...,"[[pretty], [gas, way, get, cut, mountain, filt..."
3,Ghost-Ship,"[work, cabin, mind, friendship, find, know, ma...",work cabin mind friendship find know main talk...,"[[work, cabin, mind, friendship, find, know, m..."
4,Downsizing,"[afraid, know, happen, make, right, born, thin...",afraid know happen make right born thing old g...,"[[afraid, know, happen, make, right, born, thi..."
...,...,...,...,...
791,Bourne-Ultimatum-The,"[radio, give, argument, gun, would, last, ago,...",radio give argument gun would last ago pam thr...,"[[radio, give, argument, gun], [would, last, a..."
792,Happy-Go-Lucky,"[bit, dance, make, test, framing, holding, cel...",bit dance make test framing holding celebrate ...,"[[bit, dance, make, test, framing, holding, ce..."
793,Blind-Side-The,"[investigate, trouble, bit, find, know, grange...",investigate trouble bit find know granger file...,"[[investigate], [trouble, bit, find, know, gra..."
794,Croods-The,"[find, last, hope, would, every, three, fun, n...",find last hope would every three fun neighbor ...,"[[find, last, hope, would, every, three, fun, ..."


In [130]:
tfidf = TfidfVectorizer() # Initializing TF-IDF Vectorizer
tfidf = tfidf.fit(df.text.values) # Fitting the vectorizer 

In [131]:
# Transforming data
X_tf = tfidf.transform(df.text.values)

In [132]:
# Storing data in a DataFrame
n=pd.DataFrame(X_tf.toarray(), columns=tfidf.get_feature_names_out())

Now, I want to look at each individual scene, and give them a score.

In [133]:
full = pd.concat([df, n],axis=1)

In [216]:
scaler = MinMaxScaler()

In [226]:
for index, row in full.iterrows():
    file = 'Dataset/allScenes/' + row.movie_filename + '_scenes.csv'
    
    # Extract importance score based on TF-IDF values

    movieImportance = (row[list(tfidf.get_feature_names_out())].values)
    scenes = row.scenes
    x = (tfidf.transform([' '.join(scene) for scene in scenes]))
    scaledArray = scaler.fit_transform(cosine_similarity(x, X_tf[index]))
    pd_file = pd.read_csv(file)

    # Add importance scores to the DataFrame

    pd_file['importance'] = scaledArray
    pd_file['movie_filename'] = row.movie_filename
    pd_file.to_csv(file, index=False)