# Setup


In [43]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
import os

# if ' processing_df_.csv' in os.listdir('../data'):
#     print("Partial dataset found!")
#     # use partial or full dataset
#     df_file_name = 'filtered_df_synopsis.csv'
# else:
# use draft dataset
df_file_name = 'filtered_df.csv'

print(f"Using {df_file_name} as dataset.")
    
try:
    df = pd.read_csv(f'../data/{df_file_name}', low_memory=False)
except FileNotFoundError as e:
    print("File not found. Download the IMDB_Dataset.")
    raise e

pd.options.display.max_columns = None
pd.options.display.max_rows = 100

print(df.columns)
print(df.isnull().sum())

Using filtered_df.csv as dataset.
Index(['Best Picture', 'Certificate (GB)', 'Certificate (US)', 'Genres (1st)',
       'Genres (2nd)', 'Genres (3rd)', 'Genres (full list)',
       'Image Url (Title)', 'IMDB Url (title)', 'Plot', 'Plot (medium)',
       'Production Companies (1st)', 'Production Companies (2nd)',
       'Production Companies (3rd)', 'Production Companies (List)', 'Tagline',
       'Title', 'Title Id', 'What did they do ?', 'Year of Release',
       'IMDB Rating', 'Number Of Votes', 'Runtime (Minutes)', 'Lead Actors'],
      dtype='object')
Best Picture                   17707
Certificate (GB)                3407
Certificate (US)                1466
Genres (1st)                       1
Genres (2nd)                    2842
Genres (3rd)                    8254
Genres (full list)                 1
Image Url (Title)                  1
IMDB Url (title)                   0
Plot                               2
Plot (medium)                   2943
Production Companies (1st)     

# Data


In [44]:
processing_df = df[df['Plot'].notna() & df['Genres (full list)'].notna() & df['Production Companies (1st)'].notna()]

processing_df['Best Picture'] = processing_df['Best Picture'].fillna('None')
processing_df['Certificate (GB)'] = processing_df['Certificate (GB)'].fillna('None')
processing_df['Certificate (US)'] = processing_df['Certificate (US)'].fillna('None')

processing_df.drop(['Genres (1st)',
       'Genres (2nd)', 'Genres (3rd)',
       'IMDB Url (title)', 'Plot (medium)',
       'Production Companies (List)', 'Production Companies (2nd)',
       'Production Companies (3rd)', 'Tagline', 'What did they do ?',
        'Number Of Votes'], axis=1, inplace=True)
print(processing_df.isnull().sum())
processing_df



Best Picture                  0
Certificate (GB)              0
Certificate (US)              0
Genres (full list)            0
Image Url (Title)             0
Plot                          0
Production Companies (1st)    0
Title                         0
Title Id                      0
Year of Release               0
IMDB Rating                   0
Runtime (Minutes)             0
Lead Actors                   0
dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  processing_df['Best Picture'] = processing_df['Best Picture'].fillna('None')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  processing_df['Certificate (GB)'] = processing_df['Certificate (GB)'].fillna('None')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  processing_df['Certificate (US)'] = process

Unnamed: 0,Best Picture,Certificate (GB),Certificate (US),Genres (full list),Image Url (Title),Plot,Production Companies (1st),Title,Title Id,Year of Release,IMDB Rating,Runtime (Minutes),Lead Actors
0,,PG,PG,"Adventure,Comedy,Mystery,Romance,Sci-Fi",https://m.media-amazon.com/images/M/MV5BZjI0YT...,The seniors return to Earth to visit their rel...,Zanuck/Brown Productions,Cocoon: The Return,tt0094890,1988,5.4,116.0,"['Priscilla Ashley Behne', 'Bill Wohrman', 'Ba..."
1,,15,R,Comedy,https://m.media-amazon.com/images/M/MV5BNGM1ND...,A sendup of all the teen movies that have accu...,Columbia Pictures,Not Another Teen Movie,tt0277371,2001,5.7,89.0,"['Lukas Behnken', 'Cody McMains', 'Deon Richmo..."
2,,PG,PG-13,"Drama,Romance",https://m.media-amazon.com/images/M/MV5BNzI4OT...,A chef with a mysterious past spends the day w...,Metanoia Films,Bella,tt0482463,2006,7.1,91.0,"['Lukas Behnken', 'Dominic Colón', 'Hudson Coo..."
3,,,PG-13,Drama,https://m.media-amazon.com/images/M/MV5BMjAzMT...,Two brothers and their surfing buddies face ne...,Capstone Pictures,Local Boys,tt0293357,2002,6.6,102.0,"['Lukas Behnken', 'Travis Aaron Wade', 'Dick D..."
4,,15,R,"Action,Horror,Sci-Fi,Thriller",https://m.media-amazon.com/images/M/MV5BNDlmOT...,"Sean Barker, unwilling host to an alien bio-ar...",Biomorphs Inc.,Guyver: Dark Hero,tt0109965,1994,5.8,118.0,"[""'Evil' Ted Smith"", 'Billi Lee', 'Brian Simps..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
17947,,,Not Rated,"Documentary,Crime",https://m.media-amazon.com/images/M/MV5BMTkyNj...,An investigative documentary about the epidemi...,Chain Camera Pictures,The Invisible War,tt2120152,2012,7.6,93.0,['Kirby Dick']
17948,,18,R,"Documentary,Crime",https://m.media-amazon.com/images/M/MV5BNDIyMj...,A documentary of the decline of America. It fe...,Filmlink International,The Killing of America,tt0157894,1981,7.6,90.0,['Sheldon Renan']
17950,,,,"Documentary,History",https://m.media-amazon.com/images/M/MV5BMjA1ND...,"Featuring never-before-seen footage, this docu...",Firelight Media Inc.,Jonestown: The Life and Death of Peoples Temple,tt0762111,2006,7.8,86.0,['Stanley Nelson']
17951,,15,Not Rated,"Documentary,History,News",https://m.media-amazon.com/images/M/MV5BMTg2NT...,The story of two coalitions -- ACT UP and TAG ...,Public Square Films,How to Survive a Plague,tt2124803,2012,7.6,110.0,['David France']


## Listifying


In [45]:
import re

# Function to process company names
def alphanumericify(text):
    # Remove non-alphanumeric characters and strip whitespace
    return   re.sub(r'[^A-Za-z0-9 ]+', '', text)


# Split the 'Genres (full list)' column on commas
processing_df['genres'] = processing_df['Genres (full list)'].str.split(',') #.apply(lambda x: [alphanumericify(item) for item in x])
# processing_df['genres'] = processing_df['Genres (full list)']
# Display the DataFrame to verify the new 'genres' column
print(processing_df[['Title', 'genres']].head())

processing_df




                    Title                                         genres
0      Cocoon: The Return  [Adventure, Comedy, Mystery, Romance, Sci-Fi]
1  Not Another Teen Movie                                       [Comedy]
2                   Bella                               [Drama, Romance]
3              Local Boys                                        [Drama]
4       Guyver: Dark Hero             [Action, Horror, Sci-Fi, Thriller]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  processing_df['genres'] = processing_df['Genres (full list)'].str.split(',') #.apply(lambda x: [alphanumericify(item) for item in x])


Unnamed: 0,Best Picture,Certificate (GB),Certificate (US),Genres (full list),Image Url (Title),Plot,Production Companies (1st),Title,Title Id,Year of Release,IMDB Rating,Runtime (Minutes),Lead Actors,genres
0,,PG,PG,"Adventure,Comedy,Mystery,Romance,Sci-Fi",https://m.media-amazon.com/images/M/MV5BZjI0YT...,The seniors return to Earth to visit their rel...,Zanuck/Brown Productions,Cocoon: The Return,tt0094890,1988,5.4,116.0,"['Priscilla Ashley Behne', 'Bill Wohrman', 'Ba...","[Adventure, Comedy, Mystery, Romance, Sci-Fi]"
1,,15,R,Comedy,https://m.media-amazon.com/images/M/MV5BNGM1ND...,A sendup of all the teen movies that have accu...,Columbia Pictures,Not Another Teen Movie,tt0277371,2001,5.7,89.0,"['Lukas Behnken', 'Cody McMains', 'Deon Richmo...",[Comedy]
2,,PG,PG-13,"Drama,Romance",https://m.media-amazon.com/images/M/MV5BNzI4OT...,A chef with a mysterious past spends the day w...,Metanoia Films,Bella,tt0482463,2006,7.1,91.0,"['Lukas Behnken', 'Dominic Colón', 'Hudson Coo...","[Drama, Romance]"
3,,,PG-13,Drama,https://m.media-amazon.com/images/M/MV5BMjAzMT...,Two brothers and their surfing buddies face ne...,Capstone Pictures,Local Boys,tt0293357,2002,6.6,102.0,"['Lukas Behnken', 'Travis Aaron Wade', 'Dick D...",[Drama]
4,,15,R,"Action,Horror,Sci-Fi,Thriller",https://m.media-amazon.com/images/M/MV5BNDlmOT...,"Sean Barker, unwilling host to an alien bio-ar...",Biomorphs Inc.,Guyver: Dark Hero,tt0109965,1994,5.8,118.0,"[""'Evil' Ted Smith"", 'Billi Lee', 'Brian Simps...","[Action, Horror, Sci-Fi, Thriller]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17947,,,Not Rated,"Documentary,Crime",https://m.media-amazon.com/images/M/MV5BMTkyNj...,An investigative documentary about the epidemi...,Chain Camera Pictures,The Invisible War,tt2120152,2012,7.6,93.0,['Kirby Dick'],"[Documentary, Crime]"
17948,,18,R,"Documentary,Crime",https://m.media-amazon.com/images/M/MV5BNDIyMj...,A documentary of the decline of America. It fe...,Filmlink International,The Killing of America,tt0157894,1981,7.6,90.0,['Sheldon Renan'],"[Documentary, Crime]"
17950,,,,"Documentary,History",https://m.media-amazon.com/images/M/MV5BMjA1ND...,"Featuring never-before-seen footage, this docu...",Firelight Media Inc.,Jonestown: The Life and Death of Peoples Temple,tt0762111,2006,7.8,86.0,['Stanley Nelson'],"[Documentary, History]"
17951,,15,Not Rated,"Documentary,History,News",https://m.media-amazon.com/images/M/MV5BMTg2NT...,The story of two coalitions -- ACT UP and TAG ...,Public Square Films,How to Survive a Plague,tt2124803,2012,7.6,110.0,['David France'],"[Documentary, History, News]"


## Adding plot_keywords

Adding plot_keywords to each movie

In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Assuming 'processing_df' is your DataFrame and 'Plot' is the column with plot descriptions
tfidf_vectorizer = TfidfVectorizer(max_features=100, stop_words='english')  # Adjust max_features as needed
tfidf_matrix = tfidf_vectorizer.fit_transform( processing_df['Plot'])

# Get the feature names which represent the keywords
feature_names = tfidf_vectorizer.get_feature_names_out()

# Function to get keywords for each movie
def get_keywords(row, features):
    # Sort indices of nonzero elements
    sorted_indices = row.nonzero()[1]
    # Get corresponding feature names (keywords)
    keywords = [features[i] for i in sorted_indices]
    return keywords

# Extract keywords for each movie
processing_df['plot_keywords'] = [get_keywords(row, feature_names) for row in tfidf_matrix]

# Sort movies by the length of the keyword list
sorted_df = processing_df.sort_values(by='plot_keywords', key=lambda x: x.str.len(), ascending=False)

# Displaying the sorted DataFrame
print(sorted_df[['Title', 'plot_keywords']])


                 Title                                      plot_keywords
12264    Soul Assassin  [future, death, true, dead, crime, father, mot...
1936            Hoovey  [work, true, school, living, story, boy, fathe...
10604         Betrayal  [house, look, dead, gets, small, drug, mother,...
11431           Rancid  [future, tries, work, school, best, murder, ge...
17183   Real Gangsters  [future, work, living, story, crime, finds, ci...
...                ...                                                ...
9881        Studio 666                                                 []
3971   Debt Collectors                                                 []
3975          Galaxina                                                 []
3976         Coneheads                                                 []
11850         Land Ho!                                                 []

[17667 rows x 2 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  processing_df['plot_keywords'] = [get_keywords(row, feature_names) for row in tfidf_matrix]



## Count most common keyword

In [47]:
from collections import Counter

# Flatten the list of keywords from all movies into a single list
all_keywords = sum(processing_df['plot_keywords'].tolist(), [])

# Calculate the frequency of each keyword
keyword_freq = Counter(all_keywords)

# Convert to a DataFrame for easy handling
keyword_freq_df = pd.DataFrame(keyword_freq.items(), columns=['plot_keywords', 'frequency']).sort_values(by='frequency', ascending=False)

# Display the top 10 most frequent keywords
print(keyword_freq_df.head(10))


   plot_keywords  frequency
12         young       2034
52          life       1929
5            new       1544
11           man       1518
28        family       1280
36         world       1188
53         woman       1175
29          love        934
82         story        919
35         group        880


In [50]:
processing_df.drop(['Genres (full list)'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  processing_df.drop(['Genres (full list)'], axis=1, inplace=True)


In [51]:
processing_df.head(20)

Unnamed: 0,Best Picture,Certificate (GB),Certificate (US),Image Url (Title),Plot,Production Companies (1st),Title,Title Id,Year of Release,IMDB Rating,Runtime (Minutes),Lead Actors,genres,plot_keywords
0,,PG,PG,https://m.media-amazon.com/images/M/MV5BZjI0YT...,The seniors return to Earth to visit their rel...,Zanuck/Brown Productions,Cocoon: The Return,tt0094890,1988,5.4,116.0,"['Priscilla Ashley Behne', 'Bill Wohrman', 'Ba...","[Adventure, Comedy, Mystery, Romance, Sci-Fi]",[old]
1,,15,R,https://m.media-amazon.com/images/M/MV5BNGM1ND...,A sendup of all the teen movies that have accu...,Columbia Pictures,Not Another Teen Movie,tt0277371,2001,5.7,89.0,"['Lukas Behnken', 'Cody McMains', 'Deon Richmo...",[Comedy],[past]
2,,PG,PG-13,https://m.media-amazon.com/images/M/MV5BNzI4OT...,A chef with a mysterious past spends the day w...,Metanoia Films,Bella,tt0482463,2006,7.1,91.0,"['Lukas Behnken', 'Dominic Colón', 'Hudson Coo...","[Drama, Romance]","[friend, day, mysterious, past]"
3,,,PG-13,https://m.media-amazon.com/images/M/MV5BMjAzMT...,Two brothers and their surfing buddies face ne...,Capstone Pictures,Local Boys,tt0293357,2002,6.6,102.0,"['Lukas Behnken', 'Travis Aaron Wade', 'Dick D...",[Drama],[new]
4,,15,R,https://m.media-amazon.com/images/M/MV5BNDlmOT...,"Sean Barker, unwilling host to an alien bio-ar...",Biomorphs Inc.,Guyver: Dark Hero,tt0109965,1994,5.8,118.0,"[""'Evil' Ted Smith"", 'Billi Lee', 'Brian Simps...","[Action, Horror, Sci-Fi, Thriller]","[discover, fight]"
5,,12,PG-13,https://m.media-amazon.com/images/M/MV5BOTBjZj...,A young man discovers a mechanical device that...,New Line Cinema,The Guyver,tt0101988,1991,4.9,88.0,"[""'Evil' Ted Smith"", 'Brian Simpson', 'David W...","[Action, Comedy, Horror, Sci-Fi, Thriller]","[secret, begins, discovers, man, young]"
6,,18,R,https://m.media-amazon.com/images/M/MV5BNDQ2MD...,"A prototype enhanced human, on the run from Ch...",Overseas FilmGroup,Drive,tt0116147,1997,6.6,100.0,"[""'Evil' Ted Smith"", 'Kenny Richards', 'Dieter...","[Action, Adventure, Sci-Fi, Comedy]","[time, men, run]"
7,,12,PG-13,https://m.media-amazon.com/images/M/MV5BYjc2Nz...,A gubernatorial candidate hires a wormy specia...,Broadway Pictures,Black Sheep,tt0115697,1996,6.2,87.0,"[""'Gypsy' Spheeris"", 'Bruce McGill', 'Andrew B...",[Comedy],"[brother, make, job]"
8,,15,PG-13,https://m.media-amazon.com/images/M/MV5BYWYzN2...,A Japanese-American fisherman is accused of ki...,Universal Pictures,Snow Falling on Cedars,tt0120834,1999,6.7,127.0,"['A. Arthur Takemoto', 'Max Wright', 'Daniel v...","[Drama, Mystery, Romance, Thriller]",[american]
9,,PG,PG,https://m.media-amazon.com/images/M/MV5BMDdhMW...,"Two women, black and white, in 1955 Montgomery...",Dave Bell Associates,The Long Walk Home,tt0100046,1990,7.3,97.0,"['A. Bernard Sneed', 'Charles Hubbard', 'Bobby...","[Drama, History]",[women]


In [52]:
# from sklearn.preprocessing import LabelEncoder
# import pandas as pd

training_df = processing_df.copy()

# # extract all unique genres
# all_genres = set(genre for sublist in training_df['genres'] for genre in sublist)
# print(len(all_genres))
# le_genres = LabelEncoder()
# le_genres.fit(list(all_genres))
# training_df['genres_encoded'] = training_df['genres'].apply(le_genres.transform)


# all_production_companies = set(company for company in training_df['Production Companies (1st)'])
# print(len(all_production_companies))
# le_companies = LabelEncoder()
# le_companies.fit(list(all_production_companies))
# training_df['production_company_encoded'] = le_companies.transform(training_df['Production Companies (1st)'])



# all_plot_keywords = set(genre for sublist in training_df['plot_keywords'] for genre in sublist)
# print(len(all_plot_keywords))
# le_plot_keywords = LabelEncoder()
# le_plot_keywords.fit(list(all_plot_keywords))
# training_df['plot_keywords_encoded'] = training_df['plot_keywords'].apply(le_plot_keywords.transform)

# training_df['genres_decoded'] = training_df['genres_encoded'].apply(le_genres.inverse_transform)
# training_df['production_company_decoded'] = le_companies.inverse_transform(training_df['production_company_encoded'])
# training_df['plot_keywords_decode'] = training_df['plot_keywords_encoded'].apply(le_plot_keywords.inverse_transform)



# training_df.drop(['genres', 'production_companies', 'plot_keywords'], axis=1, inplace=True)

training_df.head(20)


Unnamed: 0,Best Picture,Certificate (GB),Certificate (US),Image Url (Title),Plot,Production Companies (1st),Title,Title Id,Year of Release,IMDB Rating,Runtime (Minutes),Lead Actors,genres,plot_keywords
0,,PG,PG,https://m.media-amazon.com/images/M/MV5BZjI0YT...,The seniors return to Earth to visit their rel...,Zanuck/Brown Productions,Cocoon: The Return,tt0094890,1988,5.4,116.0,"['Priscilla Ashley Behne', 'Bill Wohrman', 'Ba...","[Adventure, Comedy, Mystery, Romance, Sci-Fi]",[old]
1,,15,R,https://m.media-amazon.com/images/M/MV5BNGM1ND...,A sendup of all the teen movies that have accu...,Columbia Pictures,Not Another Teen Movie,tt0277371,2001,5.7,89.0,"['Lukas Behnken', 'Cody McMains', 'Deon Richmo...",[Comedy],[past]
2,,PG,PG-13,https://m.media-amazon.com/images/M/MV5BNzI4OT...,A chef with a mysterious past spends the day w...,Metanoia Films,Bella,tt0482463,2006,7.1,91.0,"['Lukas Behnken', 'Dominic Colón', 'Hudson Coo...","[Drama, Romance]","[friend, day, mysterious, past]"
3,,,PG-13,https://m.media-amazon.com/images/M/MV5BMjAzMT...,Two brothers and their surfing buddies face ne...,Capstone Pictures,Local Boys,tt0293357,2002,6.6,102.0,"['Lukas Behnken', 'Travis Aaron Wade', 'Dick D...",[Drama],[new]
4,,15,R,https://m.media-amazon.com/images/M/MV5BNDlmOT...,"Sean Barker, unwilling host to an alien bio-ar...",Biomorphs Inc.,Guyver: Dark Hero,tt0109965,1994,5.8,118.0,"[""'Evil' Ted Smith"", 'Billi Lee', 'Brian Simps...","[Action, Horror, Sci-Fi, Thriller]","[discover, fight]"
5,,12,PG-13,https://m.media-amazon.com/images/M/MV5BOTBjZj...,A young man discovers a mechanical device that...,New Line Cinema,The Guyver,tt0101988,1991,4.9,88.0,"[""'Evil' Ted Smith"", 'Brian Simpson', 'David W...","[Action, Comedy, Horror, Sci-Fi, Thriller]","[secret, begins, discovers, man, young]"
6,,18,R,https://m.media-amazon.com/images/M/MV5BNDQ2MD...,"A prototype enhanced human, on the run from Ch...",Overseas FilmGroup,Drive,tt0116147,1997,6.6,100.0,"[""'Evil' Ted Smith"", 'Kenny Richards', 'Dieter...","[Action, Adventure, Sci-Fi, Comedy]","[time, men, run]"
7,,12,PG-13,https://m.media-amazon.com/images/M/MV5BYjc2Nz...,A gubernatorial candidate hires a wormy specia...,Broadway Pictures,Black Sheep,tt0115697,1996,6.2,87.0,"[""'Gypsy' Spheeris"", 'Bruce McGill', 'Andrew B...",[Comedy],"[brother, make, job]"
8,,15,PG-13,https://m.media-amazon.com/images/M/MV5BYWYzN2...,A Japanese-American fisherman is accused of ki...,Universal Pictures,Snow Falling on Cedars,tt0120834,1999,6.7,127.0,"['A. Arthur Takemoto', 'Max Wright', 'Daniel v...","[Drama, Mystery, Romance, Thriller]",[american]
9,,PG,PG,https://m.media-amazon.com/images/M/MV5BMDdhMW...,"Two women, black and white, in 1955 Montgomery...",Dave Bell Associates,The Long Walk Home,tt0100046,1990,7.3,97.0,"['A. Bernard Sneed', 'Charles Hubbard', 'Bobby...","[Drama, History]",[women]


In [53]:
training_df.to_csv('../data/training_df.csv', index=False)

In [54]:
# training_df.sort_values(by='Best Picture', ascending=False).head(5)