# Setup


In [17]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
import os

# if ' processing_df_.csv' in os.listdir('../data'):
#     print("Partial dataset found!")
#     # use partial or full dataset
#     df_file_name = 'filtered_df_synopsis.csv'
# else:
# use draft dataset
df_file_name = 'filtered_df.csv'

print(f"Using {df_file_name} as dataset.")
    
try:
    df = pd.read_csv(f'../data/{df_file_name}', low_memory=False)
except FileNotFoundError as e:
    print("File not found. Download the IMDB_Dataset.")
    raise e

pd.options.display.max_columns = None
pd.options.display.max_rows = 100

print(df.columns)
print(df.isnull().sum())

Using filtered_df.csv as dataset.
Index(['Best Picture', 'Certificate (GB)', 'Certificate (US)', 'Genres (1st)',
       'Genres (2nd)', 'Genres (3rd)', 'Genres (full list)',
       'Image Url (Title)', 'IMDB Url (title)', 'Plot', 'Plot (medium)',
       'Production Companies (1st)', 'Production Companies (2nd)',
       'Production Companies (3rd)', 'Production Companies (List)', 'Tagline',
       'Title', 'Title Id', 'What did they do ?', 'Year of Release',
       'IMDB Rating', 'Number Of Votes', 'Runtime (Minutes)', 'Lead Actors'],
      dtype='object')
Best Picture                   17707
Certificate (GB)                3407
Certificate (US)                1466
Genres (1st)                       1
Genres (2nd)                    2842
Genres (3rd)                    8254
Genres (full list)                 1
Image Url (Title)                  1
IMDB Url (title)                   0
Plot                               2
Plot (medium)                   2943
Production Companies (1st)     

# Data


In [18]:
processing_df = df[df['Plot'].notna() & df['Genres (full list)'].notna() & df['Production Companies (1st)'].notna()]
processing_df.drop(['Genres (1st)',
       'Genres (2nd)', 'Genres (3rd)',
       'Image Url (Title)', 'IMDB Url (title)', 'Plot (medium)',
       'Production Companies (List)', 'Production Companies (2nd)',
       'Production Companies (3rd)', 'Tagline',
       'Title Id', 'Year of Release', 'What did they do ?',
        'Number Of Votes', 'Runtime (Minutes)', 'Lead Actors'], axis=1, inplace=True)
print(processing_df.isnull().sum())
processing_df



Best Picture                  17421
Certificate (GB)               3246
Certificate (US)               1380
Genres (full list)                0
Plot                              0
Production Companies (1st)        0
Title                             0
IMDB Rating                       0
dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  processing_df.drop(['Genres (1st)',


Unnamed: 0,Best Picture,Certificate (GB),Certificate (US),Genres (full list),Plot,Production Companies (1st),Title,IMDB Rating
0,,PG,PG,"Adventure,Comedy,Mystery,Romance,Sci-Fi",The seniors return to Earth to visit their rel...,Zanuck/Brown Productions,Cocoon: The Return,5.4
1,,15,R,Comedy,A sendup of all the teen movies that have accu...,Columbia Pictures,Not Another Teen Movie,5.7
2,,PG,PG-13,"Drama,Romance",A chef with a mysterious past spends the day w...,Metanoia Films,Bella,7.1
3,,,PG-13,Drama,Two brothers and their surfing buddies face ne...,Capstone Pictures,Local Boys,6.6
4,,15,R,"Action,Horror,Sci-Fi,Thriller","Sean Barker, unwilling host to an alien bio-ar...",Biomorphs Inc.,Guyver: Dark Hero,5.8
...,...,...,...,...,...,...,...,...
17947,,,Not Rated,"Documentary,Crime",An investigative documentary about the epidemi...,Chain Camera Pictures,The Invisible War,7.6
17948,,18,R,"Documentary,Crime",A documentary of the decline of America. It fe...,Filmlink International,The Killing of America,7.6
17950,,,,"Documentary,History","Featuring never-before-seen footage, this docu...",Firelight Media Inc.,Jonestown: The Life and Death of Peoples Temple,7.8
17951,,15,Not Rated,"Documentary,History,News",The story of two coalitions -- ACT UP and TAG ...,Public Square Films,How to Survive a Plague,7.6


## Listifying


In [19]:
import pandas as pd
import re

# Function to process company names
def alphanumericify(text):
    # Remove non-alphanumeric characters and strip whitespace
    return   re.sub(r'[^A-Za-z0-9 ]+', '', text)


# Split the 'Genres (full list)' column on commas
processing_df['genres'] = processing_df['Genres (full list)'].str.split(',') #.apply(lambda x: [alphanumericify(item) for item in x])
processing_df['genres'] = processing_df['genres'].apply(lambda x: [genre.strip() for genre in x])

# Display the DataFrame to verify the new 'genres' column
print(processing_df[['Title', 'genres']].head())

processing_df




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  processing_df['genres'] = processing_df['Genres (full list)'].str.split(',') #.apply(lambda x: [alphanumericify(item) for item in x])


                    Title                                         genres
0      Cocoon: The Return  [Adventure, Comedy, Mystery, Romance, Sci-Fi]
1  Not Another Teen Movie                                       [Comedy]
2                   Bella                               [Drama, Romance]
3              Local Boys                                        [Drama]
4       Guyver: Dark Hero             [Action, Horror, Sci-Fi, Thriller]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  processing_df['genres'] = processing_df['genres'].apply(lambda x: [genre.strip() for genre in x])


Unnamed: 0,Best Picture,Certificate (GB),Certificate (US),Genres (full list),Plot,Production Companies (1st),Title,IMDB Rating,genres
0,,PG,PG,"Adventure,Comedy,Mystery,Romance,Sci-Fi",The seniors return to Earth to visit their rel...,Zanuck/Brown Productions,Cocoon: The Return,5.4,"[Adventure, Comedy, Mystery, Romance, Sci-Fi]"
1,,15,R,Comedy,A sendup of all the teen movies that have accu...,Columbia Pictures,Not Another Teen Movie,5.7,[Comedy]
2,,PG,PG-13,"Drama,Romance",A chef with a mysterious past spends the day w...,Metanoia Films,Bella,7.1,"[Drama, Romance]"
3,,,PG-13,Drama,Two brothers and their surfing buddies face ne...,Capstone Pictures,Local Boys,6.6,[Drama]
4,,15,R,"Action,Horror,Sci-Fi,Thriller","Sean Barker, unwilling host to an alien bio-ar...",Biomorphs Inc.,Guyver: Dark Hero,5.8,"[Action, Horror, Sci-Fi, Thriller]"
...,...,...,...,...,...,...,...,...,...
17947,,,Not Rated,"Documentary,Crime",An investigative documentary about the epidemi...,Chain Camera Pictures,The Invisible War,7.6,"[Documentary, Crime]"
17948,,18,R,"Documentary,Crime",A documentary of the decline of America. It fe...,Filmlink International,The Killing of America,7.6,"[Documentary, Crime]"
17950,,,,"Documentary,History","Featuring never-before-seen footage, this docu...",Firelight Media Inc.,Jonestown: The Life and Death of Peoples Temple,7.8,"[Documentary, History]"
17951,,15,Not Rated,"Documentary,History,News",The story of two coalitions -- ACT UP and TAG ...,Public Square Films,How to Survive a Plague,7.6,"[Documentary, History, News]"


## Adding plot_keywords

Adding plot_keywords to each movie

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Assuming 'processing_df' is your DataFrame and 'Plot' is the column with plot descriptions
tfidf_vectorizer = TfidfVectorizer(max_features=100, stop_words='english')  # Adjust max_features as needed
tfidf_matrix = tfidf_vectorizer.fit_transform( processing_df['Plot'])

# Get the feature names which represent the keywords
feature_names = tfidf_vectorizer.get_feature_names_out()

# Function to get keywords for each movie
def get_keywords(row, features):
    # Sort indices of nonzero elements
    sorted_indices = row.nonzero()[1]
    # Get corresponding feature names (keywords)
    keywords = [features[i] for i in sorted_indices]
    return keywords

# Extract keywords for each movie
processing_df['plot_keywords'] = [get_keywords(row, feature_names) for row in tfidf_matrix]

# Sort movies by the length of the keyword list
sorted_df = processing_df.sort_values(by='plot_keywords', key=lambda x: x.str.len(), ascending=False)

# Displaying the sorted DataFrame
print(sorted_df[['Title', 'plot_keywords']])


                 Title                                      plot_keywords
12264    Soul Assassin  [future, death, true, dead, crime, father, mot...
1936            Hoovey  [work, true, school, living, story, boy, fathe...
10604         Betrayal  [house, look, dead, gets, small, drug, mother,...
11431           Rancid  [future, tries, work, school, best, murder, ge...
17183   Real Gangsters  [future, work, living, story, crime, finds, ci...
...                ...                                                ...
9881        Studio 666                                                 []
3971   Debt Collectors                                                 []
3975          Galaxina                                                 []
3976         Coneheads                                                 []
11850         Land Ho!                                                 []

[17667 rows x 2 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  processing_df['plot_keywords'] = [get_keywords(row, feature_names) for row in tfidf_matrix]



## Count most common keyword

In [21]:
from collections import Counter

# Flatten the list of keywords from all movies into a single list
all_keywords = sum(processing_df['plot_keywords'].tolist(), [])

# Calculate the frequency of each keyword
keyword_freq = Counter(all_keywords)

# Convert to a DataFrame for easy handling
keyword_freq_df = pd.DataFrame(keyword_freq.items(), columns=['plot_keywords', 'frequency']).sort_values(by='frequency', ascending=False)

# Display the top 10 most frequent keywords
print(keyword_freq_df.head(10))


   plot_keywords  frequency
12         young       2034
52          life       1929
5            new       1544
11           man       1518
28        family       1280
36         world       1188
53         woman       1175
29          love        934
82         story        919
35         group        880


### Score common keywords


In [22]:
# from sklearn.feature_extraction.text import TfidfVectorizer
# import pandas as pd

# # Assuming 'filtered_df' is your DataFrame and 'Plot' is the column with plot descriptions
# tfidf_vectorizer = TfidfVectorizer(stop_words='english')
# tfidf_matrix = tfidf_vectorizer.fit_transform(processing_df['Plot'])

# # Sum tf-idf score for each term across all documents
# sums = tfidf_matrix.sum(axis=0) 
# keywords_with_scores = [(word, sums[0, idx]) for word, idx in tfidf_vectorizer.vocabulary_.items()]

# # Create a DataFrame with the keywords and their scores
# keywords_df = pd.DataFrame(keywords_with_scores, columns=['plot_keywords', 'Score'])

# # Sort the DataFrame by score in descending order to get the most common keywords
# common_keywords = keywords_df.sort_values(by='Score', ascending=False)
# print(common_keywords.head(10))  # Adjust the number to get the top N keywords



In [23]:
# from collections import Counter

# genre_counts = Counter([genre for sublist in filtered_df['genres'] for genre in sublist])
# print(genre_counts)


In [24]:
processing_df.drop(['Genres (full list)', 'Plot'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  processing_df.drop(['Genres (full list)', 'Plot'], axis=1, inplace=True)


In [25]:
processing_df.head(20)

Unnamed: 0,Best Picture,Certificate (GB),Certificate (US),Production Companies (1st),Title,IMDB Rating,genres,plot_keywords
0,,PG,PG,Zanuck/Brown Productions,Cocoon: The Return,5.4,"[Adventure, Comedy, Mystery, Romance, Sci-Fi]",[old]
1,,15,R,Columbia Pictures,Not Another Teen Movie,5.7,[Comedy],[past]
2,,PG,PG-13,Metanoia Films,Bella,7.1,"[Drama, Romance]","[friend, day, mysterious, past]"
3,,,PG-13,Capstone Pictures,Local Boys,6.6,[Drama],[new]
4,,15,R,Biomorphs Inc.,Guyver: Dark Hero,5.8,"[Action, Horror, Sci-Fi, Thriller]","[discover, fight]"
5,,12,PG-13,New Line Cinema,The Guyver,4.9,"[Action, Comedy, Horror, Sci-Fi, Thriller]","[secret, begins, discovers, man, young]"
6,,18,R,Overseas FilmGroup,Drive,6.6,"[Action, Adventure, Sci-Fi, Comedy]","[time, men, run]"
7,,12,PG-13,Broadway Pictures,Black Sheep,6.2,[Comedy],"[brother, make, job]"
8,,15,PG-13,Universal Pictures,Snow Falling on Cedars,6.7,"[Drama, Mystery, Romance, Thriller]",[american]
9,,PG,PG,Dave Bell Associates,The Long Walk Home,7.3,"[Drama, History]",[women]


In [36]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd

training_df = processing_df.copy()

# extract all unique genres
all_genres = set(genre for sublist in training_df['genres'] for genre in sublist)
le_genres = LabelEncoder()
le_genres.fit(list(all_genres))
training_df['genres_encoded'] = training_df['genres'].apply(le_genres.transform)
training_df['genres_decode'] = training_df['genres_encoded'].apply(le_genres.inverse_transform)


all_production_companies = set(company for company in training_df['Production Companies (1st)'])
print(len(all_production_companies))
le_companies = LabelEncoder()
le_companies.fit(list(all_production_companies))
training_df['production_company_encoded'] = le_companies.transform(training_df['Production Companies (1st)'])
# training_df['production_company_decode'] = training_df['production_company_encoded'].apply(le_companies.inverse_transform)


all_plot_keywords = set(genre for sublist in training_df['plot_keywords'] for genre in sublist)
print(len(all_plot_keywords))
le_plot_keywords = LabelEncoder()
le_plot_keywords.fit(list(all_plot_keywords))
training_df['plot_keywords_encoded'] = training_df['plot_keywords'].apply(le_plot_keywords.transform)
training_df['plot_keywords_decode'] = training_df['plot_keywords_encoded'].apply(le_plot_keywords.inverse_transform)



# training_df.drop(['genres', 'production_companies', 'plot_keywords'], axis=1, inplace=True)

training_df.head(20)


8045
100


Unnamed: 0,Best Picture,Certificate (GB),Certificate (US),Production Companies (1st),Title,IMDB Rating,genres,plot_keywords,genres_encoded,genres_decode,production_company_encoded,plot_keywords_encoded,plot_keywords_decode
0,,PG,PG,Zanuck/Brown Productions,Cocoon: The Return,5.4,"[Adventure, Comedy, Mystery, Romance, Sci-Fi]",[old],"[1, 4, 15, 18, 19]","[Adventure, Comedy, Mystery, Romance, Sci-Fi]",8000,[61],[old]
1,,15,R,Columbia Pictures,Not Another Teen Movie,5.7,[Comedy],[past],[4],[Comedy],2294,[63],[past]
2,,PG,PG-13,Metanoia Films,Bella,7.1,"[Drama, Romance]","[friend, day, mysterious, past]","[7, 18]","[Drama, Romance]",5286,"[29, 12, 58, 63]","[friend, day, mysterious, past]"
3,,,PG-13,Capstone Pictures,Local Boys,6.6,[Drama],[new],[7],[Drama],1887,[59],[new]
4,,15,R,Biomorphs Inc.,Guyver: Dark Hero,5.8,"[Action, Horror, Sci-Fi, Thriller]","[discover, fight]","[0, 12, 19, 21]","[Action, Horror, Sci-Fi, Thriller]",1349,"[15, 24]","[discover, fight]"
5,,12,PG-13,New Line Cinema,The Guyver,4.9,"[Action, Comedy, Horror, Sci-Fi, Thriller]","[secret, begins, discovers, man, young]","[0, 4, 12, 19, 21]","[Action, Comedy, Horror, Sci-Fi, Thriller]",5597,"[73, 2, 16, 53, 99]","[secret, begins, discovers, man, young]"
6,,18,R,Overseas FilmGroup,Drive,6.6,"[Action, Adventure, Sci-Fi, Comedy]","[time, men, run]","[0, 1, 19, 4]","[Action, Adventure, Sci-Fi, Comedy]",5832,"[84, 55, 70]","[time, men, run]"
7,,12,PG-13,Broadway Pictures,Black Sheep,6.2,[Comedy],"[brother, make, job]",[4],[Comedy],1651,"[5, 52, 41]","[brother, make, job]"
8,,15,PG-13,Universal Pictures,Snow Falling on Cedars,6.7,"[Drama, Mystery, Romance, Thriller]",[american],"[7, 15, 18, 21]","[Drama, Mystery, Romance, Thriller]",7676,[1],[american]
9,,PG,PG,Dave Bell Associates,The Long Walk Home,7.3,"[Drama, History]",[women],"[7, 11]","[Drama, History]",2593,[93],[women]


In [None]:
training_df.to_csv('../data/processing_df_.csv', index=False)