# Setup


In [29]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
import os

# if ' training_df_.csv' in os.listdir('../data'):
#     print("Partial dataset found!")
#     # use partial or full dataset
#     df_file_name = 'filtered_df_synopsis.csv'
# else:
# use draft dataset
df_file_name = 'filtered_df.csv'

print(f"Using {df_file_name} as dataset.")
    
try:
    df = pd.read_csv(f'../data/{df_file_name}', low_memory=False)
except FileNotFoundError as e:
    print("File not found. Download the IMDB_Dataset.")
    raise e


Using filtered_df.csv as dataset.


In [30]:
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

df.columns
# df.shape

Index(['Best Picture', 'Certificate (GB)', 'Certificate (US)', 'Genres (1st)',
       'Genres (2nd)', 'Genres (3rd)', 'Genres (full list)',
       'Image Url (Title)', 'IMDB Url (title)', 'Plot', 'Plot (medium)',
       'Production Companies (1st)', 'Production Companies (2nd)',
       'Production Companies (3rd)', 'Production Companies (List)', 'Tagline',
       'Title', 'Title Id', 'What did they do ?', 'Year of Release',
       'IMDB Rating', 'Number Of Votes', 'Runtime (Minutes)', 'Lead Actors'],
      dtype='object')

# Data


In [31]:
filtered_df = df[df['Plot'].notna()]
filtered_df = df[df['Genres (full list)'].notna()]
print(filtered_df.shape)
filtered_df.isnull().sum()

(17952, 24)


Best Picture                   17706
Certificate (GB)                3407
Certificate (US)                1466
Genres (1st)                       0
Genres (2nd)                    2841
Genres (3rd)                    8253
Genres (full list)                 0
Image Url (Title)                  1
IMDB Url (title)                   0
Plot                               2
Plot (medium)                   2942
Production Companies (1st)       283
Production Companies (2nd)      3777
Production Companies (3rd)      8121
Production Companies (List)      283
Tagline                         4016
Title                              0
Title Id                           0
What did they do ?                 0
Year of Release                    0
IMDB Rating                        0
Number Of Votes                    0
Runtime (Minutes)                  0
Lead Actors                        0
dtype: int64

In [32]:
training_df = df[df['Plot'].notna() & df['Genres (full list)'].notna()]
training_df.drop(['Genres (1st)',
       'Genres (2nd)', 'Genres (3rd)',
       'Image Url (Title)', 'IMDB Url (title)', 'Plot (medium)',
       'Production Companies (1st)', 'Production Companies (2nd)',
       'Production Companies (3rd)', 'Tagline',
       'Title Id', 'Year of Release','What did they do ?',
        'Number Of Votes', 'Runtime (Minutes)', 'Lead Actors'], axis=1, inplace=True)
training_df.columns

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_df.drop(['Genres (1st)',


Index(['Best Picture', 'Certificate (GB)', 'Certificate (US)',
       'Genres (full list)', 'Plot', 'Production Companies (List)', 'Title',
       'IMDB Rating'],
      dtype='object')

## plot_keywords

Adding plot_keywords to each movie

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Assuming 'training_df' is your DataFrame and 'Plot' is the column with plot descriptions
tfidf_vectorizer = TfidfVectorizer(max_features=100, stop_words='english')  # Adjust max_features as needed
tfidf_matrix = tfidf_vectorizer.fit_transform( training_df['Plot'])

# Get the feature names which represent the keywords
feature_names = tfidf_vectorizer.get_feature_names_out()

# Function to get keywords for each movie
def get_keywords(row, features):
    # Sort indices of nonzero elements
    sorted_indices = row.nonzero()[1]
    # Get corresponding feature names (keywords)
    keywords = [features[i] for i in sorted_indices]
    return keywords

# Extract keywords for each movie
training_df['plot_keywords'] = [get_keywords(row, feature_names) for row in tfidf_matrix]

# Sort movies by the length of the keyword list
sorted_df = training_df.sort_values(by='plot_keywords', key=lambda x: x.str.len(), ascending=False)

# Displaying the sorted DataFrame
print(sorted_df[['Title', 'plot_keywords']])


                                            Title  \
12264                               Soul Assassin   
1936                                       Hoovey   
10604                                    Betrayal   
11431                                      Rancid   
17183                              Real Gangsters   
...                                           ...   
7256                                        Live!   
16951  Scotty and the Secret History of Hollywood   
14290                                      Trumbo   
3971                              Debt Collectors   
15719           Ranveer vs. Wild with Bear Grylls   

                                           plot_keywords  
12264  [future, death, true, dead, crime, father, mot...  
1936   [work, true, school, living, story, boy, fathe...  
10604  [house, look, dead, gets, small, drug, mother,...  
11431  [future, tries, work, school, best, murder, ge...  
17183  [future, work, living, story, crime, finds, ci...  
...      

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_df['plot_keywords'] = [get_keywords(row, feature_names) for row in tfidf_matrix]


### Count most common keyword

In [34]:
from collections import Counter

# Flatten the list of keywords from all movies into a single list
all_keywords = sum(training_df['plot_keywords'].tolist(), [])

# Calculate the frequency of each keyword
keyword_freq = Counter(all_keywords)

# Convert to a DataFrame for easy handling
keyword_freq_df = pd.DataFrame(keyword_freq.items(), columns=['plot_keywords', 'frequency']).sort_values(by='frequency', ascending=False)

# Display the top 10 most frequent keywords
print(keyword_freq_df.head(100))


   plot_keywords  frequency
12         young       2064
51          life       1967
5            new       1569
11           man       1546
27        family       1294
35         world       1216
52         woman       1191
28          love        953
81         story        936
34         group        893
36       friends        874
0            old        848
90        school        786
74         finds        761
41         years        733
73          girl        731
25          home        725
64        father        721
65          town        706
24          year        695
32         lives        675
37          high        656
13          time        653
69          city        593
18      american        592
21          help        576
67        mother        575
42          wife        572
57           war        564
40           son        561
93         death        545
4     mysterious        534
75           boy        523
77         small        522
2         friend    

### Score common keywords


In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Assuming 'filtered_df' is your DataFrame and 'Plot' is the column with plot descriptions
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(training_df['Plot'])

# Sum tf-idf score for each term across all documents
sums = tfidf_matrix.sum(axis=0) 
keywords_with_scores = [(word, sums[0, idx]) for word, idx in tfidf_vectorizer.vocabulary_.items()]

# Create a DataFrame with the keywords and their scores
keywords_df = pd.DataFrame(keywords_with_scores, columns=['plot_keywords', 'Score'])

# Sort the DataFrame by score in descending order to get the most common keywords
common_keywords = keywords_df.sort_values(by='Score', ascending=False)
print(common_keywords.head(50))  # Adjust the number to get the top N keywords



     plot_keywords       Score
49           young  265.641278
367           life  253.558840
50             man  224.889802
28             new  205.465025
194         family  193.071249
362          woman  189.262340
223          world  166.403975
192           love  145.986599
212          group  143.296035
864          story  139.145568
293        friends  137.794550
1156        school  132.987681
8              old  124.940240
677           girl  122.968222
598         father  121.375419
596           town  119.156404
676          finds  117.800256
143           home  117.083520
219          lives  109.216498
151           year  107.480960
271           high  106.913930
245          years  106.344270
590         mother  101.204372
104       american  101.119849
87            time   99.891059
633           city   99.846108
247            son   98.451047
242           wife   97.301716
680            boy   96.754925
427            war   96.570090
1400         death   92.839105
160     

In [36]:
from collections import Counter

# Flatten the list of keywords from all movies into a single list
all_keywords = sum(training_df['plot_keywords'].tolist(), [])

# Calculate the frequency of each keyword
keyword_freq = Counter(all_keywords)

# Convert to a DataFrame for easy handling
keyword_freq_df = pd.DataFrame(keyword_freq.items(), columns=['plot_keywords', 'Frequency']).sort_values(by='Frequency', ascending=False)

# Display the top 10 most frequent keywords
print(keyword_freq_df.head(10))

   plot_keywords  Frequency
12         young       2064
51          life       1967
5            new       1569
11           man       1546
27        family       1294
35         world       1216
52         woman       1191
28          love        953
81         story        936
34         group        893


In [41]:
import pandas as pd

# Sample DataFrame
# filtered_df = pd.read_csv('your_data.csv')  # Uncomment this if you're reading from a CSV file

# Split the 'Genres (full list)' column on commas
training_df['genres'] = training_df['Genres (full list)'].str.split(',')
training_df['genres'] = training_df['genres'].apply(lambda x: [genre.strip() for genre in x])
training_df.drop(['Genres (full list)', 'Plot'], axis=1, inplace=True)

# Split the 'Genres (full list)' column on commas
training_df['production_companies'] = training_df['Production Companies (List)'].str.split(',')
training_df['production_companies'] = training_df['production_companies'].apply(lambda x: [company.strip() for company in x])

training_df.drop(['Genres (full list)', 'Plot', 'Production Companies (List)'], axis=1, inplace=True)

# Display the DataFrame to verify the new 'genres' column
print(training_df[['Title', 'genres', 'production_companies']].head())




                    Title                                         genres
0      Cocoon: The Return  [Adventure, Comedy, Mystery, Romance, Sci-Fi]
1  Not Another Teen Movie                                       [Comedy]
2                   Bella                               [Drama, Romance]
3              Local Boys                                        [Drama]
4       Guyver: Dark Hero             [Action, Horror, Sci-Fi, Thriller]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_df['genres'] = training_df['Genres (full list)'].str.split(',')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_df['genres'] = training_df['genres'].apply(lambda x: [genre.strip() for genre in x])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_df.drop(['Genres (full list)', 'Plot'], axis=1, inplace=True)


In [42]:
training_df.columns

Index(['Best Picture', 'Certificate (GB)', 'Certificate (US)',
       'Production Companies (List)', 'Title', 'IMDB Rating', 'plot_keywords',
       'genres'],
      dtype='object')

In [40]:
from collections import Counter

genre_counts = Counter([genre for sublist in filtered_df['genres'] for genre in sublist])
print(genre_counts)


Counter({'Drama': 8875, 'Comedy': 6284, 'Thriller': 5455, 'Horror': 3382, 'Romance': 3364, 'Action': 3310, 'Crime': 2950, 'Adventure': 2170, 'Mystery': 2077, 'Sci-Fi': 1936, 'Fantasy': 1608, 'Documentary': 1505, 'Family': 1257, 'Biography': 1197, 'Music': 839, 'History': 702, 'Sport': 622, 'Animation': 580, 'War': 551, 'Musical': 331, 'Western': 233, 'News': 37, 'Reality-TV': 4, 'Game-Show': 1})


In [44]:
training_df.head(20)

Unnamed: 0,Best Picture,Certificate (GB),Certificate (US),Production Companies (List),Title,IMDB Rating,plot_keywords,genres
0,,PG,PG,Zanuck/Brown Productions; Twentieth Century Fox;,Cocoon: The Return,5.4,[old],"[Adventure, Comedy, Mystery, Romance, Sci-Fi]"
1,,15,R,Columbia Pictures; Original Film; Neal H. Mori...,Not Another Teen Movie,5.7,[past],[Comedy]
2,,PG,PG-13,Metanoia Films; Bella Production; Burnside Ent...,Bella,7.1,"[friend, day, mysterious, past]","[Drama, Romance]"
3,,,PG-13,Capstone Pictures; ;,Local Boys,6.6,[new],[Drama]
4,,15,R,Biomorphs Inc.; L.A. Hero;,Guyver: Dark Hero,5.8,"[discover, fight]","[Action, Horror, Sci-Fi, Thriller]"
5,,12,PG-13,New Line Cinema; The Guyver Productions;,The Guyver,4.9,"[secret, begins, discovers, man, young]","[Action, Comedy, Horror, Sci-Fi, Thriller]"
6,,18,R,Overseas FilmGroup; NEO Motion Pictures;,Drive,6.6,"[time, men]","[Action, Adventure, Sci-Fi, Comedy]"
7,,12,PG-13,Broadway Pictures; Paramount Pictures;,Black Sheep,6.2,"[brother, make, job]",[Comedy]
8,,15,PG-13,Universal Pictures; The Kennedy/Marshall Compa...,Snow Falling on Cedars,6.7,[american],"[Drama, Mystery, Romance, Thriller]"
9,,PG,PG,Dave Bell Associates; New Visions Pictures;,The Long Walk Home,7.3,[women],"[Drama, History]"
