In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

In [2]:
movies = pd.read_csv('Data/Top_10000_Movies.csv', lineterminator='\n')

In [3]:
movies.genre

0              ['Science Fiction', 'Action', 'Adventure']
1       ['Action', 'Adventure', 'Science Fiction', 'Fa...
2              ['Action', 'Adventure', 'Science Fiction']
3                         ['Action', 'Crime', 'Thriller']
4       ['Comedy', 'Action', 'Adventure', 'Science Fic...
                              ...                        
9995    ['Adventure', 'Animation', 'Comedy', 'Science ...
9996                        ['History', 'Drama', 'Music']
9997               ['Drama', 'Action', 'Thriller', 'War']
9998                        ['Drama', 'Music', 'Romance']
9999                     ['Drama', 'Mystery', 'Thriller']
Name: genre, Length: 10000, dtype: object

In [4]:
#Using this code to turn the genre from strings into list 
movie_genre = [i.strip('[]').split(",") for i in movies.genre]
for movie in (movie_genre):
    for number, genres in enumerate(movie):
        movie[number] = genres.strip("' '")
movies['genre'] =  movie_genre

In [5]:
#removing any movies without overview
movies.dropna(subset = ['overview'], inplace = True)

In [6]:
#removing any movies that have very short overview, which will be useless. 
movies['summary_length'] = [len(i) for i in movies.overview]
print("average movie summary length:", movies.summary_length.agg('mean'))
print("max movie summary length:", movies.summary_length.agg('max'))
print("shortest move summary length:", movies.summary_length.agg('min'))
movies_df = movies[movies['summary_length']>50]

average movie summary length: 281.9732323232323
max movie summary length: 1000
shortest move summary length: 12


In [7]:
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9863 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         9863 non-null   int64  
 1   id                 9863 non-null   int64  
 2   original_language  9863 non-null   object 
 3   original_title     9863 non-null   object 
 4   popularity         9863 non-null   float64
 5   release_date       9839 non-null   object 
 6   vote_average       9863 non-null   float64
 7   vote_count         9863 non-null   int64  
 8   genre              9863 non-null   object 
 9   overview           9863 non-null   object 
 10  revenue            9863 non-null   int64  
 11  runtime            9858 non-null   float64
 12  tagline            7077 non-null   object 
 13  summary_length     9863 non-null   int64  
dtypes: float64(3), int64(5), object(6)
memory usage: 1.1+ MB


In [8]:
genre_list = movies.genre
genre_counting_list =[]
for movie in genre_list:
    [genre_counting_list.append(genre) for genre in movie]
movie_counter = Counter(genre_counting_list)
del movie_counter['']

In [9]:
    from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
genre_df = pd.DataFrame(mlb.fit_transform(movies_df['genre']),columns=mlb.classes_, index=movies_df.id)


In [10]:
df = pd.merge(movies_df, genre_df, how ='left', on ='id')
df

Unnamed: 0.1,Unnamed: 0,id,original_language,original_title,popularity,release_date,vote_average,vote_count,genre,overview,...,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
0,0,580489,en,Venom: Let There Be Carnage,5401.308,2021-09-30,6.8,1736,"[Science Fiction, Action, Adventure]",After finding a host body in investigative rep...,...,0,0,0,0,0,1,0,0,0,0
1,1,524434,en,Eternals,3365.535,2021-11-03,7.1,622,"[Action, Adventure, Science Fiction, Fantasy]",The Eternals are a team of ancient aliens who ...,...,0,0,0,0,0,1,0,0,0,0
2,2,438631,en,Dune,2911.423,2021-09-15,8.0,3632,"[Action, Adventure, Science Fiction]","Paul Atreides, a brilliant and gifted young ma...",...,0,0,0,0,0,1,0,0,0,0
3,3,796499,en,Army of Thieves,2552.437,2021-10-27,6.9,555,"[Action, Crime, Thriller]",A mysterious woman recruits bank teller Ludwig...,...,0,0,0,0,0,0,0,1,0,0
4,4,550988,en,Free Guy,1850.470,2021-08-11,7.8,3493,"[Comedy, Action, Adventure, Science Fiction]",A bank teller called Guy realizes he is a back...,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9858,9995,530,en,A Grand Day Out,9.266,1990-05-18,7.5,594,"[Adventure, Animation, Comedy, Science Fiction...",Wallace and Gromit have run out of cheese and ...,...,0,0,0,0,0,1,0,0,0,0
9859,9996,15934,en,El cantante,10.417,2006-09-12,7.0,80,"[History, Drama, Music]","The rise and fall of salsa singer, Héctor Lavo...",...,1,0,1,0,0,0,0,0,0,0
9860,9997,162215,en,How I Live Now,9.520,2013-09-10,6.6,705,"[Drama, Action, Thriller, War]",An American girl on holiday in the English cou...,...,0,0,0,0,0,0,0,1,1,0
9861,9998,5723,en,Once,9.267,2007-03-23,7.4,990,"[Drama, Music, Romance]",A vacuum repairman moonlights as a street musi...,...,0,0,1,0,1,0,0,0,0,0


In [11]:
df.to_csv("Data/movies_df.csv")

In [12]:
movies_df= movies_df['genre'].str.join('|').str.get_dummies()

In [13]:
df.sort_values('release_date', ascending = False).head(1000)

Unnamed: 0.1,Unnamed: 0,id,original_language,original_title,popularity,release_date,vote_average,vote_count,genre,overview,...,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
6487,6570,640146,en,Ant-Man and the Wasp: Quantumania,11.666,2023-07-27,0.0,0,"[Adventure, Science Fiction, Comedy]",The third film in the Marvel Studios' Ant-Man ...,...,0,0,0,0,0,1,0,0,0,0
3424,3465,594767,en,Shazam! Fury of the Gods,22.952,2023-06-01,0.0,0,"[Comedy, Action, Fantasy]",The further adventures of crime-fighting super...,...,0,0,0,0,0,0,0,0,0,0
7860,7967,447277,en,The Little Mermaid,11.071,2023-05-26,0.0,0,"[Adventure, Family, Fantasy, Romance]",A young mermaid makes a deal with a sea witch:...,...,0,0,0,0,1,0,0,0,0,0
1898,1918,447365,en,Guardians of the Galaxy Vol. 3,32.095,2023-05-03,0.0,0,"[Action, Adventure, Science Fiction, Comedy]",The third film based on Marvel's Guardians of ...,...,0,0,0,0,0,1,0,0,0,0
871,883,76600,en,Avatar 2,58.466,2022-12-14,0.0,0,"[Action, Adventure, Science Fiction, Fantasy]",Twelve years after exploring Pandora and joini...,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5830,5901,744746,en,Caveat,16.193,2020-10-04,6.8,51,"[Horror, Thriller]",A lone drifter suffering from partial memory l...,...,0,1,0,0,0,0,0,1,0,0
5737,5806,435615,en,Possessor Uncut,17.402,2020-10-02,6.4,393,"[Thriller, Science Fiction, Horror, Mystery]","Tasya Vos, an elite corporate assassin, uses b...",...,0,1,0,1,0,1,0,1,0,0
1517,1531,741998,it,Il legame,40.722,2020-10-02,5.5,177,"[Horror, Thriller, Drama]",While visiting her fiancé's mother in southern...,...,0,1,0,0,0,0,0,1,0,0
5846,5917,575417,en,On the Rocks,17.656,2020-10-02,6.2,375,"[Drama, Comedy]","Faced with sudden doubts about her marriage, a...",...,0,0,0,0,0,0,0,0,0,0


In [15]:
movies_df.tagline()

AttributeError: 'DataFrame' object has no attribute 'tagline'