### 2023 Data

In [1]:
import pandas as pd
import numpy as np

In [7]:
link = "https://en.wikipedia.org/wiki/List_of_American_films_of_2023" # fetching all the 2023 movies from wikipedia
df1 = pd.read_html(link, header=0)[2]
df2 = pd.read_html(link, header=0)[3]
df3 = pd.read_html(link, header=0)[4]
df4 = pd.read_html(link, header=0)[5]

In [8]:
df = pd.concat([df1,df2,df3,df4],ignore_index=True) #concat all the dataframes in a single for preprocessing
df

Unnamed: 0,Opening,Opening.1,Title,Production company,Cast and crew,Ref.
0,J A N U A R Y,6,M3GAN,Universal Pictures / Blumhouse Productions / A...,Gerard Johnstone (director); Akela Cooper (scr...,[3]
1,J A N U A R Y,6,The Old Way,Saban Films / Saturn Films,Brett Donowho (director); Carl W. Lucas (scree...,[4]
2,J A N U A R Y,11,The Devil Conspiracy,Samuel Goldwyn Films,Nathan Frankowski (director); Ed Alan (screenp...,[5]
3,J A N U A R Y,13,Plane,Lionsgate / MadRiver Pictures / Di Bonaventura...,Jean-François Richet (director); Charles Cummi...,[6]
4,J A N U A R Y,13,House Party,Warner Bros. Pictures / New Line Cinema,"Calmatic (director); Jamal Olori, Stephen Glov...",[7]
...,...,...,...,...,...,...
338,D E C E M B E R,22,Memory,Ketchup Entertainment / Mubi,Michel Franco (director/screenplay); Jessica C...,[325]
339,D E C E M B E R,25,The Color Purple,Warner Bros. Pictures / Amblin Entertainment /...,"Blitz Bazawule (director), Marcus Gardley (scr...",[326]
340,D E C E M B E R,25,The Boys in the Boat,Metro-Goldwyn-Mayer / Smokehouse Pictures,"George Clooney (director), Mark L. Smith (scre...",[327]
341,D E C E M B E R,25,Ferrari,Neon / STXfilms / Ketchup Entertainment,"Michael Mann (director), Troy Kennedy Martin (...",[328]


In [9]:
df.columns

Index(['Opening', 'Opening.1', 'Title', 'Production company', 'Cast and crew',
       'Ref.'],
      dtype='object')

#### As it is observed in the above table, we do not have the genre column so we will get the genre data from IMDb. You will need to create an API key in order to access the API key.

### pip install tmdbv3api

In [10]:
#get your API Key here: https://www.themoviedb.org/settings/api
from tmdbv3api import TMDb
import json
import requests
tmdb = TMDb()

api = 'cf1e5b9d8cf31e19913dd6bc256abb2a'
tmdb.api_key=api # insert your api key here


In [11]:
import requests

from tmdbv3api import Movie

tmdb_movie = Movie()

def get_genre(x):
    genres = []
    
    try:
        # Search for the movie title
        result = tmdb_movie.search(x)
        
        # Check if any results were found
        if not result:
            return 'No results found'

        # Get the movie ID from the first result
        movie_id = result[0].id
        
        # Fetch movie details using the movie ID
        response = requests.get(f'https://api.themoviedb.org/3/movie/{movie_id}?api_key={tmdb.api_key}')
        data_json = response.json()
        
        # Check if the movie has genres
        if 'genres' in data_json and data_json['genres']:
            for genre in data_json['genres']:
                genres.append(genre['name'])
            return ' '.join(genres)  # Return genres as a space-separated string
        else:
            return np.nan
    
    except Exception as e:
        # Handle any exceptions (e.g., connection issues, API errors)
        return f'Error: {str(e)}'

# Example usage to map genres to the DataFrame
df['genres'] = df['Title'].map(lambda x: get_genre(x))


In [12]:
# df['genres']= df['Title'].map(lambda x : get_genre(x))

df.head()

Unnamed: 0,Opening,Opening.1,Title,Production company,Cast and crew,Ref.,genres
0,J A N U A R Y,6,M3GAN,Universal Pictures / Blumhouse Productions / A...,Gerard Johnstone (director); Akela Cooper (scr...,[3],Science Fiction Horror
1,J A N U A R Y,6,The Old Way,Saban Films / Saturn Films,Brett Donowho (director); Carl W. Lucas (scree...,[4],Western Drama
2,J A N U A R Y,11,The Devil Conspiracy,Samuel Goldwyn Films,Nathan Frankowski (director); Ed Alan (screenp...,[5],Horror Fantasy Science Fiction Thriller
3,J A N U A R Y,13,Plane,Lionsgate / MadRiver Pictures / Di Bonaventura...,Jean-François Richet (director); Charles Cummi...,[6],Action Adventure Thriller
4,J A N U A R Y,13,House Party,Warner Bros. Pictures / New Line Cinema,"Calmatic (director); Jamal Olori, Stephen Glov...",[7],Comedy


In [13]:
import re
import pandas as pd

def extract_info2(text):
    # Check if the text is None or not a string
    if not isinstance(text, str):
        return pd.Series([None, None, None, None])
    
    # Initialize the director to None
    director_name = None
    # Initialize the actors to None
    actor_1, actor_2, actor_3 = None, None, None
    
    # Check if the text contains a director's name and role(s)
    director_match = re.search(r'([a-zA-Z\s]+) \(([^)]+)\)', text)  # Extract name and roles inside parentheses
    if director_match:
        name = director_match.group(1).strip()  # Extract the name (e.g., Michael Moore)
        roles = director_match.group(2).strip()  # Extract the roles (e.g., director/screenplay/narrator)
        
        # If 'director' is in the roles, assign only the name to the director column
        if 'director' in roles:
            director_name = name  # Only assign the name to the director column if 'director' is present in the roles
            
            # Remove the director's name from the actors list to avoid duplication
            text = text.replace(name + " (" + roles + ")", "")  # Remove the entire director's entry

    # Extract actors from the cast and crew field (after the last semicolon)
    actors = re.split(r';', text)[-1].strip().split(', ')  # Split by semicolon and take actors from the last part
    
    # Remove the director from the actors list (if it was listed there)
    actors = [actor for actor in actors if actor != director_name]
    
    # Assign actors if they exist
    actor_1 = actors[0] if len(actors) > 0 else None
    actor_2 = actors[1] if len(actors) > 1 else None
    actor_3 = actors[2] if len(actors) > 2 else None

    # Return the extracted information in a pandas Series
    return pd.Series([director_name, actor_1, actor_2, actor_3])

In [14]:
df[['director_name', 'actor_1_name', 'actor_2_name', 'actor_3_name']] = df['Cast and crew'].apply(extract_info2)


In [15]:
df=df.rename(columns={'Title':'movie_title'})
df.head(2)

Unnamed: 0,Opening,Opening.1,movie_title,Production company,Cast and crew,Ref.,genres,director_name,actor_1_name,actor_2_name,actor_3_name
0,J A N U A R Y,6,M3GAN,Universal Pictures / Blumhouse Productions / A...,Gerard Johnstone (director); Akela Cooper (scr...,[3],Science Fiction Horror,Gerard Johnstone,Allison Williams,Violet McGraw,Amie Donald
1,J A N U A R Y,6,The Old Way,Saban Films / Saturn Films,Brett Donowho (director); Carl W. Lucas (scree...,[4],Western Drama,Brett Donowho,Nicolas Cage,Ryan Kiera Armstrong,


In [40]:
new_df23 = df[['director_name','actor_1_name','actor_2_name','actor_3_name','genres','movie_title']]
new_df23

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title
0,Gerard Johnstone,Allison Williams,Violet McGraw,Amie Donald,Science Fiction Horror,M3GAN
1,Brett Donowho,Nicolas Cage,Ryan Kiera Armstrong,,Western Drama,The Old Way
2,Nathan Frankowski,Alice Orr-Ewing,Joe Doyle,Eveline Hall,Horror Fantasy Science Fiction Thriller,The Devil Conspiracy
3,ois Richet,Gerard Butler,Mike Colter,Yoson An,Action Adventure Thriller,Plane
4,Calmatic,Tosin Cole,Jacob Latimore,Karen Obilom,Comedy,House Party
...,...,...,...,...,...,...
338,Michel Franco,Jessica Chastain,Peter Sarsgaard,Merritt Wever,Action Thriller Crime,Memory
339,Blitz Bazawule,Fantasia Barrino,Taraji P. Henson,Danielle Brooks,Drama,The Color Purple
340,George Clooney,Callum Turner,Joel Edgerton,Peter Guinness,Drama History,The Boys in the Boat
341,Michael Mann,Adam Driver,Penelope Cruz,Shailene Woodley,History Drama,Ferrari


In [41]:
new_df23.shape

(343, 6)

In [42]:
new_df23.isna().sum()

director_name     3
actor_1_name      0
actor_2_name      4
actor_3_name     20
genres            0
movie_title       0
dtype: int64

In [43]:
new_df23.dropna(thresh=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df23.dropna(thresh=1,inplace=True)


In [44]:
new_df23.isna().sum()

director_name     3
actor_1_name      0
actor_2_name      4
actor_3_name     20
genres            0
movie_title       0
dtype: int64

In [45]:
new_df23.fillna('unknown',inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df23.fillna('unknown',inplace=True)


In [46]:
new_df23.isna().sum()

director_name    0
actor_1_name     0
actor_2_name     0
actor_3_name     0
genres           0
movie_title      0
dtype: int64

In [47]:
new_df23['movie_title']= new_df23['movie_title'].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df23['movie_title']= new_df23['movie_title'].str.lower()


In [48]:
new_df23.sample(2)

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title
260,Martin Scorsese,Leonardo DiCaprio,Robert De Niro,Lily Gladstone,Crime History Drama,killers of the flower moon
113,Kyle Patrick Alvarez,Isaiah Russell-Bailey,Mckenna Grace,Billy Barratt,Science Fiction Action Adventure Family,crater


In [49]:

new_df23['comb'] = new_df23['actor_1_name'] + ' ' + new_df23['actor_2_name'] + ' '+ new_df23['actor_3_name'] + ' '+ new_df23['director_name'] +' ' + new_df23['genres']
new_df23

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df23['comb'] = new_df23['actor_1_name'] + ' ' + new_df23['actor_2_name'] + ' '+ new_df23['actor_3_name'] + ' '+ new_df23['director_name'] +' ' + new_df23['genres']


Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title,comb
0,Gerard Johnstone,Allison Williams,Violet McGraw,Amie Donald,Science Fiction Horror,m3gan,Allison Williams Violet McGraw Amie Donald Ger...
1,Brett Donowho,Nicolas Cage,Ryan Kiera Armstrong,unknown,Western Drama,the old way,Nicolas Cage Ryan Kiera Armstrong unknown Bret...
2,Nathan Frankowski,Alice Orr-Ewing,Joe Doyle,Eveline Hall,Horror Fantasy Science Fiction Thriller,the devil conspiracy,Alice Orr-Ewing Joe Doyle Eveline Hall Nathan ...
3,ois Richet,Gerard Butler,Mike Colter,Yoson An,Action Adventure Thriller,plane,Gerard Butler Mike Colter Yoson An ois Richet ...
4,Calmatic,Tosin Cole,Jacob Latimore,Karen Obilom,Comedy,house party,Tosin Cole Jacob Latimore Karen Obilom Calmati...
...,...,...,...,...,...,...,...
338,Michel Franco,Jessica Chastain,Peter Sarsgaard,Merritt Wever,Action Thriller Crime,memory,Jessica Chastain Peter Sarsgaard Merritt Wever...
339,Blitz Bazawule,Fantasia Barrino,Taraji P. Henson,Danielle Brooks,Drama,the color purple,Fantasia Barrino Taraji P. Henson Danielle Bro...
340,George Clooney,Callum Turner,Joel Edgerton,Peter Guinness,Drama History,the boys in the boat,Callum Turner Joel Edgerton Peter Guinness Geo...
341,Michael Mann,Adam Driver,Penelope Cruz,Shailene Woodley,History Drama,ferrari,Adam Driver Penelope Cruz Shailene Woodley Mic...


In [50]:
old = pd.read_csv('last1_data.csv')

old.sample(2)

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,movie_title,genres,comb
1807,Matt Reeves,Mike Vogel,Ben Feldman,Liza Lapira,cloverfield,Action Adventure Horror Sci-Fi,Mike Vogel Ben Feldman Liza Lapira Matt Reeves...
6114,Taylor Swift,Jack Antonoff,Aaron Dessner,Justin Vernon,folklore: the long pond studio sessions,Music Documentary,Jack Antonoff Aaron Dessner Justin Vernon Tayl...


In [51]:
final = pd.concat([old,new_df23],ignore_index=True)
final

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,movie_title,genres,comb
0,James Cameron,CCH Pounder,Joel David Moore,Wes Studi,avatar,Action Adventure Fantasy Sci-Fi,CCH Pounder Joel David Moore Wes Studi James C...
1,Gore Verbinski,Johnny Depp,Orlando Bloom,Jack Davenport,pirates of the caribbean: at world's end,Action Adventure Fantasy,Johnny Depp Orlando Bloom Jack Davenport Gore ...
2,Sam Mendes,Christoph Waltz,Rory Kinnear,Stephanie Sigman,spectre,Action Adventure Thriller,Christoph Waltz Rory Kinnear Stephanie Sigman ...
3,Christopher Nolan,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,the dark knight rises,Action Thriller,Tom Hardy Christian Bale Joseph Gordon-Levitt ...
4,Doug Walker,Doug Walker,Rob Walker,unknown,star wars: episode vii - the force awakens ...,Documentary,Doug Walker Rob Walker unknown Doug Walker Doc...
...,...,...,...,...,...,...,...
7197,Michel Franco,Jessica Chastain,Peter Sarsgaard,Merritt Wever,memory,Action Thriller Crime,Jessica Chastain Peter Sarsgaard Merritt Wever...
7198,Blitz Bazawule,Fantasia Barrino,Taraji P. Henson,Danielle Brooks,the color purple,Drama,Fantasia Barrino Taraji P. Henson Danielle Bro...
7199,George Clooney,Callum Turner,Joel Edgerton,Peter Guinness,the boys in the boat,Drama History,Callum Turner Joel Edgerton Peter Guinness Geo...
7200,Michael Mann,Adam Driver,Penelope Cruz,Shailene Woodley,ferrari,History Drama,Adam Driver Penelope Cruz Shailene Woodley Mic...


In [52]:
final.isna().sum()

director_name    2
actor_1_name     0
actor_2_name     0
actor_3_name     0
movie_title      0
genres           0
comb             0
dtype: int64

In [53]:
final.dropna(inplace=True)

In [54]:
final.drop_duplicates(keep='last',inplace=True)

final

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,movie_title,genres,comb
0,James Cameron,CCH Pounder,Joel David Moore,Wes Studi,avatar,Action Adventure Fantasy Sci-Fi,CCH Pounder Joel David Moore Wes Studi James C...
1,Gore Verbinski,Johnny Depp,Orlando Bloom,Jack Davenport,pirates of the caribbean: at world's end,Action Adventure Fantasy,Johnny Depp Orlando Bloom Jack Davenport Gore ...
2,Sam Mendes,Christoph Waltz,Rory Kinnear,Stephanie Sigman,spectre,Action Adventure Thriller,Christoph Waltz Rory Kinnear Stephanie Sigman ...
3,Christopher Nolan,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,the dark knight rises,Action Thriller,Tom Hardy Christian Bale Joseph Gordon-Levitt ...
4,Doug Walker,Doug Walker,Rob Walker,unknown,star wars: episode vii - the force awakens ...,Documentary,Doug Walker Rob Walker unknown Doug Walker Doc...
...,...,...,...,...,...,...,...
7197,Michel Franco,Jessica Chastain,Peter Sarsgaard,Merritt Wever,memory,Action Thriller Crime,Jessica Chastain Peter Sarsgaard Merritt Wever...
7198,Blitz Bazawule,Fantasia Barrino,Taraji P. Henson,Danielle Brooks,the color purple,Drama,Fantasia Barrino Taraji P. Henson Danielle Bro...
7199,George Clooney,Callum Turner,Joel Edgerton,Peter Guinness,the boys in the boat,Drama History,Callum Turner Joel Edgerton Peter Guinness Geo...
7200,Michael Mann,Adam Driver,Penelope Cruz,Shailene Woodley,ferrari,History Drama,Adam Driver Penelope Cruz Shailene Woodley Mic...


In [56]:
final.to_csv('last2_data.csv',index=False)