In [122]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import csv
import regex as re

In [253]:
def collect_movies_artist_data(genre: str, year:str, sub_years:list) -> pd.DataFrame :

    titles = []
    directors = []
    casts = []
    countries = []
    all_years = []
    genres = []

    if genre == 'horror':
        url = f"https://en.wikipedia.org/wiki/List_of_horror_films_of_{year}"
        sub_years = [year]
    else: 
        url = f"https://en.wikipedia.org/wiki/List_of_{genre}_films_of_the_{year}"

    wiki_page = requests.get(url)
    soup = BeautifulSoup(wiki_page.text, 'html.parser')
    all_tables = soup.find_all('table', {'class':'wikitable'})
            
    for sub_year_idx, sub_year in enumerate(sub_years):   
        for row in all_tables[sub_year_idx].find_all('tr'): 
            columns = row.find_all('td')
            
            if (genre == 'thriller') or (genre == 'fantasy') or (genre=='science_fiction'): 
                if (len(columns)!=0): 
                    if (re.search(r"2\d{3}\n", columns[0].text)):
                        sub_year = columns[0].text[:4]

            if len(columns) >= 4:
                if genre == 'horror' and not (sub_year == '2020' or sub_year == '2021' or sub_year == '2022' or sub_year == '2019'):
                    try: 
                        title = row.find('a').get_text(strip=True)
                    except: 
                        title = row.find('i').get_text(strip=True)
                    director = columns[0].get_text(strip=True)
                    cast_list = columns[1].get_text(strip=True)
                    country = columns[2].get_text(strip=True)
                elif genre == 'comedy' and sub_year=='2007':       
                    title = row.find('a').get_text(strip=True)
                    director = columns[0].get_text(strip=True)
                    cast_list = columns[1].get_text(strip=True)
                    country = columns[2].get_text(strip=True)
                else: 
                    title = columns[0].get_text(strip=True)
                    director = columns[1].get_text(strip=True)
                    cast_list = columns[2].get_text(strip=True)
                    country = columns[3].get_text(strip=True)

                titles.append(title)
                directors.append(director)
                casts.append(cast_list)
                countries.append(country)
                genres.append(genre)
                all_years.append(sub_year)
    data = {
    "Title": titles,
    "Director": directors,
    "Cast": casts,
    "Country": countries,
    "Genre": genres, 
    "Year": all_years}

    df = pd.DataFrame(data)

    return df

In [259]:
params = {
        "thriller": {"years": ["2020s","2010s", "2000s"],
                "sub_years": [["2020", "2021", "2022"],["2010", "2011", "2012","2013","2014","2015","2016","2017","2018","2019"],[[]]]}, 
        "action": {"years": ["2020s","2010s", "2000s"],
                "sub_years": [["2020", "2021", "2022"],["2010", "2011", "2012","2013","2014","2015","2016","2017","2018","2019"],["2000", "2001", "2002","2003","2004","2005","2006","2007","2008","2009"]]},             
        "comedy": {"years": ["2020s","2010s", "2000s"],
                "sub_years": [["2020", "2021", "2022"],["2010", "2011", "2012","2013","2014","2015","2016","2017","2018","2019"],["2000", "2001", "2002","2003","2004","2005","2006","2007"]]},
        "adventure": {"years": ["2010s", "2000s"],
                "sub_years": [["2010", "2011", "2012","2013","2014","2015","2016","2017","2018","2019", "2020", "2021", "2022"], ["2000", "2001", "2002","2003","2004","2005","2006","2007","2008","2009"]]}, 
        "horror": {"years": ["2020", "2021", "2022", "2010","2011","2012","2013","2014","2015","2016","2017","2018","2019", "2000", "2001", "2002","2003","2004","2005","2006","2007","2008","2009"], 
                "sub_years": [[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[]]},
        "fantasy": {"years": ["2020s","2010s", "2000s"],
                "sub_years": [[[]],[[]],[[]]]}, 
        "science_fiction": {"years": ["2020s", "2010s", "2000s"],
                "sub_years": [[[]],[[]],[[]]]}, 
        }

data_movie_artist = pd.DataFrame()
for genre in params.keys():
    for years, sub_years in zip(params[genre]["years"],params[genre]["sub_years"]):
        data = collect_movies_artist_data(genre, years, sub_years)
        data_movie_artist = pd.concat([data_movie_artist, data]) 

data_movie_artist = data_movie_artist.reset_index(drop=True)


In [263]:
# Some cast are empty so we need to remove those movies: 
data_movie_artist = data_movie_artist[data_movie_artist['Cast'] != ''].reset_index(drop=True)

# Movies that exists in two different years cleaning: 
# I did some research and found out that they exists in these years: 
#data_movie_artist[data_movie_artist['Title']=='Run Sweetheart Run'] # findes kun i 2020
#data_movie_artist[data_movie_artist['Title']=='The Black Phone'] # findes kun i 2021
#data_movie_artist[data_movie_artist['Title']=='Bhool Bhulaiyaa 2' ]# findes kun i 2022
#data_movie_artist[data_movie_artist['Title']=='Apartment 143' ]# findes kun i 2011
#data_movie_artist[data_movie_artist['Title']=='Underworld: Blood Wars' ]# findes kun i 2016

movies_to_remove = {
    'Run Sweetheart Run': "2022",
    'The Black Phone': "2022",
    'Bhool Bhulaiyaa 2': "2021",
    'Apartment 143': "2012",
    'Underworld: Blood Wars': "2017", 
    'Bloody Bloody Bible Camp': '2012'}
for title, year in movies_to_remove.items():
    mask = data_movie_artist[(data_movie_artist['Title'] == title) & (data_movie_artist['Year'] == year)]
    data_movie_artist = data_movie_artist.drop(mask.index[0]).reset_index(drop=True)


# Different Movies but same title
duplicates_in_title = data_movie_artist[data_movie_artist.duplicated(subset=['Title'], keep=False)]
filtered_duplicates = duplicates_in_title[~duplicates_in_title.duplicated(subset=['Director', 'Country'], keep=False)]
for index, row in filtered_duplicates.iterrows():
    data_movie_artist.loc[index, 'Title'] = row['Title'] + ' ' + row['Director']
data_movie_artist = data_movie_artist.reset_index(drop=True)


### Same movie different Genre
duplicates = data_movie_artist[data_movie_artist[['Title', 'Country', 'Year']].duplicated(keep=False)]

genre_mapping = {}
for index, row in duplicates.iterrows():
    key = (row['Title'], row['Country'], row['Year'])
    if key not in genre_mapping:
        genre_mapping[key] = set()
    genre_mapping[key].add(row['Genre'])

# Create a boolean mask for duplicates
is_duplicate = data_movie_artist[['Title', 'Country', 'Year']].duplicated(keep=False)

data_movie_artist.loc[is_duplicate, 'Genre'] = data_movie_artist[is_duplicate].apply(
    lambda row: 'action-horror' if 'horror' in genre_mapping[(row['Title'], row['Country'], row['Year'])] and 'action' in genre_mapping[(row['Title'], row['Country'], row['Year'])] else
        ('action-thriller' if 'thriller' in genre_mapping[(row['Title'], row['Country'], row['Year'])] and 'action' in genre_mapping[(row['Title'], row['Country'], row['Year'])] else
        ('comedy-action' if 'action' in genre_mapping[(row['Title'], row['Country'], row['Year'])] and 'comedy' in genre_mapping[(row['Title'], row['Country'], row['Year'])] else
        ('comedy-horror' if 'horror' in genre_mapping[(row['Title'], row['Country'], row['Year'])] and 'comedy' in genre_mapping[(row['Title'], row['Country'], row['Year'])] else 
        ('drama-thriller' if 'thriller' in genre_mapping[(row['Title'], row['Country'], row['Year'])] and 'drama' in genre_mapping[(row['Title'], row['Country'], row['Year'])] else
        ('drama-comedy' if 'drama' in genre_mapping[(row['Title'], row['Country'], row['Year'])] and 'comedy' in genre_mapping[(row['Title'], row['Country'], row['Year'])] else
        ('drama-action' if 'drama' in genre_mapping[(row['Title'], row['Country'], row['Year'])] and 'action' in genre_mapping[(row['Title'], row['Country'], row['Year'])] else
        ('action-adventure' if 'adventure' in genre_mapping[(row['Title'], row['Country'], row['Year'])] and 'action' in genre_mapping[(row['Title'], row['Country'], row['Year'])] else
        ('thriller-horror' if 'thriller' in genre_mapping[(row['Title'], row['Country'], row['Year'])] and 'horror' in genre_mapping[(row['Title'], row['Country'], row['Year'])] else
        ('comedy-thriller' if 'thriller' in genre_mapping[(row['Title'], row['Country'], row['Year'])] and 'comedy' in genre_mapping[(row['Title'], row['Country'], row['Year'])] else
        ('comedy-adventure' if 'adventure' in genre_mapping[(row['Title'], row['Country'], row['Year'])] and 'comedy' in genre_mapping[(row['Title'], row['Country'], row['Year'])] else
        ('thriller-adventure' if 'adventure' in genre_mapping[(row['Title'], row['Country'], row['Year'])] and 'thriller' in genre_mapping[(row['Title'], row['Country'], row['Year'])] else
        ('drama-horror' if 'drama' in genre_mapping[(row['Title'], row['Country'], row['Year'])] and 'horror' in genre_mapping[(row['Title'], row['Country'], row['Year'])] else
         ('adventure-horror' if 'adventure' in genre_mapping[(row['Title'], row['Country'], row['Year'])] and 'horror' in genre_mapping[(row['Title'], row['Country'], row['Year'])] else
         row['Genre']))))))))))))), axis=1)

data_movie_artist = data_movie_artist.drop_duplicates(subset=['Title', 'Country', 'Year', 'Genre']).reset_index(drop=True)

# Movies which exists beceuase of difference in country names  
data_movie_artist['Country_Length'] = data_movie_artist['Country'].str.len()
duplicate_rows = data_movie_artist[data_movie_artist.duplicated(subset=['Title', 'Director'], keep=False)]
filtered_data = duplicate_rows.sort_values(by='Country_Length').drop_duplicates(subset=['Title', 'Director'], keep='first').drop(columns=['Country_Length'])
data_movie_artist = pd.concat([data_movie_artist.drop(duplicate_rows.index), filtered_data]).reset_index(drop=True).drop(columns=['Country_Length'])

In [264]:
data_movie_artist['Country'].value_counts().reset_index()

Unnamed: 0,index,Country
0,United States,3518
1,Japan,218
2,United Kingdom,197
3,France,145
4,South Korea,115
...,...,...
438,United KingdomTanzania,1
439,United KingdomSpainBelgium,1
440,United states,1
441,DenmarkUnited KingdomNorwaySwedenGermany,1


In [265]:
data_movie_artist['Genre'].value_counts()

horror                1883
comedy                1021
science_fiction        633
action                 596
adventure              463
thriller               429
fantasy                417
action-thriller        140
comedy-action           86
action-adventure        82
comedy-horror           56
action-horror           56
thriller-horror         40
comedy-adventure        30
thriller-adventure      10
adventure-horror         5
Name: Genre, dtype: int64

In [64]:
duplicates_in_title = data_movie_artist[data_movie_artist.duplicated(subset=['Title'], keep=False)]
filtered_duplicates = duplicates_in_title[~duplicates_in_title.duplicated(subset=['Director', 'Country'], keep=False)]
for index, row in filtered_duplicates.iterrows():
    data_movie_artist.loc[index, 'Title'] = row['Title'] + ' ' + row['Director']
data_movie_artist = data_movie_artist.reset_index(drop=True)

In [266]:
data_movie_artist[data_movie_artist['Title'].duplicated(keep=False)]

Unnamed: 0,Title,Director,Cast,Country,Genre,Year
48,Master,Lokesh Kanagaraj,"Vijay,Vijay Sethupathi,Malavika Mohanan,Arjun ...",India,action-thriller,2021
77,Beast,Nelson Dilipkumar,"Vijay,Pooja Hegde",India,action-thriller,2022
87,Master,Mariama Diallo,"Regina Hall,Zoe Renee,Amber Gray,Molly Bernard...",United States,thriller-horror,2022
325,Pulse,Kiyoshi Kurosawa,"Haruhiko Kato,Kumiko Asō,Koyuki",Japan,thriller,2001
503,Pulse,Jim Sonzero,"Kristen Bell,Ian Somerhalder,Rick Gonzalez",United States,thriller-horror,2006
958,Fantastic Four,Josh Trank,"Miles Teller,Kate Mara,Michael B. Jordan,Jamie...",United States,action-adventure,2015
990,Captain America: Civil War,"Antony Russo, Joe Russo","Chris Evans,Robert Downey Jr.",United States,action,2016
1055,Avengers: Infinity War,"Antony Russo, Joe Russo","Robert Downey Jr.,Scarlett Johansson,Chris Pra...",United States,action,2018
1084,Robin Hood,Otto Bathurst,"Taron Egerton,Jamie Foxx,Ben Mendelsohn,Jamie ...",United States,action-adventure,2018
2499,Robin Hood,Ridley Scott,"Russell Crowe,Cate Blanchett,William Hurt,Max ...",United States,adventure,2010


## Split cast into each row

In [235]:
df = data_movie_artist.copy()

In [236]:
df['Cast'] = df['Cast'].str.split(',')
df = df.explode('Cast').reset_index(drop=True)
df['Cast'] = df['Cast'].str.lower()
df['Title'] = df['Title'].str.lower()


In [238]:
# Number of nodes: 
nr_nodes = len(df['Cast'].unique())
print(nr_nodes)

2207


In [241]:
unique_cast = df.groupby('Cast')['Title'].agg(list).reset_index()
unique_cast.rename(columns={'Title': 'Titles'}, inplace=True)

# Step 4
artist_connections = {}
for index, row in df.iterrows():
    current_artist = row['Cast']
    other_artists = df[df['Title'] == row['Title']]['Cast'].tolist()
    other_artists.remove(current_artist)
    
    if current_artist in artist_connections:
        artist_connections[current_artist].extend(other_artists)
    else:
        artist_connections[current_artist] = other_artists

# Remove duplicates in the "connected" list
for artist, connections in artist_connections.items():
    artist_connections[artist] = list(set(connections))

artist_connections_df = pd.DataFrame(list(artist_connections.items()), columns=['Cast', 'connected'])

# Step 5
# Add a new column for counting connected artists
artist_connections_df['connected_count'] = artist_connections_df['connected'].apply(len)

# Merge the DataFrames
final_df = unique_cast.merge(artist_connections_df, on='Cast')


In [243]:
final_df['connected_count'].sum()

11037

In [244]:
artist_connections = {}
for index, row in df.iterrows():
    current_artist = row['Cast']
    other_artists = df[df['Title'] == row['Title']]['Cast'].tolist()
    other_artists.remove(current_artist)  # Remove the current artist from the list
    if current_artist in artist_connections:
        artist_connections[current_artist].extend(other_artists)
    else:
        artist_connections[current_artist] = other_artists

# Convert the dictionary to a DataFrame
artist_connections_df = pd.DataFrame(list(artist_connections.items()), columns=['cast', 'connected'])


In [169]:
final_df = unique_cast.merge(artist_connections_df, on='Cast')

In [178]:
final_df['connected'][final_df['Cast']=='tom cruise']


4969    [rosamund pike, robert duvall, tom cruise, ros...
Name: connected, dtype: object

In [148]:
df['Title'] = df['Title'].str.lower()

In [155]:
df[df[['Title', 'Cast', 'Genre']].duplicated()]

Unnamed: 0,Title,Director,Cast,Country,Genre,Year
2896,kingsman: the secret service,Matthew Vaughn,colin firth,United StatesUnited Kingdom,action,2015
2898,kingsman: the secret service,Matthew Vaughn,samuel l. jackson,United StatesUnited Kingdom,action,2015
4561,jack goes boating,Philip Seymour Hoffman,amy ryan,United States,comedy,2010
5115,the players,Various,geraldine nakache,France,comedy,2012
7574,sint,Dick Maas,egbert-jan weeber,Netherlands,horror,2010
7817,apartment 143,Carles Torrens,kai lennox,Spain,horror,2012
7818,apartment 143,Carles Torrens,gia mantegna,Spain,horror,2012
7845,bloody bloody bible camp,Vito Trabuco,reggie bannister,United States,horror,2012
7846,bloody bloody bible camp,Vito Trabuco,tim sullivan,United States,horror,2012
7847,bloody bloody bible camp,Vito Trabuco,ron jeremy,United States,horror,2012


In [156]:
df[df['Title'] == 'kingsman: the secret service']


Unnamed: 0,Title,Director,Cast,Country,Genre,Year
863,kingsman: the secret service,Matthew Vaughn,colin firth,"United Kingdom,United States",thriller,2014
864,kingsman: the secret service,Matthew Vaughn,taron egerton,"United Kingdom,United States",thriller,2014
865,kingsman: the secret service,Matthew Vaughn,samuel l. jackson,"United Kingdom,United States",thriller,2014
2743,kingsman: the secret service,Matthew Vaughn,colin firth,United States,action,2014
2744,kingsman: the secret service,Matthew Vaughn,samuel l. jackson,United States,action,2014
2745,kingsman: the secret service,Matthew Vaughn,mark strong,United States,action,2014
2896,kingsman: the secret service,Matthew Vaughn,colin firth,United StatesUnited Kingdom,action,2015
2897,kingsman: the secret service,Matthew Vaughn,taron egerton,United StatesUnited Kingdom,action,2015
2898,kingsman: the secret service,Matthew Vaughn,samuel l. jackson,United StatesUnited Kingdom,action,2015
2899,kingsman: the secret service,Matthew Vaughn,,United StatesUnited Kingdom,action,2015
