# FilmPit movies
---------------------------
The FilmPit is a podcast that reviews low-budget movies. I created this notebook to collect the movie titles and related information.

In [1]:
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd
import numpy as np
from os import path

## Scraping titles from filmpit site

In [2]:
# Load the cached titles if the file already exists; otherwise scrape them from the
# filmpit site using requests and BeautifulSoup.
if path.exists('datasets/movie_titles.csv'):
    movie_titles = pd.read_csv('datasets/movie_titles.csv').squeeze('columns')
else:
    URL = "https://thefilmpit.com"

    def get_movies_title(URL):
        """Scrape one listing page for movie titles and the link to the next page.

        Parameters
        ----------
        URL : str
            Address of the page to download.

        Returns
        -------
        tuple
            ``(movie_titles, next_link)``: the titles derived from the hrefs of
            the ``rel="bookmark"`` anchors, and the href of the
            ``<link rel="next">`` tag, or ``None`` when the page has no such tag
            or the request was not successful.

        Raises
        ------
        requests.exceptions.RequestException
            Propagated from ``requests.get`` (for example on a timeout).
        """
        movie_titles = []
        next_link = None
        # A timeout keeps a stalled connection from hanging the notebook forever.
        page = requests.get(URL, timeout=30)
        if not page.ok:
            # Make a failed request visible instead of silently ending the crawl.
            print(f"Request for {URL} failed with status code {page.status_code}")
            return movie_titles, next_link

        soup = BeautifulSoup(page.content, 'html.parser')
        next_tag = soup.find('link', {'rel': 'next'})  # checking for next page
        if next_tag is None:
            print("This page hasn't a next link")
        else:
            next_link = next_tag.get('href')

        # getting titles from hrefs and clean them
        for podcast in soup.find_all('a', attrs={'rel': 'bookmark'}):
            podcast_link = podcast.get('href')
            if not podcast_link:
                continue
            # The title is the last path segment, with or without a trailing slash.
            title = podcast_link.rstrip('/').split('/')[-1].replace('-', ' ')
            movie_titles.append(title)
        return movie_titles, next_link

    next_link = URL
    movie_titles = []

    while next_link:
        print(f"Scraping next link: {next_link}")
        titles, next_link = get_movies_title(next_link)
        movie_titles += titles
        if next_link:
            time.sleep(5)  # pause between page requests; not needed after the last page
    print("Scraping finished.")

In [3]:
if not path.exists('datasets/movie_titles.csv'):
    # plus 3 movies that don't exist on the site
    extra_titles = ['never too young to die', 'Dr Caligari', 'Yeti']

    # Scraped title -> title to search for
    fixed_titles = {
        'diagalaxiaki poiotita galaxy of terror': 'galaxy of terror',
        'i scholi tou gkontfrei sakura killers': 'sakura killers',
        'exairetika petsino podkast the punisher': 'the punisher',
        'o rambu tis indonisias einai edo': 'Rambu aka The Intruder',
        'tha einai san star gouorz alla den tha einai star gouorz battle beyond the stars': 'battle beyond the stars',
        'brady idrotas kai pioti tie night stalker': 'night stalker',
        'pao na kano penintarika double dragon': 'double dragon',
        'mousikorama shock em dead': 'shock em dead',
        'asiatiki tourne 3 undefeatable': 'undefeatable',
        'asiatiki tourne 2 w is war': 'w is war',
        'asiatiki tourne 1 for your height only': 'for your height only',
        'ena mikro mousiko breik rappin': 'rappin',
        'dark night of the scarecrow feat elina dimitriadi': 'dark night of the scarecrow',
        'prom night feat elina dimitriadi': 'prom night',
        'aerobicide': 'Killer Workout',
        'an american hippie in paris': 'An American Hippie in Israel',
        'tc2000': 'tc 2000',
        'american ninia': 'american ninja',
        'class of nukem high': "Class of Nuke 'Em High",
        'superman 4': "superman iv",
    }

    # Build the Series, apply the title fixes, then drop the rows labelled 91 and 109.
    movie_titles = (
        pd.Series(movie_titles + extra_titles)
        .replace(fixed_titles)
        .drop([91, 109])
    )

In [4]:
# Write the cleaned titles to a single-column CSV unless the file is already there
titles_csv = 'datasets/movie_titles.csv'
if not path.exists(titles_csv):
    movie_titles.to_csv(titles_csv, header=['titles'], index=False)

## Get movie info from TMDB

I'm going to use the tmdbv3api library to collect information from [The Movie Database](https://www.themoviedb.org).

In [5]:
if not path.exists('datasets/movies_tmdb.csv'):
    # No cached TMDB table yet: search TMDB for every title.
    from tmdbv3api import Movie, TMDb
    from config import config

    tmdb = TMDb()
    tmdb.api_key = config['tmdb_api_key'] # api key is free with a simple registration on TMDB
    movie = Movie()

    basic_info, not_found = {}, []
    for title in movie_titles.to_list():
        print('Fetching ' + title)
        try:
            first_hit = movie.search(title)[0] # getting the first search result
        except IndexError:
            not_found.append(title) # titles with no search result
        else:
            basic_info[title] = first_hit

    print('Movies not found: ', not_found)
else:
    movies_tmdb = pd.read_csv('datasets/movies_tmdb.csv', index_col=0)

Fetching raw nerve
Fetching raw justice
Fetching the baby
Fetching dont go in the house
Fetching conquest
Fetching battletruck
Fetching ninja squad
Fetching ultimax force
Fetching punk vacation
Fetching suburbia
Fetching An American Hippie in Israel
Fetching blood beach
Fetching hunters of the golden cobra
Fetching intent to kill
Fetching maniac cop
Fetching hard ticket to hawaii
Fetching warlock
Fetching maximum overdrive
Fetching nekromantik
Fetching manos
Fetching superman iv
Fetching flash gordon
Fetching surf nazis must die
Fetching to kako
Fetching zardoz
Fetching dark angel
Fetching girl in room 2a
Fetching tc 2000
Fetching interzone
Fetching zaat
Fetching american ninja
Fetching krull
Fetching beastmaster
Fetching raiders of the living dead
Fetching 315
Fetching q winged serpent
Fetching karate warrior 6
Fetching savage streets
Fetching amsterdamned
Fetching fatal deviation
Fetching black gestapo
Fetching ss experiment love camp
Fetching ilsa she wolf of the ss
Fetching yor
Fet

In [6]:
# Print each searched title next to the fetched title and release year to check the match
if not path.exists('datasets/movies_tmdb.csv'):
    for title in movie_titles.to_list():
        found = basic_info.get(title)
        if found:
            print(title, ': ', found['title'], found['release_date'][:4])

raw nerve :  Raw Nerve 1991
raw justice :  Raw Justice 1994
the baby :  The Boss Baby: Family Business 2021
dont go in the house :  Don't Go in the House 1979
conquest :  Game of Thrones - Conquest & Rebellion: An Animated History of the Seven Kingdoms 2017
battletruck :  Warlords of the 21st Century 1982
ninja squad :  The Ninja Squad 1986
ultimax force :  Ultimax Force 1987
punk vacation :  Punk Vacation 1990
suburbia :  Infidelity in Suburbia 2017
An American Hippie in Israel :  An American Hippie in Israel 1972
blood beach :  Blood Beach 1980
hunters of the golden cobra :  The Hunters of the Golden Cobra 1982
intent to kill :  Intent to Kill 1992
maniac cop :  Maniac Cop 1988
hard ticket to hawaii :  Hard Ticket to Hawaii 1987
warlock :  Warlock 1959
maximum overdrive :  Maximum Overdrive 1986
nekromantik :  Nekromantik 1987
manos :  Manos: The Hands of Fate 1966
superman iv :  Superman IV: The Quest for Peace 1987
flash gordon :  Flash Gordon 1980
surf nazis must die :  Surf Nazis

In [7]:
if not path.exists('datasets/movies_tmdb.csv'):

    # Correct id for every title whose search result was a wrong match
    wrong_matches = {
        'Rambu aka The Intruder': '81944',
        'lambada set the night on fire': '117269',
        'the baby': '28156',
        'conquest': '27232',
        'suburbia': '28054',
        'warlock': '11342',
        'to kako': '39897',
        'commander': '205697',
        'captain america': '13995',
        'star wars holiday special': '74849',
        'the punisher': '8867',
        'night stalker': '66474',
        'the perfect weapon': '34421',
        'the rage': '114936',
        'warbirds': '219359',
        'arena': '44796',
        'jack frost': '27318',
        'elves': '30452',
        'prom night': '36599',
        'cheerleader camp': '40087',
        'endgame': '28850',
        'thunder': '109104',
        'Dr Caligari': '35642',
        'Yeti': '92316',
    }

    # Ids of the first search hits, overridden (or extended) by the manual corrections
    movie_ids = {
        **{title: info['id'] for title, info in basic_info.items()},
        **wrong_matches,
    }

In [8]:
if not path.exists('datasets/movies_tmdb.csv'):
    print('Starting api request')
    movie_records = {}
    # The loop variable is `movie_id`, not `id`, so the builtin id() is not
    # shadowed for the rest of the notebook.
    for title, movie_id in movie_ids.items():

        # Now will get detailed info for each movie
        print(f"Fetching {title}")
        mov = movie.details(movie_id)
        # Records are keyed by the title returned in the details, not the searched title.
        movie_records[mov.title] = [
            mov.imdb_id,
            mov.original_title,
            mov.budget,
            mov.revenue,
            mov.runtime,
            mov.popularity,
            [company['name'] for company in mov.production_companies],
            [key['name'] for key in mov.keywords.keywords],
            [act['name'] for act in mov.casts.cast],
            mov.overview
        ]

    print('TMDB request finished')
    # Column order must match the order of the values in each record above.
    columns = ['imdb_id',
               'original_title',
               'budget',
               'revenue',
               'runtime',
               'popularity',
               'production_companies',
               'keywords',
               'cast',
               'overview'
              ]
    movies_tmdb = pd.DataFrame.from_dict(movie_records, orient='index', columns=columns)

Starting api request
Fetching raw nerve
Fetching raw justice
Fetching the baby
Fetching dont go in the house
Fetching conquest
Fetching battletruck
Fetching ninja squad
Fetching ultimax force
Fetching punk vacation
Fetching suburbia
Fetching An American Hippie in Israel
Fetching blood beach
Fetching hunters of the golden cobra
Fetching intent to kill
Fetching maniac cop
Fetching hard ticket to hawaii
Fetching warlock
Fetching maximum overdrive
Fetching nekromantik
Fetching manos
Fetching superman iv
Fetching flash gordon
Fetching surf nazis must die
Fetching to kako
Fetching zardoz
Fetching dark angel
Fetching girl in room 2a
Fetching tc 2000
Fetching interzone
Fetching zaat
Fetching american ninja
Fetching krull
Fetching beastmaster
Fetching raiders of the living dead
Fetching 315
Fetching q winged serpent
Fetching karate warrior 6
Fetching savage streets
Fetching amsterdamned
Fetching fatal deviation
Fetching black gestapo
Fetching ss experiment love camp
Fetching ilsa she wolf of th

In [10]:
# A zero in these columns is treated as a missing value
zero_is_missing = ['budget', 'revenue', 'runtime']
movies_tmdb[zero_is_missing] = movies_tmdb[zero_is_missing].replace(0, np.nan)

movies_tmdb.head(2)

Unnamed: 0,imdb_id,original_title,budget,revenue,runtime,popularity,production_companies,keywords,cast,overview
Raw Nerve,tt0102761,Raw Nerve,,,91.0,1.327,"[Action International Pictures, Pyodawn]",[],"[Glenn Ford, Sandahl Bergman, Randall 'Tex' Co...",A race car driver has visions of the victims o...
Raw Justice,tt0110948,Raw Justice,,,95.0,8.814,"[West Side Studios, Winters Hollywood Entertai...","[prostitute, bounty hunter, mayor, blackmail, ...","[Robert Hays, David Keith, Pamela Anderson, Le...",The mayor of a small town hires a bounty hunte...


In [11]:
# Each dense (list-valued) field gets its own csv file
if not path.exists('datasets/companies.csv'):
    companies = movies_tmdb[['imdb_id', 'production_companies']].reset_index()
    companies = companies.rename(columns={'index': 'title'})
    # One row per list element; rows with any missing value are dropped
    companies = companies.explode('production_companies').dropna()
    companies = companies.reset_index(drop=True)
    companies.to_csv('datasets/companies.csv', index=False)

In [12]:
if not path.exists('datasets/cast.csv'):
    # Bring the index into a 'title' column next to imdb_id and the cast lists
    cast = movies_tmdb[['imdb_id', 'cast']].reset_index()
    cast = cast.rename(columns={'index': 'title'})
    # One row per list element; rows with any missing value are dropped
    cast = cast.explode('cast').dropna()
    cast = cast.reset_index(drop=True)
    cast.to_csv('datasets/cast.csv', index=False)

In [13]:
if not path.exists('datasets/keywords.csv'):
    # Bring the index into a 'title' column next to imdb_id and the keyword lists
    keywords = movies_tmdb[['imdb_id', 'keywords']].reset_index()
    keywords = keywords.rename(columns={'index': 'title'})
    # One row per list element; rows with any missing value are dropped
    keywords = keywords.explode('keywords').dropna()
    keywords = keywords.reset_index(drop=True)
    keywords.to_csv('datasets/keywords.csv', index=False)

In [14]:
# Remove the list-valued columns before caching the table.
# The cached CSV is written after this drop, so a frame loaded from it no longer
# has these columns; errors='ignore' keeps the cell runnable in that case (and on
# a second run) instead of raising KeyError.
movies_tmdb = movies_tmdb.drop(columns=['production_companies', 'keywords', 'cast'], errors='ignore')
if not path.exists('datasets/movies_tmdb.csv'):
    movies_tmdb.to_csv('datasets/movies_tmdb.csv')

## Get movie info from OMDB
First I tried the omdb Python library, but I had more luck calling the Open Movie Database API directly. I used the imdb_id field from TMDB to get extra info from OMDB.

In [15]:
if path.exists('datasets/movies_omdb.csv'):
    movies_omdb = pd.read_csv('datasets/movies_omdb.csv', index_col=0)
else:
    from config import config
    # imdb_id -> list of OMDB fields, in the order of the column names used to build the frame
    omdb_info = {}
    for imdb_id in movies_tmdb['imdb_id'].to_list():
        try:
            res = requests.get(f"http://www.omdbapi.com/?i={imdb_id}&apikey={config['omdb_api_key']}", timeout=3).json()
            omdb_info[imdb_id] = [
                res['Year'],
                res['Rated'],
                res['Genre'],
                res['Director'],
                res['Writer'],
                res['Language'],
                res['Country'],
                res['Awards'],
                res['Metascore'],
                res['imdbRating'],
                res['imdbVotes'],
                # kept as a list: empty when there is no Rotten Tomatoes rating
                [rating['Value'] for rating in res['Ratings'] if rating['Source'] == 'Rotten Tomatoes']
            ]
        except (requests.RequestException, ValueError, KeyError) as err:
            # Only request failures, undecodable responses and missing fields are
            # skipped; a bare except would also swallow KeyboardInterrupt and real
            # bugs. Only the error type is printed because the request URL
            # contains the API key.
            print(f"Request for movie id {imdb_id} did not executed. ({type(err).__name__})")

In [16]:
if not path.exists('datasets/movies_omdb.csv'):
    # Column names, in the same order as the values stored for each movie in omdb_info
    columns = [
        'year', 'rated', 'genre', 'director',
        'writer', 'language', 'country', 'awards',
        'metascore', 'imdb_rating', 'imdb_votes', 'rotten_rating',
    ]
    movies_omdb = pd.DataFrame.from_dict(omdb_info, columns=columns, orient='index')

In [17]:
# Preview the first two rows of the OMDB table
movies_omdb.head(2)

Unnamed: 0,year,rated,genre,director,writer,language,country,awards,metascore,imdb_rating,imdb_votes,rotten_rating
tt0102761,1991,R,"Drama, Mystery, Thriller",David A. Prior,"Jason Coleman, David A. Prior, Lawrence L. Sim...",English,United States,,,4.0,297,[]
tt0110948,1994,R,"Action, Thriller",David A. Prior,David A. Prior,English,United States,1 win,,4.1,1341,[]


In [18]:
# Freshly fetched data keeps the imdb ids in the index; move them into a column.
# A frame loaded from the cached CSV already has an imdb_id column (the CSV is
# written after this step), so resetting again would create a duplicate column.
if 'imdb_id' not in movies_omdb.columns:
    movies_omdb = movies_omdb.reset_index().rename(columns={'index': 'imdb_id'})
# 'N/A' values are flagged as missing
movies_omdb = movies_omdb.replace({'N/A': np.nan})

In [19]:
# Show column dtypes and non-null counts before the rating columns are cleaned
movies_omdb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114 entries, 0 to 113
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   imdb_id        114 non-null    object
 1   year           114 non-null    object
 2   rated          103 non-null    object
 3   genre          114 non-null    object
 4   director       114 non-null    object
 5   writer         113 non-null    object
 6   language       114 non-null    object
 7   country        114 non-null    object
 8   awards         26 non-null     object
 9   metascore      27 non-null     object
 10  imdb_rating    114 non-null    object
 11  imdb_votes     114 non-null    object
 12  rotten_rating  114 non-null    object
dtypes: object(13)
memory usage: 11.7+ KB


In [20]:
# some cleaning for the rating columns
# Freshly fetched values are text ("1,341", ["85%"]); values loaded from the cached
# CSV are already numeric. The text-only steps are skipped for numeric columns so
# the cell works in both cases and can be re-run.
votes = movies_omdb['imdb_votes']
if not pd.api.types.is_numeric_dtype(votes):
    votes = votes.str.replace(',', '')
movies_omdb['imdb_votes'] = votes.astype(int)

rotten = movies_omdb['rotten_rating']
if not pd.api.types.is_numeric_dtype(rotten):
    # Take the first list element when there is one, otherwise NaN. astype(str)
    # keeps the .str accessor usable even when every value is NaN.
    rotten = (rotten.apply(lambda x: x[0] if isinstance(x, list) and x else np.nan)
                    .astype(str)
                    .str.replace('%', ''))
movies_omdb['rotten_rating'] = rotten.astype(float)
movies_omdb['imdb_rating'] = movies_omdb.imdb_rating.astype(float)
movies_omdb['metascore'] = movies_omdb.metascore.astype(float)

In [21]:
# Show column dtypes and non-null counts after the rating columns were converted
movies_omdb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114 entries, 0 to 113
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   imdb_id        114 non-null    object 
 1   year           114 non-null    object 
 2   rated          103 non-null    object 
 3   genre          114 non-null    object 
 4   director       114 non-null    object 
 5   writer         113 non-null    object 
 6   language       114 non-null    object 
 7   country        114 non-null    object 
 8   awards         26 non-null     object 
 9   metascore      27 non-null     float64
 10  imdb_rating    114 non-null    float64
 11  imdb_votes     114 non-null    int32  
 12  rotten_rating  45 non-null     float64
dtypes: float64(3), int32(1), object(9)
memory usage: 11.3+ KB


In [22]:
# Because there are 5 fields with dense info I created a function to make a separate csv file for each
def create_sup_tables(column, f_name, df=None, out_dir='datasets'):
    """
    Write a long-format table with one row per comma-separated value of ``column``.

    The values of ``column`` are split on commas, every piece gets its own
    row next to the movie's ``imdb_id``, surrounding whitespace is stripped,
    rows with a missing value are dropped, and the result is written to a
    csv file without the index.

    Parameters
    ----------
    column : str
        Name of the column holding comma-separated strings.
    f_name : str
        File name of the csv to write inside ``out_dir``.
    df : pandas.DataFrame, optional
        Frame with an ``imdb_id`` column and ``column``. Defaults to the
        notebook-level ``movies_omdb`` frame.
    out_dir : str, optional
        Directory the csv is written to. Defaults to ``'datasets'``.

    Returns
    -------
    pandas.DataFrame
        The table that was written (columns ``imdb_id`` and ``column``).
    """
    source = movies_omdb if df is None else df
    # Work on a copy so the source frame is left untouched.
    table = source[['imdb_id', column]].copy()
    table[column] = table[column].str.split(',')
    table = table.explode(column=column)
    # Values are separated by ", ", so the split pieces carry leading spaces.
    table[column] = table[column].str.strip()
    table = table.dropna().reset_index(drop=True)
    table.to_csv(path.join(out_dir, f_name), index=False)
    return table

In [23]:
# Column to split -> csv file to create; a file that already exists is left untouched
sup_tables = {
    'genre': 'genres.csv',
    'writer': 'writers.csv',
    'country': 'countries.csv',
    'language': 'languages.csv',
    'director': 'directors.csv',
}
for column, f_name in sup_tables.items():
    if not path.exists('datasets/' + f_name):
        create_sup_tables(column, f_name)

In [24]:
# Remove the comma-separated columns before caching the table.
# The cached CSV is written after this drop, so a frame loaded from it no longer
# has these columns; errors='ignore' keeps the cell runnable in that case (and on
# a second run) instead of raising KeyError.
movies_omdb = movies_omdb.drop(columns=['genre', 'writer', 'country', 'director'], errors='ignore')

if not path.exists('datasets/movies_omdb.csv'):
    movies_omdb.to_csv('datasets/movies_omdb.csv')

In [25]:
# Join the TMDB and OMDB tables on imdb_id, keeping only ids present in both
movies = pd.merge(
    movies_tmdb.reset_index().rename(columns={'index': 'title'}),
    movies_omdb,
    on='imdb_id',
    how='inner',
)

In [26]:
# Show column dtypes and non-null counts of the merged table
movies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 114 entries, 0 to 113
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           114 non-null    object 
 1   imdb_id         114 non-null    object 
 2   original_title  114 non-null    object 
 3   budget          32 non-null     float64
 4   revenue         25 non-null     float64
 5   runtime         113 non-null    float64
 6   popularity      114 non-null    float64
 7   overview        114 non-null    object 
 8   year            114 non-null    object 
 9   rated           103 non-null    object 
 10  language        114 non-null    object 
 11  awards          26 non-null     object 
 12  metascore       27 non-null     float64
 13  imdb_rating     114 non-null    float64
 14  imdb_votes      114 non-null    int32  
 15  rotten_rating   45 non-null     float64
dtypes: float64(7), int32(1), object(8)
memory usage: 14.7+ KB


In [27]:
# Write the merged table without its index unless the file is already there
movies_csv = 'datasets/movies.csv'
if not path.exists(movies_csv):
    movies.to_csv(movies_csv, index=False)