In [1]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from dotenv import load_dotenv
import os
import re
import pandas as pd
import calendar

In [2]:
load_dotenv()
API_KEY = os.getenv('TMDB_API_KEY')

In [3]:
#returns a list of dictionarys whith films that are possible results
def find_movies(movie_search):
    #search TMDB API over query
    response = requests.get(
        f"https://api.themoviedb.org/3/search/movie?api_key={API_KEY}"
        f"&query={string_to_query(movie_search)}")
    
    #only return the results as a list
    return response.json()['results']

#formats a string like a query 
def string_to_query(search):
    return search.replace(" ", "+")

In [4]:
date = find_movies("Der kleine Horrorladen")[0]['release_date']

In [5]:
date

'1960-01-01'

In [6]:
datetime.strptime(date, '%Y-%m-%d')

datetime.datetime(1960, 1, 1, 0, 0)

In [7]:
[x['title'] for x in find_movies("Her")]

['Her',
 'Let Her Kill You',
 'Heroic',
 'Justice League x RWBY: Super Heroes & Huntsmen, Part Two',
 'The Boy and the Heron',
 'Miraculous World: New York, United HeroeZ',
 'Dragon Ball Super: Super Hero',
 'My Hero Academia: All Might Rising',
 'Her',
 'Her',
 'her',
 'Her',
 'Her',
 'Her',
 'Her',
 'Your Name Engraved Herein',
 "Dragon Ball GT: A Hero's Legacy",
 'The Last Heretic',
 "My Hero Academia: World Heroes' Mission",
 'To Her']

In [8]:
#if titles are ambiguess the letterboxd link will 
#have the release year appended at the end
def check_title_ambiguity(title):
    titles = [x['title'] for x in find_movies(title)]
    if len(titles) != len(set(titles)):
        return 
    return False

def get_release_date(movie):
    try:
        date = datetime.strptime(movie['release_date'], '%Y-%m-%d').date()
    except:
        date = datetime.today().date()
    return date

In [10]:

def get_genre(movie):
    return movie['genre_ids']

def get_tmdb_title(movie):
    return movie['title']

def get_link(result_list):
    
    title = get_tmdb_title(result_list[0])
    # if Title is ambiguous put release year on end of string
    if check_title_ambiguity(title):
        # first 4 digits are year (yyyy-mm-dd)
        title = title + " " + str(get_release_date(result_list[0]).year)

    # remove dots etc.
    fit = re.sub("[,./()\-;:_#'+*~?!&]", "", title)
    # turn everything lower case
    fit = fit.lower()
    # turn whitespaces to -
    fit = re.sub(" ", "-", fit)

    link = f'https://letterboxd.com/film/{fit}'
    return link

def get_letterboxd_rating(result_list):
    url = get_link(result_list)
    html = requests.get(url)
    junk = BeautifulSoup(html.content, 'html.parser')

    try:
        # rating is hidden in <meta> tag named twitter:data2
        results = junk.find('meta', {"name": "twitter:data2", "content": True})
        # remove everything after first whitespace "3.5 out of 5" but we only want 3.5
        rating = re.search('\S+', results['content']).group()
        return float(rating)
    # Incase the movie does not have a rating on Letterboxd
    except Exception as e:
        return None

In [11]:
def movie_metadata_string(result_list):
    return f"Title: {get_tmdb_title(result_list[0])}\nRelease-date: {get_release_date(result_list[0])}\nRating: {get_letterboxd_rating(result_list)}\nLetterboxd Link: {get_link(result_list)}"

In [25]:
def str_to_date(string, format = '%Y-%m-%d'):
    return datetime.strptime(string, format).date()

In [None]:
result_list = find_movies("Good Vibrations")
print(movie_metadata_string(result_list))
print(check_title_ambiguity(result_list))

In [15]:
raw_data = pd.read_csv("../data/raw/sommer_07.csv")

In [26]:
raw_data['Date'] = raw_data['Date'].apply(str_to_date)

In [37]:
datetime(2010, 5, 10).month > datetime(2024, 3, 10).month


True

In [38]:
def add_semester(date: datetime):
    # Year is not important here as we only want to check Month!
    if date.month > datetime(2000, 3, 1).month & date.month < datetime(2000, 8, 1).month:
        return "Sommersemester"
    return "Wintersemester"

In [39]:
raw_data['Semester'] = raw_data['Date'].apply(add_semester)

In [1]:
from metadata import append_meta
import pandas as pd 

raw = pd.read_csv('../data/raw/sommer_05.csv')
full = append_meta(raw)

ModuleNotFoundError: No module named 'app'

In [41]:
def date_to_weekday(date: datetime):
    return calendar.day_name[date.weekday()]

In [42]:
raw_data['Weekday'] = raw_data['Date'].apply(date_to_weekday)

In [44]:
raw_data.head(10)

Unnamed: 0,Date,Titel,Attendance,Semester,Weekday
0,2007-04-11,Borat,68,Sommersemester,Wednesday
1,2007-04-17,Das Parfum,112,Sommersemester,Tuesday
2,2007-04-18,Thank you for smoking,63,Sommersemester,Wednesday
3,2007-04-24,Eine unbequeme Wahrheit,153,Sommersemester,Tuesday
4,2007-04-25,Der Tiger und der Schnee,47,Sommersemester,Wednesday
5,2007-05-02,Ein Freund von mir,124,Sommersemester,Wednesday
6,2007-05-08,Kukushka,64,Sommersemester,Tuesday
7,2007-05-09,Lichter der Vorstadt,35,Sommersemester,Wednesday
8,2007-05-15,Fremde Haut,22,Sommersemester,Tuesday
9,2007-05-16,Der die Tollkirsche ausgräbt & Nach 5 im Urwald,27,Sommersemester,Wednesday


In [45]:
def add_movie_metadata(search):
    result_list = find_movies(search)
    
    title = get_tmdb_title(result_list[0])
    release_date = get_release_date(result_list[0])
    rating = get_letterboxd_rating(result_list)
    genre_ids = get_genre(result_list[0])

    return title, release_date, rating, genre_ids

In [46]:
# return the metadata tuples to list and then dataframe, which can be appended 

metadata = pd.DataFrame(list(raw_data['Movie'].apply(add_movie_metadata)))

KeyError: 'Movie'

In [None]:
metadata.columns =['TMDB_Title', 'Release_Date', 'Rating', 'Genre_IDs']

In [None]:
metadata.head()

In [None]:
raw_data.head()

In [None]:
raw_data = pd.concat([raw_data, metadata], axis=1)

In [None]:
raw_data.head()

In [None]:
raw_data['Time_Since_Release'] = raw_data['Date'] - raw_data['Release_Date']

In [None]:
raw_data['Time_Since_Release']= pd.to_timedelta(raw_data['Time_Since_Release'])

In [None]:
raw_data