## imports

In [1]:
import polars as pl
from polars import col as c
import os, sys, requests
from dotenv import load_dotenv, find_dotenv
from tqdm import tqdm

# Locate a .env file with find_dotenv() and load its variables into the process
# environment; the cell output is load_dotenv's return value.
load_dotenv(find_dotenv())

True

## constants

In [2]:
# Directory holding the notebook's data files, relative to the notebook.
FILES_DIR = "../data"
# OMDb API key read from the environment; None when the variable is not set.
OMDB_API_KEY = os.environ.get("OMDB_API_KEY")

## load df

In [3]:
# Read the movies workbook from FILES_DIR into a polars DataFrame.
# The recorded output shows polars could not infer a dtype for columns 2 and 3
# and fell back to string for them.
file_name = "movies_df.xlsx"
file_path = os.path.join(FILES_DIR, file_name)
movies_df = pl.read_excel(file_path)

Could not determine dtype for column 2, falling back to string
Could not determine dtype for column 3, falling back to string


## fetch IMDb IDs from OMDb

In [4]:
# Preview the first rows of the loaded frame.
movies_df.head()

index,Title,Year,Viewed,title_eng,title_translation,parsed_date,parsed_title,checked,gpt_parsed_titles,years,titles,cleaned_title,cleaned_year
i64,str,str,str,str,str,i64,str,str,str,str,str,str,i64
1,"""Действуй сестра""",,,"""Not found""","""Act sister""",,"""Not found""",,"""Sister Act""","""1992""","""Sister Act""","""Sister Act""",1992
2,"""Summer lovers""",,,"""Summer Lovers_1982_Comedy, Dra…","""Summer lovers""",1982.0,"""Summer Lovers""","""checked""","""Summer lovers""",,,"""Summer Lovers""",1982
3,"""Джон уик""",,,"""John Wick_2014_Action, Crime, …","""John Wick""",2014.0,"""John Wick""","""checked""","""Джон уик""",,,"""John Wick""",2014
4,"""Помни""",,,"""Remembered_2015_Short, Drama_N…","""Remembered""",2015.0,"""Remembered""","""checked""","""Помни""",,,"""Remembered""",2015
5,"""Похитители велосипедов""",,,"""Not found""","""Cycles of bicycles""",,"""Not found""",,"""Bicycle Thieves""","""1948""","""Bicycle Thieves""","""Bicycle Thieves""",1948


In [5]:
def fetch_english_title(title, year=None, timeout=10):
    """Look up a movie by title in OMDb and return the raw JSON payload.

    Despite the name, the whole OMDb record is returned (not only the title),
    so callers can read fields such as ``"imdbID"`` from it.

    Args:
        title: Movie title to search for; sent as OMDb's ``t`` parameter.
        year: Optional release year; when given it is sent as ``y`` so that
            same-titled films can be told apart.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The decoded JSON ``dict`` when OMDb answers ``Response == "True"`` and
        the payload contains a ``"Title"`` key; otherwise the string
        ``"Not found"`` (also returned on any request or decoding error).
    """
    # Nothing to search for: skip the network round-trip entirely.
    if not title:
        print(f"Not found in OMDb: {title}")
        return "Not found"

    # To look up by IMDb id instead of by title, send "i" rather than "t".
    params = {"apikey": OMDB_API_KEY, "t": title}
    if year is not None:
        params["y"] = year

    try:
        # Let requests build/encode the query string; https keeps the API key
        # out of clear text, and the timeout stops one stalled request from
        # hanging a long batch run.
        response = requests.get(
            "https://www.omdbapi.com/", params=params, timeout=timeout
        )
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a body that is not valid JSON.
        print(f"OMDb error for {title}: {e}")
        return "Not found"

    if data.get("Response") == "True" and "Title" in data:
        return data

    print(f"Not found in OMDb: {title}")
    return "Not found"

In [6]:
# Smoke-test the lookup on the first row's GPT-parsed title and show its IMDb id.
sample_title = movies_df.head(1)["gpt_parsed_titles"].item()
fetch_english_title(sample_title)["imdbID"]

'tt0105417'

In [8]:
# Map every row's "index" to the IMDb id found for its cleaned title,
# or to "Not found" when there is nothing to look up or OMDb has no match.
result_dict = {}
# iter_rows() is a plain iterator, so pass the row count explicitly to give
# tqdm a real progress bar and ETA.
for row in tqdm(movies_df.iter_rows(named=True), total=movies_df.height):
    imdb_id = "Not found"
    # Rows without a cleaned year are recorded as "Not found" without an API call.
    if row["cleaned_year"] is not None:
        omdb_result = fetch_english_title(row["cleaned_title"])
        if omdb_result != "Not found":
            # .get() keeps a payload lacking "imdbID" from aborting the whole run.
            imdb_id = omdb_result.get("imdbID", "Not found")
    result_dict[row["index"]] = imdb_id

621it [04:07,  2.51it/s]


In [10]:
# Print the first 11 index -> IMDb id pairs as a quick sanity check.
for key, val in list(result_dict.items())[:11]:
    print(f"{key}\t{val}")

1	tt0105417
2	tt0084737
3	tt2911666
4	tt6081018
5	tt0040522
6	tt0128445
7	tt0083658
8	tt0081505
9	tt1623187
10	tt0108399
11	tt0113101


In [11]:
# Split the mapping into two aligned sequences: the row indexes and their ids.
indexes = result_dict.keys()
ids = list(result_dict.values())

In [12]:
# Two-column frame (row index, looked-up id) used to join back onto movies_df.
films_columns = {"index": indexes, "ids": ids}
parsed_films = pl.from_dict(films_columns)

parsed_films.head()

index,ids
i64,str
1,"""tt0105417"""
2,"""tt0084737"""
3,"""tt2911666"""
4,"""tt6081018"""
5,"""tt0040522"""


In [13]:
# Show movies_df again before the join, for comparison with the joined result.
movies_df.head()

index,Title,Year,Viewed,title_eng,title_translation,parsed_date,parsed_title,checked,gpt_parsed_titles,years,titles,cleaned_title,cleaned_year
i64,str,str,str,str,str,i64,str,str,str,str,str,str,i64
1,"""Действуй сестра""",,,"""Not found""","""Act sister""",,"""Not found""",,"""Sister Act""","""1992""","""Sister Act""","""Sister Act""",1992
2,"""Summer lovers""",,,"""Summer Lovers_1982_Comedy, Dra…","""Summer lovers""",1982.0,"""Summer Lovers""","""checked""","""Summer lovers""",,,"""Summer Lovers""",1982
3,"""Джон уик""",,,"""John Wick_2014_Action, Crime, …","""John Wick""",2014.0,"""John Wick""","""checked""","""Джон уик""",,,"""John Wick""",2014
4,"""Помни""",,,"""Remembered_2015_Short, Drama_N…","""Remembered""",2015.0,"""Remembered""","""checked""","""Помни""",,,"""Remembered""",2015
5,"""Похитители велосипедов""",,,"""Not found""","""Cycles of bicycles""",,"""Not found""",,"""Bicycle Thieves""","""1948""","""Bicycle Thieves""","""Bicycle Thieves""",1948


In [14]:
# Attach the looked-up ids to each movie by its "index".
# Any existing "ids" column is dropped first so that re-running this cell (or
# starting from a workbook that already contains "ids") replaces the column
# instead of adding a suffixed duplicate next to it.
movies_df = movies_df.select(pl.exclude("ids")).join(
    parsed_films, on="index", how="left"
)

In [15]:
# Preview the frame after the join; it now carries the "ids" column.
movies_df.head()

index,Title,Year,Viewed,title_eng,title_translation,parsed_date,parsed_title,checked,gpt_parsed_titles,years,titles,cleaned_title,cleaned_year,ids
i64,str,str,str,str,str,i64,str,str,str,str,str,str,i64,str
1,"""Действуй сестра""",,,"""Not found""","""Act sister""",,"""Not found""",,"""Sister Act""","""1992""","""Sister Act""","""Sister Act""",1992,"""tt0105417"""
2,"""Summer lovers""",,,"""Summer Lovers_1982_Comedy, Dra…","""Summer lovers""",1982.0,"""Summer Lovers""","""checked""","""Summer lovers""",,,"""Summer Lovers""",1982,"""tt0084737"""
3,"""Джон уик""",,,"""John Wick_2014_Action, Crime, …","""John Wick""",2014.0,"""John Wick""","""checked""","""Джон уик""",,,"""John Wick""",2014,"""tt2911666"""
4,"""Помни""",,,"""Remembered_2015_Short, Drama_N…","""Remembered""",2015.0,"""Remembered""","""checked""","""Помни""",,,"""Remembered""",2015,"""tt6081018"""
5,"""Похитители велосипедов""",,,"""Not found""","""Cycles of bicycles""",,"""Not found""",,"""Bicycle Thieves""","""1948""","""Bicycle Thieves""","""Bicycle Thieves""",1948,"""tt0040522"""


## save df

In [16]:
# Write the enriched frame back to FILES_DIR/movies_df.xlsx.
# NOTE: this is the same path the notebook loads from, so the input workbook
# is overwritten in place.
file_name = "movies_df.xlsx"
file_path = os.path.join(FILES_DIR, file_name)
# Trailing semicolon suppresses the Workbook object repr in the cell output.
movies_df.write_excel(file_path);

<xlsxwriter.workbook.Workbook at 0x117647080>