In [1]:
import csv
import os

import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine

In [2]:
# Load variables from a local .env file into the process environment, then build
# the SQLAlchemy engine that every load cell below uses.
load_dotenv()
database_url = os.getenv("DATABASE_URL")
if not database_url:
    # os.getenv returns None when the variable is missing; stop here with an
    # actionable message instead of handing None to create_engine.
    raise RuntimeError(
        "DATABASE_URL is not set; define it in the environment or in the .env file"
    )
engine = create_engine(database_url)

In [3]:

from sqlalchemy import null


# Stream data/title.crew.tsv in 20,000-row chunks and append each chunk to
# sebastien.title_crew.
# "\N" is declared as a missing-value marker so that it reaches the database as
# NULL, as the other loaders in this notebook do, instead of as literal text.
df_list = pd.read_csv(
    "data/title.crew.tsv",
    sep="\t",
    names=["tconst", "directors", "writers"],
    header=0,  # the file's own header row is replaced by `names`
    na_values=["\\N"],
    chunksize=20000,
)
with engine.connect() as connection:
    # One transaction for the whole file: an error in any chunk rolls back all of them.
    with connection.begin():
        for df in df_list:
            df.to_sql(
                'title_crew',
                connection,
                schema='sebastien',
                if_exists='append',
                index=False,
            )

In [5]:

from sqlalchemy import null


# Stream data/title.episode.tsv in 20,000-row chunks and append each chunk to
# sebastien.title_episode.
# Missing values are handled while parsing rather than with
# Series.replace("\\N", None): that call forward-fills instead of inserting None
# on older pandas releases, and it left the column dtype depending on whether a
# given chunk happened to contain "\N".
df_list = pd.read_csv(
    "data/title.episode.tsv",
    sep="\t",
    names=["tconst", "parenttconst", "seasonnumber", "episodenumber"],
    header=0,  # the file's own header row is replaced by `names`
    na_values=["\\N"],
    # Nullable integers keep whole numbers whole even when a chunk has gaps.
    dtype={"seasonnumber": "Int64", "episodenumber": "Int64"},
    chunksize=20000,
)
with engine.connect() as connection:
    # One transaction for the whole file: an error in any chunk rolls back all of them.
    with connection.begin():
        for df in df_list:
            df.to_sql(
                'title_episode',
                connection,
                schema='sebastien',
                if_exists='append',
                index=False,
            )

In [15]:

from sqlalchemy import null


# Stream data/title.akas.tsv in 20,000-row chunks and append each chunk to
# sebastien.title_akas.
df_list = pd.read_csv(
    "data/title.akas.tsv",
    sep="\t",
    names=["tconst", "ordering", "title", "region", "language", "types", "attributes", "isoriginaltitle"],
    header=0,  # the file's own header row is replaced by `names`
    na_values=["\\N"],  # "\N" becomes a real missing value (NULL in the database)
    dtype={"ordering": "Int64"},
    # Titles are free text: a stray double quote must not be treated as a field
    # delimiter, otherwise the parser glues neighbouring fields together.
    quoting=csv.QUOTE_NONE,
    chunksize=20000,
)
with engine.connect() as connection:
    # One transaction for the whole file: an error in any chunk rolls back all of them.
    with connection.begin():
        for df in df_list:
            # NOTE(review): astype(bool) yields True for every non-empty string and
            # for NaN, so this column ends up True on every row. Kept as-is because
            # the type of title_akas.types is not visible here - confirm against
            # the table definition whether this column should stay text.
            df["types"] = df["types"].astype(bool)
            # Parse the 0/1 flag numerically first: astype(bool) on a chunk that was
            # read as text would turn the string "0" into True.
            flags = pd.to_numeric(df["isoriginaltitle"], errors="coerce")
            # Anything other than 0 or 1 (including missing) becomes NA -> NULL.
            df["isoriginaltitle"] = flags.eq(1).where(flags.isin([0, 1])).astype("boolean")
            df.to_sql(
                'title_akas',
                connection,
                schema='sebastien',
                if_exists='append',
                index=False,
            )

In [None]:
from sqlalchemy import String, null


# Load the first 200,000 data rows of data/title.ratings.tsv and append them to
# sebastien.title_ratings.
# The previous cleanup looked for "\\NN" (a marker that never occurs) and passed
# the sqlalchemy `null` function object as the replacement value, so it never
# did anything useful. Missing values are now recognised while parsing.
df_list = pd.read_csv(
    "data/title.ratings.tsv",
    sep="\t",
    names=["tconst", "averagerating", "numvotes"],
    header=0,  # the file's own header row is replaced by `names`
    na_values=["\\N"],
    nrows=200000,  # NOTE(review): partial load - drop nrows to import the full file
)
with engine.connect() as connection:
    # Single transaction: the insert is committed as a whole or not at all.
    with connection.begin():
        df_list.to_sql(
            'title_ratings',
            connection,
            schema='sebastien',
            if_exists='append',
            index=False,
        )

In [None]:
from sqlalchemy import String, null


# Load the first 200,000 data rows of data/title.basics.tsv and append them to
# sebastien.title_basics.
df_list = pd.read_csv(
    "data/title.basics.tsv",
    sep="\t",
    names=["tconst", "titletype", "primarytitle", "originaltitle", "isadult", "startyear", "endyear", "runtimeminutes", "genres"],
    header=0,  # the file's own header row is replaced by `names`
    # Recognise "\N" while parsing instead of calling replace("\\N", None) per
    # column: that call forward-fills rather than inserting None on older pandas.
    na_values=["\\N"],
    # Titles may contain double quotes. With quote handling enabled, a title that
    # opens with a quote swallows the following tab, which shifts the remaining
    # columns of that row one position to the left.
    quoting=csv.QUOTE_NONE,
    nrows=200000,  # NOTE(review): partial load - drop nrows to import the full file
)
with engine.connect() as connection:
    # Single transaction: the insert is committed as a whole or not at all.
    with connection.begin():
        # astype(bool) turned the string "0" into True and a missing value into
        # False; parse the flag numerically and keep missing values missing.
        flags = pd.to_numeric(df_list["isadult"], errors="coerce")
        df_list["isadult"] = flags.eq(1).where(flags.isin([0, 1])).astype("boolean")
        df_list.to_sql(
            'title_basics',
            connection,
            schema='sebastien',
            if_exists='append',
            index=False,
        )

In [None]:
# Read the complete title.basics file into memory.
# sep was '\\t' (backslash + t): a two-character separator is interpreted as a
# regular expression and forces pandas onto its slow Python parsing engine.
# A real tab keeps the C engine; QUOTE_NONE preserves the regex path's behaviour
# of not treating double quotes inside titles as field delimiters.
df_basics = pd.read_csv(
    "data/title.basics.tsv",
    sep="\t",
    quoting=csv.QUOTE_NONE,
    names=["tconst", "titletype", "primarytitle", "originaltitle", "isadult", "startyear", "endyear", "runtimeminutes", "genres"],
    header=0,  # the file's own header row is replaced by `names`
)
df_basics.head()

In [None]:
# Read the complete title.ratings file into memory.
# sep was '\\t' (backslash + t): a two-character separator is interpreted as a
# regular expression and forces pandas onto its slow Python parsing engine.
# A literal tab selects the same columns with the default C engine.
df_ratings = pd.read_csv(
    'data/title.ratings.tsv',
    sep='\t',
    header=0,  # the file's own header row is replaced by `names`
    names=['tconst', 'averagerating', 'numvotes'],
)
df_ratings.head()

In [8]:
# Left join: every basics row is kept; rating columns stay empty where no
# ratings row shares the same tconst.
df = pd.merge(df_basics, df_ratings, how='left', on='tconst')

In [None]:
# Preview the first five rows of the merged frame.
df.head(5)

In [None]:
# Turn the "\N" placeholder into a real missing value in every column at once.
# The dict form is used on purpose: Series.replace("\\N", None) forward-fills the
# previous row's value instead of inserting None on older pandas releases,
# whereas {"\\N": None} always maps the marker to None.
df = df.replace({"\\N": None})
df.head()

In [11]:
# Write the merged frame to disk without the row index so it can be re-read in chunks.
df.to_csv(path_or_buf="data/test_import.csv", index=False)

In [4]:
# Re-open the exported CSV lazily: `df` is now a chunk reader, and iterating it
# yields DataFrames of up to 100,000 rows each.
df = pd.read_csv(filepath_or_buffer="data/test_import.csv", chunksize=100_000)

In [None]:
# Show the dtypes pandas infers for the exported CSV, using a small sample.
# `d` is only bound by the load loop in the next cell, so `d.dtypes` raised
# NameError on a fresh kernel; reading a sample from the file also avoids
# consuming rows from the `df` chunk reader that the load loop needs.
pd.read_csv("data/test_import.csv", nrows=1000).dtypes

In [None]:
# Append every chunk of the exported CSV to sebastien.title_basics, repairing
# rows whose primarytitle swallowed the originaltitle (the two titles end up in
# one field, joined by a tab) before they are written.

# Columns to the right of primarytitle, in file order. In a broken row each of
# them holds the value that belongs to the column on its right.
shifted_columns = ["originaltitle", "isadult", "startyear", "endyear", "runtimeminutes", "genres"]

with engine.connect() as connection:
    # One transaction for the whole file: an error in any chunk rolls back all of them.
    with connection.begin():
        for d in df:
            # Find rows with tab issues
            mask = d['primarytitle'].str.contains('\t', na=False)
            if mask.any():
                # Boolean indexing returns a copy, so this is a snapshot of the
                # broken rows taken before any of the writes below.
                affected_rows = d[mask]
                split_rows = affected_rows['primarytitle'].str.split('\t', expand=True)
                # isadult is about to receive text taken from originaltitle;
                # widen it first so the assignment is not a dtype mismatch.
                d["isadult"] = d["isadult"].astype(object)
                d.loc[mask, 'primarytitle'] = split_rows[0]
                d.loc[mask, 'originaltitle'] = split_rows[1]
                # Move every remaining basics value one column to the right:
                # isadult <- originaltitle, startyear <- isadult, ..., genres <- runtimeminutes.
                for target, source in zip(shifted_columns[1:], shifted_columns[:-1]):
                    d.loc[mask, target] = affected_rows[source]
                # averagerating / numvotes are deliberately left untouched. Assuming
                # this CSV is the basics+ratings merge written earlier in the
                # notebook, they were joined on tconst and were never shifted;
                # the previous code overwrote them with genres / averagerating.

            # Parse the 0/1 flag numerically: astype(bool) mapped the string "0"
            # and NaN to True. Anything other than 0 or 1 becomes NA -> NULL.
            flags = pd.to_numeric(d["isadult"], errors="coerce")
            d["isadult"] = flags.eq(1).where(flags.isin([0, 1])).astype("boolean")
            # Stringify genres but keep missing values missing; a plain astype(str)
            # stored them as the literal text "nan".
            genres = d["genres"]
            d["genres"] = genres.astype(str).where(genres.notna())

            d.to_sql(
                'title_basics',
                connection,
                schema='sebastien',
                if_exists='append',
                index=False,
            )
