# Like the other notebook, this one cleans the data and produces the frontend dataset.

In [None]:
import csv

import pandas as pd

# The IMDb dumps are plain tab-separated text without a quoting convention, so a double
# quote inside a title is just a character. QUOTE_NONE stops pandas from treating it as
# the start of a quoted field, which would otherwise merge following fields/rows together.
title_akas_df = pd.read_csv('data/title.akas.tsv', sep='\t', quoting=csv.QUOTE_NONE)
title_basics_df = pd.read_csv('data/title.basics.tsv', sep='\t', quoting=csv.QUOTE_NONE)
#title_ratings_df = pd.read_csv('data/title.ratings.tsv', sep='\t')
#title_principals_df = pd.read_csv('data/title.principals.tsv', sep='\t')
#name_df = pd.read_csv('data/name.basics.tsv', sep='\t')

In [None]:
# Filter data to include only movies
movies_df = title_basics_df[title_basics_df['titleType'] == 'movie'].copy()

# Drop adult movies.
# The comparison is done on a numeric view of the column: when pandas infers types
# chunk by chunk, isAdult can end up holding both ints and strings, and a plain
# `== 0` would silently discard rows whose value is the string '0'.
# Values that cannot be parsed become NaN and are excluded, as before.
is_adult = pd.to_numeric(movies_df['isAdult'], errors='coerce')
movies_df = movies_df[is_adult == 0].copy()  # explicit copy so later edits do not target a slice
movies_df.head()

In [None]:
# Drop columns that the frontend does not need.
# Reassigning instead of using inplace=True avoids mutating a filtered slice, and
# errors='ignore' keeps this cell re-runnable after the columns are already gone.
movies_df = movies_df.drop(columns=['titleType', 'isAdult', 'endYear'], errors='ignore')
movies_df.head()

In [None]:
import os

# Create the output folder first so the export does not fail on a fresh checkout.
os.makedirs('frontendData', exist_ok=True)

# NOTE(review): to_csv also writes the DataFrame index as an unnamed first column,
# while the JSON export (orient='records') does not include it — confirm that the
# CSV consumer expects this before changing it.
movies_df.to_csv('frontendData/frontend_movies.csv')
movies_df.to_json('frontendData/frontend_movies.json', orient='records')

In [None]:
import json
import threading
import random
import time

# Load the movie records written by the export cell
with open("frontendData/frontend_movies.json", "r") as f:
    data = json.load(f)  # data is a LIST, not a dictionary

from webscraping import movie_api_wrapper as ib
parser = ib.IMDbParser()

# Shared progress counter; `lock` guards every update made from worker threads
counter = 0
lock = threading.Lock()

# Number of records to process. Set SAMPLE_LIMIT to a small integer (e.g. 3) for a
# quick trial run; None processes every record. `total` never exceeds len(data), so
# the progress messages always report a reachable total.
SAMPLE_LIMIT = None
total = len(data) if SAMPLE_LIMIT is None else min(SAMPLE_LIMIT, len(data))

# Function to enrich a single record (despite the name, nothing is uploaded here)
def upload_document(index, item):
    """Look up one movie via `parser` and store its cast and poster URL in `data[index]`.

    Args:
        index: Position of the record inside the module-level `data` list.
        item: The record itself; its "tconst" value is used as the lookup key.

    The function never raises for lookup failures: any exception is printed and the
    record is left without the extra fields. The shared progress counter is advanced
    exactly once per call, always while holding `lock`.
    """
    global counter
    doc_id = None  # bound up front so the error message can always reference it
    try:
        time.sleep(random.uniform(0.1, 3.0))  # Random delay

        doc_id = item.get("tconst")  # Use "tconst" as the document ID

        if doc_id:
            results = parser.search(doc_id)
            if results:  # Ensure results is not empty
                first_hit = results[0]
                # `or` also covers keys that are present but hold None / empty values
                data[index]['cast'] = first_hit.get('cast') or []
                data[index]['poster'] = (first_hit.get('poster') or {}).get('url') or ''

    except Exception as e:
        print(f"Unkown error for {doc_id}: {e}")

    finally:
        with lock:  # counter is shared between threads, so every update is locked
            counter += 1
            print(f"({counter} out of {total} done so far)")


# Use a bounded thread pool for faster retrieval.
# Starting one thread per record does not scale to a large dataset; the pool caps the
# number of lookups that run at the same time.
from concurrent.futures import ThreadPoolExecutor

MAX_WORKERS = 32  # upper bound on simultaneous lookups; tune to what the API tolerates

with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    futures = [
        executor.submit(upload_document, index, item)
        for index, item in enumerate(data[:total])  # Limit to `total`
    ]
# Leaving the `with` block waits for all submitted tasks to finish

# Write the enriched records back to disk as pretty-printed JSON
with open('frontendData/complete_frontend_movies.json', 'w') as out_file:
    json.dump(obj=data, fp=out_file, indent=4)

print("Dumped new JSON with progress tracking")


In [None]:
# Upload to Firebase
#!pip install firebase-admin

In [None]:
import firebase_admin
from firebase_admin import credentials, firestore
import json

import os

# Apparently (per Stack Overflow) there is no "upload a JSON file to Firestore" helper,
# so we read the JSON file ourselves and write each record to Firestore manually.

# firestore.client() needs an initialised default app. Initialise it once and reuse it
# on re-runs: get_app() raises ValueError when no default app exists yet.
# The service-account key path comes from the environment so it is not hardcoded here.
try:
    firebase_admin.get_app()
except ValueError:
    cred = credentials.Certificate(os.environ.get("GOOGLE_APPLICATION_CREDENTIALS", ""))
    firebase_admin.initialize_app(cred)

db = firestore.client()

# Read the exported movie records (a list of dicts, one per movie)
# NOTE(review): this reads frontend_movies.json, not the enriched
# complete_frontend_movies.json written by the enrichment cell — confirm which file is intended.
with open("frontendData/frontend_movies.json", "r") as json_file:
    data = json.load(json_file)

# Target Firestore collection
collection_name = "moviedata"

# Write every record that has a tconst, using the tconst as the document ID
for item in data:
    doc_id = item.get("tconst")
    if not doc_id:
        continue
    db.collection(collection_name).document(doc_id).set(item)
    print(f"Uploaded document: {doc_id}")

print("uplaoded")


In [None]:
import json
import threading

# Firestore handle (relies on the Firebase app initialised in an earlier cell)
db = firestore.client()

# Read the exported movie records; the result is a list, not a dictionary
with open("frontendData/frontend_movies.json", "r") as json_file:
    data = json.loads(json_file.read())
# Find the index of the last uploaded document
# NOTE(review): no code follows the comment above — resuming from the last upload is not implemented here.

from webscraping import movie_api_wrapper as ib
parser = ib.IMDbParser()  # NOTE(review): not referenced again in the visible part of this cell


# Function to Upload a Single Document
# NOTE(review): this reuses the name of the enrichment helper defined in an earlier
# cell (which takes (index, item)); after this cell runs, the name refers to this version.
def upload_document(item):
    """Write one movie record to the "moviedata" Firestore collection.

    Args:
        item: The record to upload; its "tconst" value becomes the document ID.
            Records without a "tconst" are skipped.

    Failures are printed rather than raised, so one bad record does not stop the
    remaining uploads.
    """
    doc_id = None  # bound up front so the error message can always reference it
    try:
        doc_id = item.get("tconst")  # Use "tconst" as the document ID

        if doc_id:
            db.collection("moviedata").document(doc_id).set(item)
            print(f"Uploaded document: {doc_id}")
    except Exception as e:
        print(f"Error uploading {doc_id}: {e}")
        
        
# Use a bounded thread pool for faster uploads.
# Starting one thread per record does not scale to a large dataset; the pool caps the
# number of writes that run at the same time.
from concurrent.futures import ThreadPoolExecutor

MAX_UPLOAD_WORKERS = 32  # upper bound on simultaneous writes; tune as needed

with ThreadPoolExecutor(max_workers=MAX_UPLOAD_WORKERS) as executor:
    futures = [executor.submit(upload_document, item) for item in data]
# Leaving the `with` block waits for all submitted uploads to finish

print("JSON Upload Complete with Threading")