In [1]:
# imports
import json
import re
from collections import defaultdict

import numpy as np
import pandas as pd
from tqdm import tqdm

In [None]:
# Delimiter patterns for taking apart the JSON-like text stored in list-valued cells.
# cell_regex matches the separators between key/value pairs and between objects:
#   `",`  (a closing quote followed by a comma),
#   `, "` (a comma followed by an opening quote),
#   `},`  (a closing brace followed by a comma).
cell_regex = r'\",|, "|},' #\d,
# dict_regex matches `":`, i.e. a closing quote directly followed by a colon
# (the boundary between a quoted key and its value).
dict_regex = r'\":'

In [2]:
def get_normalized_df(df: pd.DataFrame, col: str, keys: list[str], key_attribute: str="movie_id") -> pd.DataFrame:
    """Explode a list-valued column into one output row per list entry.

    Every cell of ``df[col]`` is parsed with ``clean_cell`` into a list of
    mappings. Each mapping becomes one output row holding the requested
    ``keys`` plus an ``"id"`` column that carries the ``key_attribute`` value
    of the source row the entry came from.

    Args:
        df: Source frame. It is only read, never modified.
        col: Name of the column whose cells hold the serialized lists.
        keys: Fields to pull out of every list entry. A field that is absent
            from an entry yields ``None``.
        key_attribute: Column of ``df`` whose value is copied into the
            ``"id"`` column of every row derived from that source row.

    Returns:
        A new frame with the columns ``keys`` followed by ``"id"`` and a fresh
        0..n-1 index. When there are no entries at all (including an empty
        ``df``) the frame is empty but still has these columns.
    """
    parsed_cells = df[col].map(clean_cell)

    # Collect plain records and build the frame once; creating and
    # concatenating one small DataFrame per source row is far slower and
    # fails outright when there is nothing to concatenate.
    records = []
    for parent_id, entries in zip(df[key_attribute], parsed_cells):
        for entry in entries:
            record = {key: entry.get(key) for key in keys}
            # Assigned last so it wins over an entry field that is also called "id".
            record["id"] = parent_id
            records.append(record)

    # dict.fromkeys removes a duplicate "id" (if it was requested in `keys`)
    # while keeping the column order.
    columns = list(dict.fromkeys([*keys, "id"]))
    return pd.DataFrame(records, columns=columns)

def trim_string(string: str) -> str:
    """Keep only letters, digits and spaces, then strip leading/trailing whitespace."""
    kept = [c for c in string if c.isdigit() or c.isalpha() or c == " "]
    return "".join(kept).strip()


def clean_cell(string: str) -> list[dict[str, str]]:
    """Parse one JSON-encoded cell into a list of normalized records.

    The cell is expected to hold a JSON array of objects (a single JSON object
    is treated as a one-element array). It is decoded with ``json.loads`` so
    that colons, commas or quotes inside values and ``\\uXXXX`` escapes are
    handled correctly; splitting the raw text on delimiter patterns
    mis-groups or crashes on such values.

    Keys and values are then normalized with ``trim_string``. Non-string
    scalars are first rendered as their JSON text, so ``28`` becomes ``"28"``
    and ``null`` becomes ``"null"``. Because keys are trimmed as well, a key
    such as ``cast_id`` is stored as ``castid``.

    Args:
        string: The raw cell text. Anything that is not a string (for example
            the NaN pandas uses for a missing cell) or is blank counts as an
            empty list.

    Returns:
        One ``defaultdict`` per JSON object, in order. Looking up a key that
        was not present in the object returns ``None``.

    Raises:
        json.JSONDecodeError: If the text is not valid JSON.
    """
    # Missing cells arrive from pandas as NaN (a float), not as text.
    if not isinstance(string, str) or not string.strip():
        return []

    parsed = json.loads(string)
    if isinstance(parsed, dict):
        parsed = [parsed]

    records = []
    for entry in parsed:
        record = defaultdict(lambda: None)
        for key, value in entry.items():
            # Render numbers / null / booleans as JSON text before trimming so
            # they end up as plain strings.
            text = value if isinstance(value, str) else json.dumps(value)
            record[trim_string(key)] = trim_string(text)
        records.append(record)
    return records

In [4]:
# Locations of the raw CSV inputs.
MOVIES_CSV = "data/movies_groupA.csv"
CREDITS_CSV = "data/credits_groupA.csv"
# NOTE(review): absolute, machine-specific path, unlike the two relative paths
# above. The notebook cannot run elsewhere as written; consider keeping this
# file next to the other inputs.
RECOM_CSV = "C:/Users/ibele/OneDrive/DHBW/4. Semester/Big Data Storage/Prüfungsleistung/groupA/recommendations.csv"

# Load every input file into its own DataFrame.
df_movies = pd.read_csv(MOVIES_CSV)
df_credits = pd.read_csv(CREDITS_CSV)
df_recom = pd.read_csv(RECOM_CSV)

In [5]:
# Save the recommendations unchanged and the selected movie columns to the
# filtered data folder.
# NOTE(review): both files are written with pandas' default index column,
# whereas the normalized tables are exported with index=False. Confirm the
# extra column is wanted.
movie_columns = ["title","release_date","runtime","revenue","budget","popularity","id"]
movies_selected = df_movies[movie_columns]

df_recom.to_csv("filtered_data/recom.csv")
movies_selected.to_csv("filtered_data/movies.csv")

In [6]:
# Tables to derive. The key is both the source column and the output file name;
# the value is (source frame, id column in that frame, fields to keep per entry).
new_tables = dict(
    spoken_languages=(df_movies, "id", ["name"]),
    genres=(df_movies, "id", ["name"]),
    cast=(df_credits, "movie_id", ["name", "character", "gender"]),
    crew=(df_credits, "movie_id", ["name", "job", "gender"]),
)

# Normalize each list-valued column and write it to its own CSV (without index).
for table, (df, id_name, cols) in new_tables.items():
    normalized_df = get_normalized_df(df=df, col=table, keys=cols, key_attribute=id_name)
    out_path = f"filtered_data/{table}.csv"
    normalized_df.to_csv(out_path, index=False)