Loading the datasets


In [4]:
import pandas as pd
import json
import xml.etree.ElementTree as ET
import os

# Locations of the four input files, given relative to the working directory.
# The AniList export is read from its CSV version (not the XLSX one).
paths = {
    name: os.path.join(*parts)
    for name, parts in (
        ("anilist_csv", ("Datasets", "Holy_Dataset", "anilist_anime_data_complete.csv")),
        ("mal_csv", ("Datasets", "anime-dataset-2023.csv")),
        ("offline_db_json", ("Datasets", "anime-offline-database.json")),
        ("user_xml", ("Datasets", "anilist.xml")),
    )
}

def _read_offline_json(path):
    """Read the offline-database JSON file at *path* into a DataFrame."""
    with open(path, 'r', encoding='utf-8') as f:
        payload = json.load(f)

    # The records may be wrapped under a top-level "data" key; otherwise the
    # parsed document itself is handed to pandas.
    if isinstance(payload, dict) and 'data' in payload:
        payload = payload['data']
    return pd.DataFrame(payload)


def _read_user_xml(path):
    """Read the user-list XML at *path*: one row per <anime> child of the root.

    Each row maps the tag of every direct child of the <anime> element to
    that child's text, so all values are strings (or None for empty tags).
    """
    root = ET.parse(path).getroot()
    rows = [{child.tag: child.text for child in anime}
            for anime in root.findall('anime')]
    return pd.DataFrame(rows)


def _load_or_empty(file_paths, key, reader, loading, loaded, unit, failed):
    """Load one dataset, falling back to an empty DataFrame on any error.

    Args:
        file_paths: Mapping from dataset key to file path.
        key: Entry of *file_paths* to load.
        reader: Callable that takes the path and returns a DataFrame.
        loading: Label for the "Loading ... from" progress message.
        loaded: Label for the success message.
        unit: Noun printed after the row count in the success message.
        failed: Label for the error message.

    Returns:
        The DataFrame produced by *reader*, or an empty DataFrame if the key
        is missing or the file cannot be read/parsed.
    """
    try:
        # The lookup happens inside the try so that a missing key is reported
        # the same way as any other load failure.
        path = file_paths[key]
        print(f"Loading {loading} from {path}...")
        df = reader(path)
        print(f"✅ Loaded {loaded}: {df.shape[0]} {unit}")
    except Exception as e:
        # Deliberately broad: loading is best-effort, one bad file must not
        # prevent the remaining datasets from loading.
        print(f"❌ Error loading {failed}: {e}")
        df = pd.DataFrame()
    return df


def load_all_data(file_paths=None):
    """Load the AniList CSV, MAL CSV, offline-database JSON and user XML.

    Each source is loaded independently; a source that fails to load is
    reported on stdout and replaced by an empty DataFrame.

    Args:
        file_paths: Optional mapping with the keys ``anilist_csv``,
            ``mal_csv``, ``offline_db_json`` and ``user_xml``. Defaults to
            the module-level ``paths``.

    Returns:
        A tuple ``(df_anilist, df_mal, df_offline, df_user)``.
    """
    if file_paths is None:
        file_paths = paths

    print("--- Loading Datasets ---")

    # 1. AniList CSV file (Holy_Dataset)
    df_anilist = _load_or_empty(
        file_paths, 'anilist_csv', pd.read_csv,
        loading="AniList data", loaded="AniList Data", unit="rows",
        failed="AniList CSV",
    )

    # 2. MAL CSV file
    df_mal = _load_or_empty(
        file_paths, 'mal_csv', pd.read_csv,
        loading="MAL data", loaded="MAL Data", unit="rows",
        failed="MAL CSV",
    )

    # 3. Anime Offline Database (JSON)
    df_offline = _load_or_empty(
        file_paths, 'offline_db_json', _read_offline_json,
        loading="Offline Database", loaded="Offline DB", unit="rows",
        failed="JSON",
    )

    # 4. Personal user list (XML)
    df_user = _load_or_empty(
        file_paths, 'user_xml', _read_user_xml,
        loading="User XML", loaded="User Data", unit="entries",
        failed="User XML",
    )

    return df_anilist, df_mal, df_offline, df_user

if __name__ == "__main__":
    # Run the loader
    anilist_df, mal_df, offline_df, user_df = load_all_data()

    # Preview the data
    print("\n--- Previews ---")
    if not anilist_df.empty:
        print("\nAniList Sample:")
        # Show only the wanted columns that are actually present; the CSV
        # header names may differ from the expected ones.
        cols_to_show = [col for col in ['title_english', 'averageScore', 'popularity'] if col in anilist_df.columns]
        print(anilist_df[cols_to_show].head(3))

    if not user_df.empty:
        print("\nUser List Sample:")
        # Same guard as for the AniList preview: selecting a column that is
        # absent from the frame would raise KeyError.
        user_cols_to_show = [col for col in ['series_title', 'my_score', 'my_status'] if col in user_df.columns]
        print(user_df[user_cols_to_show].head(3))

--- Loading Datasets ---
Loading AniList data from Datasets\Holy_Dataset\anilist_anime_data_complete.csv...
✅ Loaded AniList Data: 20099 rows
Loading MAL data from Datasets\anime-dataset-2023.csv...
✅ Loaded MAL Data: 24905 rows
Loading Offline Database from Datasets\anime-offline-database.json...
✅ Loaded Offline DB: 39277 rows
Loading User XML from Datasets\anilist.xml...
✅ Loaded User Data: 328 entries

--- Previews ---

AniList Sample:
                title_english  averageScore  popularity
0  Tales of the Street Corner          62.0        2046
1                         NaN          57.0         409
2        Kimba the White Lion          61.0        2449

User List Sample:
     series_title my_score  my_status
0  Bungaku Shoujo        6  Completed
1  86: Eighty Six        7  Completed
2       A-Channel        4    Dropped
