# Inizializzazione dati
Questa cella di inizializzazione carica tutti i file CSV presenti nella cartella `datasets` e crea:

- un dizionario `dfs` con chiavi come i nomi dei file (es. `"characters.csv"`) e i rispettivi `DataFrame`;
- variabili globali con nomi "puliti" (es. `characters` per `characters.csv`);
- la funzione `show_info(df_or_name, n=5)` per visualizzare forma, tipi, percentuale di valori mancanti e un'anteprima delle prime `n` righe;
- la funzione `reload_datasets()` per ricaricare i file dal disco.

Esempi d'uso:

- `show_info('characters.csv')` o `show_info(characters)`
- `characters.head()`
- `dfs['ratings.csv'].shape`
- `reload_datasets()` (se modifichi i CSV su disco)

Nota: se i file sono molto grandi, puoi modificare `read_csv_safe` o usare `nrows` per caricare un campione durante l'esplorazione.

In [5]:
# Inizializzazione: carica tutti i CSV dalla cartella 'datasets' e crea DataFrame accessibili
import pandas as pd
import numpy as np
from pathlib import Path
import re

DATA_DIR = Path("datasets")
if not DATA_DIR.exists():
    raise FileNotFoundError(f"Cartella 'datasets' non trovata: {DATA_DIR.resolve()}")

# Collect every CSV below DATA_DIR (recursively), leaving out system files:
# anything inside a '__MACOSX' folder and any file whose name starts with '._'.
csv_files = sorted(
    p
    for p in DATA_DIR.rglob('*.csv')
    if p.is_file()
    and p.suffix.lower() == '.csv'
    and '__MACOSX' not in p.parts
    and not p.name.startswith('._')
)
print("CSV trovati:", [str(p.relative_to(DATA_DIR)) for p in csv_files])


def read_csv_safe(path, nrows=None):
    """Read a CSV into a DataFrame, falling back to latin1 if UTF-8 decoding fails.

    Args:
        path: Location of the CSV file, passed straight to ``pd.read_csv``.
        nrows: Optional maximum number of data rows to read (``None`` reads all).

    Returns:
        The DataFrame produced by ``pd.read_csv(..., low_memory=False)``.

    Raises:
        Exception: Any ``pd.read_csv`` error that is not a decoding error is
            raised immediately; if decoding fails with every encoding, the
            last decoding error is raised.
    """
    last_exc = None
    # pandas decodes as UTF-8 by default, so a separate "no encoding" attempt
    # would only repeat the same read.
    for enc in ("utf-8", "latin1"):
        try:
            return pd.read_csv(path, nrows=nrows, low_memory=False, encoding=enc)
        except UnicodeError as e:
            # Only a decoding failure can be cured by trying another encoding.
            # Other errors (missing file, parser out of memory, ...) would fail
            # again after re-reading the whole file, so they propagate at once.
            last_exc = e
    raise last_exc


# Load every discovered CSV. `dfs` maps the file name (e.g. "characters.csv")
# to its DataFrame; `varnames` lists the global variable names created below.
dfs = {}
varnames = []
for p in csv_files:
    try:
        df = read_csv_safe(p)
    except Exception as e:
        # One unreadable file must not prevent the others from loading.
        print(f"Skipping {p} due to read error: {e}")
        continue
    try:
        df = df.convert_dtypes()
    except Exception as e:
        # Best effort: keep the dtypes inferred by the reader, but say so.
        print(f"convert_dtypes failed for {p}, keeping inferred dtypes: {e}")
    if p.name in dfs:
        # The scan is recursive, so two folders may hold the same file name;
        # the later file still wins, but no longer silently.
        print(f"Warning: duplicate file name {p.name!r}; {p} replaces the one loaded earlier")
    dfs[p.name] = df

    # Expose the frame as a global named after the file stem, e.g. 'characters';
    # characters outside [0-9a-zA-Z_] become underscores.
    var_name = re.sub(r"[^0-9a-zA-Z_]", "_", p.stem.lower())
    globals()[var_name] = df
    varnames.append(var_name)


def show_info(df_or_name, n=5):
    """Print a quick overview of a DataFrame and display its first rows.

    Args:
        df_or_name: Either the object to inspect, or a string that is looked
            up first as a key of ``dfs`` (e.g. ``"characters.csv"``) and then
            as a global variable name (e.g. ``"characters"``).
        n: Number of rows shown in the preview.

    Returns:
        None. Prints the shape, the dtypes and the percentage of missing
        values per column, then displays ``df.head(n)``. Prints
        "DataFrame non trovato" when nothing suitable is found.
    """
    if isinstance(df_or_name, str):
        df = dfs.get(df_or_name)
        if df is None:
            candidate = globals().get(df_or_name)
            # A global with that name may be a module or a function (e.g. 'pd',
            # 're'); treat anything that is not a pandas object as not found
            # instead of failing later on `.shape`.
            df = candidate if isinstance(candidate, (pd.DataFrame, pd.Series)) else None
    else:
        df = df_or_name
    if df is None:
        print("DataFrame non trovato")
        return
    print(f"Shape: {df.shape}")
    print("\nColumns and dtypes:")
    print(df.dtypes)
    print("\nMissing values (%):")
    print((df.isnull().mean() * 100).round(2))
    display(df.head(n))


def reload_datasets():
    """Reload every CSV under DATA_DIR (useful when the files change on disk).

    Rebinds the globals ``dfs`` and ``varnames`` from scratch, re-creates one
    global variable per file (named after the sanitized file stem) and prints
    the list of reloaded file names. Files that cannot be read are skipped
    with a message.

    Returns:
        None.
    """
    global dfs, varnames
    dfs = {}
    varnames = []
    for p in sorted(DATA_DIR.rglob('*.csv')):
        if not p.is_file() or p.suffix.lower() != ".csv":
            continue
        # Same exclusions as the initial scan: skip '__MACOSX' folders and
        # '._*' files, which are system files rather than datasets.
        if '__MACOSX' in p.parts or p.name.startswith('._'):
            continue
        try:
            df = read_csv_safe(p)
        except Exception as e:
            print(f"Skipping {p} due to read error: {e}")
            continue
        try:
            df = df.convert_dtypes()
        except Exception as e:
            # Best effort: keep the dtypes inferred by the reader, but say so.
            print(f"convert_dtypes failed for {p}, keeping inferred dtypes: {e}")
        dfs[p.name] = df
        var_name = re.sub(r"[^0-9a-zA-Z_]", "_", p.stem.lower())
        globals()[var_name] = df
        varnames.append(var_name)
    print("Ricaricati:", list(dfs))

# Initial summary: one line per loaded file with its shape, followed by the
# list of global variable names created for the DataFrames.
print("\nDataFrame caricati:")
for name, df in dfs.items():
    print(f" - {name}: {df.shape}")
print("\nVariabili disponibili (es.: 'characters' per accedere a 'characters.csv'):")
print(varnames)

CSV trovati: ['datasets\\character_anime_works.csv', 'datasets\\character_nicknames.csv', 'datasets\\characters.csv', 'datasets\\details.csv', 'datasets\\favs.csv', 'datasets\\person_alternate_names.csv', 'datasets\\person_anime_works.csv', 'datasets\\person_details.csv', 'datasets\\person_voice_works.csv', 'datasets\\profiles.csv', 'datasets\\ratings.csv', 'datasets\\recommendations.csv', 'datasets\\stats.csv']
Skipping datasets\datasets\ratings.csv due to read error: Error tokenizing data. C error: out of memory

DataFrame caricati:
 - character_anime_works.csv: (236816, 4)
 - character_nicknames.csv: (37080, 2)
 - characters.csv: (209963, 7)
 - details.csv: (28955, 29)
 - favs.csv: (4178747, 3)
 - person_alternate_names.csv: (20465, 2)
 - person_anime_works.csv: (458091, 3)
 - person_details.csv: (76699, 10)
 - person_voice_works.csv: (489516, 5)
 - profiles.csv: (337155, 10)
 - recommendations.csv: (105249, 2)
 - stats.csv: (28955, 27)

Variabili disponibili (es.: 'characters' per 

In [2]:
# Check the working directory and what the 'datasets' folder contains
from pathlib import Path
p = Path.cwd()
print("Current working directory:", p)
print("Files in CWD:", [x.name for x in p.iterdir()])

data_dir = Path("datasets")
print("DATA_DIR resolved:", data_dir.resolve())
print("DATA_DIR exists:", data_dir.exists())
if not data_dir.exists():
    # Look for a 'datasets' folder in the CWD and in each of its ancestors
    found = [str(d / "datasets") for d in [p, *p.parents] if (d / "datasets").exists()]
    print("Found 'datasets' at:", found)
else:
    print("Contents of DATA_DIR:", [x.name for x in sorted(data_dir.iterdir())])

Current working directory: c:\Users\Utente\Desktop\STUDIO\Terzo anno\1 - Human computer interaction\Lab
Files in CWD: ['Assignment.pdf', 'datasets', 'datasets.zip', 'file.ipynb', 'HCI Assignment 2025-2026.docx', 'Self Assessment Form - HCI.xlsx']
DATA_DIR resolved: C:\Users\Utente\Desktop\STUDIO\Terzo anno\1 - Human computer interaction\Lab\datasets
DATA_DIR exists: True
Contents of DATA_DIR: ['__MACOSX', 'datasets']


In [7]:
# Helpers for working with large CSVs and inspecting 'ratings.csv' (if present)
def count_csv_rows(path):
    """Count the data rows of a CSV by streaming it line by line (low memory).

    Every physical line of the file is counted (bytes that are not valid
    UTF-8 are ignored while decoding) and one line is subtracted for the
    header. The result is never negative.
    """
    with open(path, 'r', encoding='utf-8', errors='ignore') as handle:
        total_lines = sum(1 for _ in handle)
    # Drop the header line; an empty file yields 0 rather than -1.
    return max(0, total_lines - 1)

# Pick out the discovered CSV files whose name contains 'ratings'
ratings_paths = [p for p in csv_files if 'ratings' in p.name.lower()]
print("CSV files contenenti 'ratings':", [str(p) for p in ratings_paths])
if not ratings_paths:
    print("Nessun file 'ratings' trovato nei CSV caricati")
else:
    p = ratings_paths[0]
    print("Ispeziono:", p)
    # Step 1: header only (nrows=0 reads no data rows)
    try:
        print("Header columns:", pd.read_csv(p, nrows=0).columns.tolist())
    except Exception as e:
        print("Impossibile leggere l'header:", e)
    # Step 2: a bounded sample instead of the whole file
    try:
        sample = pd.read_csv(p, nrows=100000, low_memory=False)
        print("Sample shape:", sample.shape)
        display(sample.head())
    except Exception as e:
        print("Lettura del campione fallita:", e)
    # Step 3: streaming line count
    try:
        rows = count_csv_rows(p)
        print("Stima righe:", rows)
    except Exception as e:
        print("Counting rows failed:", e)

CSV files contenenti 'ratings': ['datasets\\datasets\\ratings.csv']
Ispeziono: datasets\datasets\ratings.csv
Header columns: ['username', 'anime_id', 'status', 'score', 'is_rewatching', 'num_watched_episodes']
Sample shape: (100000, 6)


Unnamed: 0,username,anime_id,status,score,is_rewatching,num_watched_episodes
0,--------788,30276,watching,7,0.0,3
1,--------788,28851,completed,7,0.0,1
2,--------788,41168,completed,7,0.0,1
3,--------788,22199,completed,10,0.0,24
4,--------788,16498,completed,10,0.0,25


Stima righe: 124298357


In [None]:
# Handling the very large 'ratings.csv': load a sample and define a helper
# that aggregates the full file chunk by chunk
ratings_paths = [p for p in csv_files if 'ratings' in p.name.lower()]
ratings_path = next(iter(ratings_paths), None)

if ratings_path is None:
    print("Nessun file 'ratings' trovato per creare sample o funzioni")
else:
    print("Ratings path:", ratings_path)
    try:
        ratings_sample = pd.read_csv(ratings_path, nrows=200000, low_memory=False)
        try:
            ratings_sample = ratings_sample.convert_dtypes()
        except Exception:
            # Best effort: keep the dtypes inferred by the reader.
            pass
        # Register the sample next to the fully loaded frames.
        dfs['ratings_sample.csv'] = ratings_sample
        print("ratings_sample caricato:", ratings_sample.shape)
    except Exception as e:
        print("Impossibile caricare il sample di ratings:", e)

    def ratings_agg_by_anime(chunksize=1_000_000):
        """Aggregate 'score' per 'anime_id' without loading the whole file.

        Reads only the 'anime_id' and 'score' columns of ``ratings_path`` in
        chunks of ``chunksize`` rows, drops rows where either value is missing
        and accumulates the per-anime score sum and count across chunks.

        Returns:
            A DataFrame with columns 'anime_id', 'sum', 'count' and
            'avg_score' (sum / count), sorted by 'count' in descending order.
        """
        score_sums = {}
        score_counts = {}
        wanted = ['anime_id', 'score']
        reader = pd.read_csv(ratings_path, usecols=wanted, chunksize=chunksize, low_memory=False)
        for chunk in reader:
            valid = chunk.dropna(subset=wanted)
            per_anime = valid.groupby('anime_id')['score'].agg(['sum', 'count'])
            # Fold this chunk's partial totals into the running totals.
            for aid, part_sum, part_count in zip(per_anime.index, per_anime['sum'], per_anime['count']):
                key = int(aid)
                score_sums[key] = score_sums.get(key, 0) + float(part_sum)
                score_counts[key] = score_counts.get(key, 0) + int(part_count)
        result = pd.DataFrame({
            'anime_id': list(score_sums),
            'sum': list(score_sums.values()),
            'count': [score_counts[key] for key in score_sums],
        })
        result['avg_score'] = result['sum'] / result['count']
        return result.sort_values('count', ascending=False)