In [None]:
import re
import json
import pandas as pd
from anonymizedf.anonymizedf import anonymize

In [None]:
def find_mentions(text):
    """Return every mention-like token found in ``text``.

    A token is one or two '@' characters followed by the run of
    non-whitespace characters that comes right after them.
    """
    return re.findall(r'@{1,2}[^\s]+', text)

In [None]:
import re

def find_mentions(text):
    """
    Extract YouTube-style handles (@username) from ``text``.

    A handle is accepted when:
    - it uses only letters, digits, underscore and dot;
    - it is 3-30 characters long (the '@' excluded);
    - it neither starts nor ends with '.' or '_';
    - it contains neither '..' nor '__'.

    The regex has no trailing boundary, so a longer run of valid characters
    is cut at 30 characters and the 30-character prefix is what gets checked.

    Returns the accepted handles in lower case, each prefixed with '@'.
    """
    handle_re = re.compile(r'@([A-Za-z0-9][A-Za-z0-9_.]{1,28}[A-Za-z0-9])')
    edge_chars = ('.', '_')

    def _is_valid(name):
        # Repeated separators are rejected outright.
        if '..' in name or '__' in name:
            return False
        if not 3 <= len(name) <= 30:
            return False
        return not (name.startswith(edge_chars) or name.endswith(edge_chars))

    return ["@" + name.lower() for name in handle_re.findall(text) if _is_valid(name)]


In [None]:
import re

def remove_info(text):
    """Strip URLs, e-mail addresses and phone-like digit runs from ``text``.

    After the removals, runs of two or more '@' are collapsed into one and
    all whitespace runs are collapsed into a single space; the result is
    returned stripped.
    """
    scrub_patterns = (
        # URLs: http(s)://..., www...., or domain.tld/...
        r'https?://\S+|www\.\S+|\b(?:[a-zA-Z0-9-]+\.)+[a-zA-Z]{2,}\b/\S*',
        # e-mail addresses
        r'\b[\w.-]+?@\w+?\.\w+?\b',
        # phone numbers (e.g. +39 345 6789012, 345-678-9012, 3456789012)
        r'(\+?\d{1,3}[\s-]?)?(\(?\d{3,4}\)?[\s-]?)?\d{3,4}[\s-]?\d{3,4}',
    )
    for scrub in scrub_patterns:
        text = re.sub(scrub, '', text)

    # Collapse repeated '@' into a single one.
    text = re.sub(r'@@+', '@', text)

    # Normalise whitespace.
    return re.sub(r'\s+', ' ', text).strip()


In [None]:
path_dataset = r" "  # PATH: prefix concatenated with each dataset file name when the files are opened (placeholder value)
dataset1 = '.json' # file name of the first dataset (placeholder)
dataset2 = '.json' # file name of the second dataset (placeholder)
dataset3 = '.json' # file name of the third dataset (placeholder)

In [None]:
# Load the first dataset. `dataset1` is rebound here from the file name to the parsed JSON,
# so this cell cannot be re-run without first re-running the configuration cell.
with open(path_dataset+ dataset1, 'r', encoding='utf-8') as f:
    dataset1 = json.load(f)

In [None]:
# Load the second dataset. `dataset2` is rebound here from the file name to the parsed JSON.
with open(path_dataset+dataset2, 'r', encoding='utf-8') as f:
    dataset2 = json.load(f)

In [None]:
# Load the third dataset. `dataset3` is rebound here from the file name to the parsed JSON.
with open(path_dataset+dataset3, 'r', encoding='utf-8') as f:
    dataset3 = json.load(f)

###### Prima rimuovo le informazioni dalle descrizioni e dai commenti

In [None]:
# Number of top-level entries in dataset1 (the other two datasets are left commented out).
len(dataset1)#,len(dataset2),len(dataset3)

Per prima cosa è importante rimuovere informazioni come EMAIL, numero di telefono, ecc., soprattutto l'email, perché potrebbe essere scambiata per un tag.

In [None]:
def remove_info_from_dataset(dataset):
    """Run ``remove_info`` on the text of every interaction, in place.

    Walks ``dataset[account]['posts'][post_id]['interactions_post']`` and
    overwrites each interaction's 'text'. Accounts without 'posts' and posts
    without 'interactions_post' are skipped. Returns the same ``dataset``.
    """
    for account_data in dataset.values():
        for post in account_data.get('posts', {}).values():
            for comment in post.get('interactions_post', []):
                comment['text'] = remove_info(comment['text'])
    return dataset

In [None]:
# Clean the interaction texts of the three datasets (remove_info_from_dataset mutates and returns its argument).
dataset1 = remove_info_from_dataset(dataset1)
dataset2 = remove_info_from_dataset(dataset2)
dataset3 = remove_info_from_dataset(dataset3)

In [None]:
def set_user(dataset):
    """Collect every interaction author and every mention found in the texts.

    Authors are added exactly as stored in the interaction; mentions are
    whatever ``find_mentions`` returns for the interaction text.
    Returns a set of strings.
    """
    names = set()
    for account_data in dataset.values():
        for post in account_data.get('posts', {}).values():
            for comment in post.get('interactions_post', []):
                names.add(comment['author'])
                names.update(find_mentions(comment['text']))
    return names

In [None]:
# Authors + mentions for each of the three datasets.
set_user_gruppo1 = set_user(dataset1)
set_user_gruppo2 = set_user(dataset2)
set_user_gruppo3 = set_user(dataset3)

In [None]:
# Sizes of the three name sets; the trailing tuple is the output recorded from an earlier run.
len(set_user_gruppo1), len(set_user_gruppo2), len(set_user_gruppo3)  #(524458, 527015, 209555)

In [None]:
# Union of the three name sets (a new set, independent of the per-group sets).
all_set_user = set_user_gruppo1 | set_user_gruppo2 | set_user_gruppo3
len(all_set_user)

In [None]:
# The lists are empty only for privacy reasons: fill in the nicknames of the accounts that were used.
lst_account_famous = [ '@esempio', '@prova', '@sostituisci_con_tuo_valore' ]

lst_account_food = [ ]

lst_account_tips = [ ]

In [None]:
# Number of accounts listed for the "famous" group.
len(lst_account_famous)

In [None]:
# Lower-cased copies of the three account lists.
lst_account_famous_lower = list(map(str.lower, lst_account_famous))
lst_account_food_lower = list(map(str.lower, lst_account_food))
lst_account_tips_lower = list(map(str.lower, lst_account_tips))

In [None]:
# Set versions of each account list, in original and lower-cased spelling.
set_account_famous = set(lst_account_famous)
set_account_famous_lower = set(lst_account_famous_lower)

set_account_food = set(lst_account_food)
set_account_food_lower = set(lst_account_food_lower)

set_account_tips = set(lst_account_tips)
set_account_tips_lower = set(lst_account_tips_lower)

In [None]:
# All account names across the three groups, in original and lower-cased spelling.
all_set_account = set_account_famous| set_account_food | set_account_tips
all_set_account_lower = set_account_famous_lower | set_account_food_lower | set_account_tips_lower

In [None]:
# Sanity check: the lower-cased list has the same length as the original one.
len(lst_account_famous), len(lst_account_famous_lower)

In [None]:
# Sanity check: the lower-cased list has the same length as the original one.
len(lst_account_food), len(lst_account_food_lower)

In [None]:
# Sanity check: the lower-cased list has the same length as the original one.
len(lst_account_tips), len(lst_account_tips_lower)

In [None]:
# Number of distinct account names per group.
len(set_account_famous), len(set_account_food), len(set_account_tips)

In [None]:
# Remove each group's own account names (both spellings) from its user set, in place.
set_user_gruppo1.difference_update(set_account_famous, set_account_famous_lower)

set_user_gruppo2.difference_update(set_account_food, set_account_food_lower)

set_user_gruppo3.difference_update(set_account_tips, set_account_tips_lower)

In [None]:
# Sizes of the per-group user sets after the account names were removed.
len(set_user_gruppo1), len(set_user_gruppo2), len(set_user_gruppo3)

In [None]:
# Size of the combined user set before the account names are removed from it.
len(all_set_user)

In [None]:
# Remove every account name (both spellings) from the combined user set, in place.
all_set_user.difference_update(all_set_account, all_set_account_lower)

In [None]:
# Size of the combined user set after the account names were removed.
len(all_set_user)

In [None]:
# Build a one-column table of the remaining user names, de-duplicated and re-indexed from 0.
df_users_new = (
    pd.DataFrame({'user': list(all_set_user)})
    .drop_duplicates()
    .reset_index(drop=True)
)

In [None]:
# Wrap the user table with anonymizedf and ask it for fake categories of the 'user' column
# (presumably this is what adds the 'Fake_user' column used below — confirm against anonymizedf).
an = anonymize(df_users_new)
fake_df_user = an.fake_categories("user")
# Replace the spaces in the generated names with underscores.
fake_df_user['Fake_user'] = fake_df_user['Fake_user'].str.replace(' ', '_')

In [None]:
# Preview of the real -> fake user table. NOTE: the output contains real user names; clear it before sharing.
fake_df_user.head()

In [None]:
# Drop only the rows whose user name is missing or blank, since those could cause problems downstream.
# The previous `drop([0])` removed whatever user happened to land in row 0 (row order comes from a
# set, so it is arbitrary) and raised a KeyError when the cell was run a second time.
fake_df_user = fake_df_user[fake_df_user['user'].fillna('').astype(str).str.strip() != '']
# Show only a preview: the table pairs real user names with their fake ones.
fake_df_user.head()

In [None]:
# Persist the real -> fake user table (without the index) and read it straight back.
fake_df_user.to_csv('fake_users_for_all_youtube_account_TRAV_B.csv', index=False)
df_fake_user = pd.read_csv('fake_users_for_all_youtube_account_TRAV_B.csv')

In [None]:
# Reload the saved table (same file as the previous cell) and display it.
# NOTE: the output contains real user names; clear it before sharing.
df_fake_user = pd.read_csv('fake_users_for_all_youtube_account_TRAV_B.csv')
df_fake_user

In [None]:
# Original account name (several spellings of the same account) -> anonymous name, "famous" group.
remap_famous = {
    'account famoso di viaggi': 'famous_1',
    "accountfamosodiviaggi" : 'famous_1',
     "account fammoso di viaggi": 'famous_1',
}

In [None]:
# Original account name -> anonymous name, "food" group (empty here).
remap_food = { }

In [None]:
# Original account name -> anonymous name, "tips" group (empty here).
remap_tips = { }


In [None]:
# Number of entries in each remap dictionary.
len(remap_famous),len(remap_food),len(remap_tips)

In [None]:
# Lower-cased copies of the remap dictionaries (both keys and values are lower-cased).
remap_famous_lower = {k.lower(): v.lower() for k, v in remap_famous.items()}
remap_food_lower = {k.lower(): v.lower() for k, v in remap_food.items()}
remap_tips_lower = {k.lower(): v.lower() for k, v in remap_tips.items()}

In [None]:
def rinomina_dataset(dataset, remap):
    """Return a new dict with the account keys of ``dataset`` renamed via ``remap``.

    Each key is looked up in ``remap`` by its lower-cased form; keys that are
    not in ``remap`` are kept unchanged. The values are the same objects as in
    ``dataset``. If two keys end up with the same new name, the later one
    replaces the earlier one.
    """
    return {
        remap.get(name.lower(), name): content
        for name, content in dataset.items()
    }

In [None]:
# Rename the accounts of dataset1 with the "famous" remap and show the resulting keys.
dataset1 = rinomina_dataset(dataset1, remap_famous_lower)
dataset1.keys()

In [None]:
# Rename the accounts of dataset2 with the "food" remap and show the resulting keys.
dataset2 = rinomina_dataset(dataset2, remap_food_lower)
dataset2.keys()

In [None]:
# Rename the accounts of dataset3 with the "tips" remap and show the resulting keys.
dataset3 = rinomina_dataset(dataset3, remap_tips_lower)
dataset3.keys()

### fake account non serve perché è stato già fatto manualmente - serve solo per gli utenti (user_*)

###### food

###### tips

In [None]:
# One table per group pairing each original account name with its anonymous name.
df_account_1 = pd.DataFrame({'account': list(remap_famous.keys()), 'fake_account': list(remap_famous.values())})
df_account_2 = pd.DataFrame({'account': list(remap_food.keys()), 'fake_account': list(remap_food.values())})
df_account_3 = pd.DataFrame({'account': list(remap_tips.keys()), 'fake_account': list(remap_tips.values())})


In [None]:
# Stack the three account tables into one, with a fresh 0..n-1 index.
df_all = pd.concat([df_account_1, df_account_2, df_account_3], ignore_index=True)

In [None]:
# Total number of account -> fake account pairs.
len(df_all)

In [None]:
import re

def replace_mentions_safe(comment, map_user, map_account):
    """Replace every known @mention in ``comment`` with its anonymous name.

    A mention is '@' followed by letters, digits, '_', '.' or '-'. The
    lower-cased name is looked up first without and then with the leading
    '@', because the mapping keys may be stored either way; within each
    spelling ``map_user`` takes precedence over ``map_account``. If the
    whole token is unknown, trailing '.'/'-' characters (typically sentence
    punctuation) are set aside, the lookup is retried, and the punctuation
    is kept after the replacement.

    Args:
        comment: text to rewrite.
        map_user: dict, lower-cased user name -> fake user name.
        map_account: dict, lower-cased account name -> fake account name.

    Returns:
        The text with known mentions replaced; unknown mentions are left as is.
    """
    def lookup(name):
        # Bare spelling first (the historical behaviour), then the '@' spelling.
        for key in (name, "@" + name):
            if key in map_user:
                return map_user[key]
            if key in map_account:
                return map_account[key]
        return None

    def replacer(match):
        mention_raw = match.group(0)    # e.g. "@Some_Name."
        mention_name = match.group(1)   # e.g. "Some_Name."
        mention_lower = mention_name.lower()

        fake = lookup(mention_lower)
        if fake is not None:
            return f"@{fake}"

        # Retry without trailing punctuation captured by the greedy pattern.
        core = mention_lower.rstrip('.-')
        if core and core != mention_lower:
            fake = lookup(core)
            if fake is not None:
                return f"@{fake}" + mention_name[len(core):]

        return mention_raw  # unknown mention: leave untouched

    # '@' + run of allowed characters (a-z, 0-9, underscore, dot, hyphen)
    pattern = r'@([A-Za-z0-9_.\-]+)'
    return re.sub(pattern, replacer, comment)


In [None]:
# Column names of the account mapping table.
df_all.columns

In [None]:
# Column names of the user mapping table.
df_fake_user.columns

In [None]:

# Build the lookup dictionaries, keyed by the lower-cased original name.
# NOTE(review): the original comment says the keys include the leading '@' — confirm against the data,
# because the lookups further down strip the '@' before searching these dictionaries.
map_account = dict(zip(df_all['account'].str.lower(), df_all['fake_account']))
map_user = dict(zip(df_fake_user['user'].str.lower(), df_fake_user['Fake_user']))


In [None]:
# Anonymise dataset1: rename mapped accounts, then rewrite comment authors and mentions.
# Iterate over a snapshot of the keys, because an account may be re-keyed inside the loop
# (changing a dict's keys while iterating over it raises RuntimeError).
for account in list(dataset1):
    data = dataset1[account]

    # Re-key the account inside dataset1 when a fake name is known for it. The previous code
    # moved it into `dataset_generale`, which is not defined anywhere in this notebook.
    fake_account = map_account.get(account.lower())
    if fake_account is not None and fake_account != account:
        dataset1[fake_account] = dataset1.pop(account)
        account = fake_account

    posts = data.get('posts', {})
    for post_id, post_info in posts.items():
        new_interactions = []

        for interaction in post_info.get('interactions_post', []):
            # A missing author becomes an empty name instead of raising AttributeError.
            user = (interaction.get('author') or '').lstrip('@').lower()

            # The mapping keys may or may not carry the leading '@': try the bare name
            # first (historical behaviour), then the '@' spelling.
            for key in (user, '@' + user):
                if key in map_user:
                    user = map_user[key]
                    break
                if key in map_account:
                    user = map_account[key]
                    break

            # Replace the mentions inside the comment text.
            comment = replace_mentions_safe(interaction.get('text', ""), map_user, map_account)

            # Keep every other field of the interaction unchanged.
            updated_interaction = interaction.copy()
            updated_interaction['author'] = user
            updated_interaction['text'] = comment

            new_interactions.append(updated_interaction)

        post_info['interactions_post'] = new_interactions


In [None]:
def transform_emotions(dataset):
    """Convert list-shaped 'emotion' fields into ``{label: score}`` dicts, in place.

    For every comment under ``dataset[*]['posts'][*]['interactions_post']``,
    an 'emotion' value that is a list of ``{'label': ..., 'score': ...}``
    items becomes a single dict. Any other value (or a missing key) is left
    untouched. Returns the same ``dataset``.
    """
    for account in dataset.values():
        for media in account['posts'].values():
            for comment in media['interactions_post']:
                emotions = comment.get('emotion')
                if not isinstance(emotions, list):
                    continue
                comment['emotion'] = dict((item['label'], item['score']) for item in emotions)
    return dataset

# Usage example: convert the emotion lists of dataset1.
dataset1 = transform_emotions(dataset1)


In [None]:
# Write dataset1 to disk as UTF-8 JSON, keeping non-ASCII characters unescaped.
with open('YT_gruppo1_anonymous.json', 'w', encoding='utf-8') as f:
    json.dump(dataset1, f, ensure_ascii=False, indent=4)

In [None]:
# food
# Anonymise dataset2: rename mapped accounts, then rewrite comment authors and mentions.
# Iterate over a snapshot of the keys, because an account may be re-keyed inside the loop
# (changing a dict's keys while iterating over it raises RuntimeError).
for account in list(dataset2):
    data = dataset2[account]

    # Re-key the account inside dataset2 when a fake name is known for it. The previous code
    # moved it into `dataset_generale`, which is not defined anywhere in this notebook.
    fake_account = map_account.get(account.lower())
    if fake_account is not None and fake_account != account:
        dataset2[fake_account] = dataset2.pop(account)
        account = fake_account

    posts = data.get('posts', {})
    for post_id, post_info in posts.items():
        new_interactions = []

        for interaction in post_info.get('interactions_post', []):
            # A missing author becomes an empty name instead of raising AttributeError.
            user = (interaction.get('author') or '').lstrip('@').lower()

            # The mapping keys may or may not carry the leading '@': try the bare name
            # first (historical behaviour), then the '@' spelling.
            for key in (user, '@' + user):
                if key in map_user:
                    user = map_user[key]
                    break
                if key in map_account:
                    user = map_account[key]
                    break

            # Replace the mentions inside the comment text.
            comment = replace_mentions_safe(interaction.get('text', ""), map_user, map_account)

            # Keep every other field of the interaction unchanged.
            updated_interaction = interaction.copy()
            updated_interaction['author'] = user
            updated_interaction['text'] = comment

            new_interactions.append(updated_interaction)

        post_info['interactions_post'] = new_interactions


In [None]:
# Convert the emotion lists of dataset2 into {label: score} dicts.
dataset2 = transform_emotions(dataset2)

In [None]:
# Write dataset2 to disk as UTF-8 JSON, keeping non-ASCII characters unescaped.
with open('YT_gruppo2_anonymous.json', 'w', encoding='utf-8') as f:
    json.dump(dataset2, f, ensure_ascii=False, indent=4)

In [None]:
# Read the file written by the previous cell back into dataset2.
with open('YT_gruppo2_anonymous.json', 'r', encoding='utf-8') as f:
    dataset2 = json.load(f)

In [None]:
# tips
# Anonymise dataset3: rename mapped accounts, then rewrite comment authors and mentions.
# Iterate over a snapshot of the keys, because an account may be re-keyed inside the loop
# (changing a dict's keys while iterating over it raises RuntimeError).
for account in list(dataset3):
    data = dataset3[account]

    # Re-key the account inside dataset3 when a fake name is known for it. The previous code
    # moved it into `dataset_generale`, which is not defined anywhere in this notebook.
    fake_account = map_account.get(account.lower())
    if fake_account is not None and fake_account != account:
        dataset3[fake_account] = dataset3.pop(account)
        account = fake_account

    posts = data.get('posts', {})
    for post_id, post_info in posts.items():
        new_interactions = []

        for interaction in post_info.get('interactions_post', []):
            # A missing author becomes an empty name instead of raising AttributeError.
            user = (interaction.get('author') or '').lstrip('@').lower()

            # The mapping keys may or may not carry the leading '@': try the bare name
            # first (historical behaviour), then the '@' spelling.
            for key in (user, '@' + user):
                if key in map_user:
                    user = map_user[key]
                    break
                if key in map_account:
                    user = map_account[key]
                    break

            # Replace the mentions inside the comment text.
            comment = replace_mentions_safe(interaction.get('text', ""), map_user, map_account)

            # Keep every other field of the interaction unchanged.
            updated_interaction = interaction.copy()
            updated_interaction['author'] = user
            updated_interaction['text'] = comment

            new_interactions.append(updated_interaction)

        post_info['interactions_post'] = new_interactions


In [None]:
# Convert the emotion lists of dataset3 into {label: score} dicts.
dataset3 = transform_emotions(dataset3)

In [None]:
# Write dataset3 to disk as UTF-8 JSON, keeping non-ASCII characters unescaped.
with open('YT_gruppo3_anonymous.json', 'w', encoding='utf-8') as f:
    json.dump(dataset3, f, ensure_ascii=False, indent=4)

In [None]:
import re

# Post-anonymisation check on dataset3: drop every comment whose author, or any mention in
# its text, is not an anonymous identifier (famous_N, food_N, tips_N or user_N).
pattern = r'^(?:famous_|food_|tips_|user_)\d+$'

total_comments = 0
removed_comments = 0
removed_log = []  # one record per removed comment

for account, data in dataset3.items():
    posts = data.get('posts', {})
    for post_id, post_info in posts.items():
        interactions = post_info.get('interactions_post', []) 
        total_comments += len(interactions)

        cleaned_interactions = []

        for interaction in interactions:
            user = interaction['author'].strip()
            comment = interaction['text']
            mentions = find_mentions(comment)  # e.g. ['@user_1', '@name']

            reason = None

            # Check the author
            if not re.match(pattern, user):
                reason = f"user non valido: {user}"

            # Check every mention (only when the author was valid); stop at the first bad one
            elif mentions:
                for mention in mentions:
                    mention_clean = mention.lstrip('@').strip()
                    if not re.match(pattern, mention_clean):
                        reason = f"mention non valida: {mention}"
                        break

            if reason:
                removed_comments += 1
                removed_log.append({
                    "account": account,
                    "post_id": post_id,
                    "user": user,
                    "comment": comment,
                    "motivo": reason
                })
                continue  # skip this comment

            # Everything is valid: keep the interaction
            cleaned_interactions.append(interaction)

        post_info['interactions_post'] = cleaned_interactions

# Final summary
print(f"Totale commenti originali: {total_comments}")
print(f"Commenti rimossi: {removed_comments}")
print(f"Commenti mantenuti: {total_comments - removed_comments}")

# Print the first 5 removed comments as a sample
# NOTE: this prints raw (non-anonymised) authors and texts; clear the output before sharing.
for r in removed_log[:5]:
    print(f"[{r['account']} | Post: {r['post_id']}] -> Utente: {r['user']} | Motivo: {r['motivo']}")
    print(f"Commento: {r['comment']}\n")
# Output recorded from an earlier run:
# Totale commenti originali: 351067
# Commenti rimossi: 1606
# Commenti mantenuti: 349461

In [None]:
# Write dataset_healthy_anony to disk as UTF-8 JSON.
# NOTE(review): `dataset_healthy_anony` is not assigned anywhere above this cell in the notebook
# — presumably it is the cleaned dataset3; confirm, otherwise this fails on a fresh kernel.
with open('youtube_healthy_anonymous_senza_problemi.json', 'w', encoding='utf-8') as f:
    json.dump(dataset_healthy_anony, f, ensure_ascii=False, indent=4)

## devo togliere i nomi dai titoli, gli hashtag dai titoli, dalle descrizioni e dai tag

In [None]:
import json
# Reload the cleaned "healthy" dataset written by the previous cell.
with open('youtube_healthy_anonymous_senza_problemi.json', 'r', encoding='utf-8') as f:
    dataset_healthy_anony = json.load(f)

In [None]:
# NOTE(review): this rebinds lst_account_famous to an empty list, discarding the one defined
# earlier in the notebook — presumably emptied for privacy; confirm.
lst_account_famous = []

extra_names = []

# 1. Strip the leading '@' from every account name, then append the extra names.
all_names = [
    name.lstrip('@')
    for name in (lst_account_famous + lst_account_food + lst_account_tips)
] + extra_names

# 2. De-duplicate.
unique_names = set(all_names)

# 3. Collect each full name plus its tokens ('-', '_' and '.' count as spaces).
separators_to_space = str.maketrans('-_.', '   ')
final_list = set()
for name in unique_names:
    final_list.add(name)
    final_list.update(name.translate(separators_to_space).split())

# 4. Turn the set into a sorted list.
final_list = sorted(final_list)

In [None]:
# '&' is not a name: drop it when present. An unguarded list.remove raises ValueError when
# the value is missing (e.g. with empty account lists, or when this cell is run twice).
if '&' in final_list:
    final_list.remove('&')

In [None]:
# Entries to exclude from the removal list (empty here).
to_remove = []

# Keep only the entries that are not in to_remove, sorted again.
final_list = sorted(item for item in final_list if item not in to_remove)

In [None]:
# Alias: lst_key_remove is the same list object as final_list.
lst_key_remove = final_list

In [None]:
# Set of the names/tokens to remove.
lst_set = set(lst_key_remove)

In [None]:
# Back to a list; element order now follows the set's iteration order, not the earlier sort.
lst_key_remove = list(lst_set)


In [None]:
import re

def clean_title(title, remove_list):
    """Remove the given names from a title, preserving the title's case.

    For every name in ``remove_list``:
    - the name is removed when it closes the title (case-sensitive),
      together with an optional '-', '|' or backslash delimiter before it;
    - the hashtag made of the name without spaces ('#' + name) is removed
      wherever it occurs, ignoring case.

    Finally runs of two or more whitespace characters become one space.
    """
    cleaned = title.strip()
    for name in remove_list:
        # Name at the very end, optionally preceded by a delimiter.
        trailing_name = r'[\-\|\\]?\s*' + re.escape(name) + r'$'
        cleaned = re.sub(trailing_name, '', cleaned).strip()

        # Hashtag form of the name (spaces dropped), any case.
        hashtag = r'#' + re.escape(re.sub(r'\s+', '', name))
        cleaned = re.sub(hashtag, '', cleaned, flags=re.IGNORECASE).strip()

    return re.sub(r'\s{2,}', ' ', cleaned)

In [None]:
import re

def clean_title(title, remove_list):
    """Remove the given names from a title; the result is lower-cased.

    The stripped title and every name are lower-cased first. For each name:
    - the name is removed when it closes the title, together with an
      optional '-', '|' or backslash delimiter before it;
    - the hashtag made of the name without spaces ('#' + name) is removed
      wherever it occurs.

    Finally runs of two or more whitespace characters become one space.
    """
    cleaned = title.strip().lower()

    for name in remove_list:
        lowered_name = name.lower()

        # Name at the very end, optionally preceded by a delimiter.
        trailing_name = r'[\-\|\\]?\s*' + re.escape(lowered_name) + r'$'
        cleaned = re.sub(trailing_name, '', cleaned).strip()

        # Hashtag form of the name (spaces dropped).
        hashtag = r'#' + re.escape(re.sub(r'\s+', '', lowered_name))
        cleaned = re.sub(hashtag, '', cleaned, flags=re.IGNORECASE).strip()

    return re.sub(r'\s{2,}', ' ', cleaned)


In [None]:
# famous: strip the account names from every post title.
for account_posts in dataset_generale_anony.values():
    for post_info in account_posts.values():
        post_info['title'] = clean_title(post_info.get('title', ''), lst_key_remove)

In [None]:
# food: strip the account names from every post title.
for account_posts in dataset_veg_anony.values():
    for post_info in account_posts.values():
        post_info['title'] = clean_title(post_info.get('title', ''), lst_key_remove)

In [None]:
# tips: strip the account names from every post title.
for account_posts in dataset_healthy_anony.values():
    for post_info in account_posts.values():
        post_info['title'] = clean_title(post_info.get('title', ''), lst_key_remove)

In [None]:
# Spot check: cleaned title of post '5' of account 'healthy_1'.
dataset_healthy_anony['healthy_1']['5']['title']

In [None]:
import re

def remove_info(text, remove_list=None):
    """Lower-case ``text`` and strip identifying information from it.

    Removed, in this order: URLs, e-mail addresses, phone-like digit runs,
    address phrases (from an Italian address keyword up to the next ',' or
    '.' or the end of the text), @mentions, and every name in
    ``remove_list``.

    For each name the hashtag form ('#' + name without spaces) is removed
    first, then the name itself wherever it occurs (plain substring match,
    so it also matches inside longer words), then each of its words as a
    whole word. The hashtag goes first because removing a single-word name
    first would turn '#name' into a stray '#'.

    Args:
        text: the text to clean.
        remove_list: optional iterable of names; blank names are ignored.

    Returns:
        The cleaned, lower-cased text with whitespace runs collapsed.
    """
    if remove_list is None:
        remove_list = []

    text = text.lower().strip()

    # 1. URLs
    text = re.sub(r'https?://\S+|www\.\S+|\b(?:[a-zA-Z0-9-]+\.)+[a-zA-Z]{2,}\b/\S*', '', text)

    # 2. E-mail addresses
    text = re.sub(r'\b[\w.-]+?@\w+?\.\w+?\b', '', text)

    # 3. Phone numbers
    text = re.sub(r'(\+?\d{1,3}[\s-]?)?(\(?\d{3,4}\)?[\s-]?)?\d{3,4}[\s-]?\d{3,4}', '', text)

    # 4. Address phrases introduced by a keyword
    address_keywords = ['via', 'viale', 'piazza', 'indirizzo', 'corso', 'strada', 'largo', 'vicolo']
    pattern_address = r'\b(?:' + '|'.join(address_keywords) + r')\b.*?(?=[,.]|$)'
    text = re.sub(pattern_address, '', text)

    # 5. Mentions (@username)
    text = re.sub(r'@\w+', '', text)

    # 6. Names and their hashtags
    for name in remove_list:
        name_clean = name.lower().strip()
        if not name_clean:
            continue

        # Hashtag form first, while the name is still intact in the text.
        hashtag_form = '#' + re.sub(r'\s+', '', name_clean)
        text = text.replace(hashtag_form, '')

        # The whole name as plain text.
        text = text.replace(name_clean, '')

        # Each single word of the name, as a whole word.
        for word in name_clean.split():
            text = re.sub(r'\b' + re.escape(word) + r'\b', '', text)

    # 7. Collapse whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text


In [None]:
# famous — NOTE(review): the label says "famous" but the loop walks dataset3; confirm the intended dataset.
# Clean every post description.
for account_data in dataset3.values():
    for post_info in account_data.get('posts', {}).values():
        post_info['description'] = remove_info(post_info.get('description', ''), lst_key_remove)

In [None]:
# food: clean every post description.
for account_posts in dataset_veg_anony.values():
    for post_info in account_posts.values():
        post_info['description'] = remove_info(post_info.get('description', ''), lst_key_remove)

In [None]:
# tips: clean every post description.
for account_posts in dataset_healthy_anony.values():
    for post_info in account_posts.values():
        post_info['description'] = remove_info(post_info.get('description', ''), lst_key_remove)

In [None]:
def clean_text_list_from_names(text_list, remove_list):
    """Drop from ``text_list`` every string that refers to one of the names.

    Each name in ``remove_list`` is lower-cased and split into words; a
    string is dropped when, for at least one name, all of that name's words
    occur in the lower-cased string (substring match). Names with no words
    are ignored.

    Returns a new list with the surviving strings, in their original order.
    """
    keyword_groups = [
        parts
        for parts in (name.lower().split() for name in remove_list)
        if parts
    ]

    def _mentions_a_name(text):
        lowered = text.lower()
        return any(all(kw in lowered for kw in group) for group in keyword_groups)

    return [text for text in text_list if not _mentions_a_name(text)]


In [None]:
# general: drop the tags that refer to an account name.
for account_posts in dataset_generale_anony.values():
    for post_info in account_posts.values():
        post_info['tags'] = clean_text_list_from_names(post_info.get('tags', ''), lst_key_remove)

In [None]:
# veg: drop the tags that refer to an account name.
for account_posts in dataset_veg_anony.values():
    for post_info in account_posts.values():
        post_info['tags'] = clean_text_list_from_names(post_info.get('tags', ''), lst_key_remove)

In [None]:
# healthy: drop the tags that refer to an account name.
for account_posts in dataset_healthy_anony.values():
    for post_info in account_posts.values():
        post_info['tags'] = clean_text_list_from_names(post_info.get('tags', ''), lst_key_remove)

In [None]:
# Fields of post '0' of account 'healthy_1' before 'post_id' is removed.
dataset_healthy_anony['healthy_1']['0'].keys()

In [None]:
# Drop the 'post_id' field from every post (posts without it are left as they are).
for account_posts in dataset_healthy_anony.values():
    for post_data in account_posts.values():
        post_data.pop('post_id', None)


In [None]:
# Fields of post '0' of account 'healthy_1' after 'post_id' was removed.
dataset_healthy_anony['healthy_1']['0'].keys()

In [None]:
# Write the fully cleaned "healthy" dataset to disk as UTF-8 JSON.
with open('youtube_healthy_anonymous_COMPLETO.json', 'w', encoding='utf-8') as f:
    json.dump(dataset_healthy_anony, f, ensure_ascii=False, indent=4)