# Articles

### Scrape code

In [None]:
# CRITICAL FIX: Use naive datetime for cutoff to avoid comparison errors
from datetime import datetime, timedelta, timezone
import pandas as pd
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from time import sleep
import time
import json

# Cutoff used by the scraper: dates strictly before this value count as
# "before cutoff". It is a naive datetime (no tzinfo) so it can be compared
# with the naive datetimes returned by normalize_datetime().
CUTOFF_DATE = datetime(2025, 10, 1, 0, 0, 0)  # Removed tzinfo=timezone.utc

# Category display name -> Yahoo Finance page URL to scrape for that category.
CATEGORIES = {
    'Stocks': 'https://finance.yahoo.com/markets/stocks/most-active/',
    'Crypto': 'https://finance.yahoo.com/markets/crypto/all/',
    'Currencies': 'https://finance.yahoo.com/markets/currencies/',
    'Private companies': 'https://finance.yahoo.com/markets/private-companies/highest-valuation/',
    'Treasury bond': 'https://finance.yahoo.com/markets/bonds/'
}

def normalize_datetime(dt):
    """Return *dt* with its tzinfo removed (wall-clock time is kept as-is).

    ``None``, strings and any object without a ``replace`` method are
    returned unchanged.
    """
    # Strings must be excluded explicitly: str also has a .replace method,
    # but with a different signature.
    if dt is None or isinstance(dt, str) or not hasattr(dt, 'replace'):
        return dt
    return dt.replace(tzinfo=None)

def parse_date(date_str):
    """Parse an article date string into a naive ``datetime``.

    Accepted layouts (after stripping surrounding whitespace):
      * ``"Mon, October 6, 2025 at 3:00 PM GMT+2"`` (short or full UTC offset)
      * ``"Mon, October 6, 2025 at 3:00 PM"`` (optionally followed by a
        3-4 letter zone abbreviation such as ``EDT``, which is ignored)
      * ``"October 6, 2025"``, ``"Mon, October 6, 2025"``, ``"2025-10-06"``

    Any UTC offset is dropped without shifting the time, so the result keeps
    the wall-clock time exactly as written in the string.

    Args:
        date_str: The text to parse. Non-string values (None, NaN, numbers)
            are treated as missing.

    Returns:
        A naive ``datetime``, or ``None`` when the value is missing or does
        not match any known layout.
    """
    # Anything that is not text cannot be parsed (covers None / NaN / NaT too).
    if not isinstance(date_str, str):
        return None

    date_str = date_str.strip()

    formats = [
        "%a, %B %d, %Y at %I:%M %p GMT%z",
        "%a, %B %d, %Y at %I:%M %p",
        "%B %d, %Y",
        "%a, %B %d, %Y",
        "%Y-%m-%d",
    ]

    # strptime's %z needs a full "+HHMM" offset; expand a short trailing
    # offset such as "GMT+2" into "GMT+0200" before the first attempt.
    fixed_str = re.sub(r'GMT([+-]\d{1,2})$', lambda m: f"GMT{m.group(1).zfill(3)}00", date_str)
    try:
        parsed = datetime.strptime(fixed_str, "%a, %B %d, %Y at %I:%M %p GMT%z")
        return parsed.replace(tzinfo=None)
    except ValueError:
        pass

    # Drop a trailing zone abbreviation ("EDT", "CEST", ...) and try the
    # remaining layouts in order.
    date_str_no_tz = re.sub(r'\s+[A-Z]{3,4}$', '', date_str)

    for fmt in formats:
        try:
            # Always return a naive value, whichever layout matched.
            return datetime.strptime(date_str_no_tz, fmt).replace(tzinfo=None)
        except ValueError:
            continue

    return None

def parse_relative_time(relative_str):
    """Convert a relative time string into an absolute naive ``datetime``.

    Both compact (``'2h ago'``, ``'30m ago'``, ``'1d ago'``) and spelled-out
    (``'2 hours ago'``, ``'3 days ago'``) forms are accepted. The string must
    contain ``'ago'`` and start with ``<number><unit>``. Supported units are
    seconds, minutes, hours, days, weeks, months (approximated as 30 days)
    and years (approximated as 365 days).

    Args:
        relative_str: The relative time text.

    Returns:
        ``datetime.now()`` minus the parsed offset, or ``None`` when the
        input is empty, not a string, or not in a recognised form.
    """
    if not relative_str or not isinstance(relative_str, str):
        return None

    text = relative_str.lower().strip()
    if 'ago' not in text:
        return None

    # Number followed by a unit word, with or without a space in between.
    match = re.match(r'(\d+)\s*([a-z]+)', text)
    if match is None:
        return None

    amount = int(match.group(1))
    unit = match.group(2)

    try:
        # "mo..." must be tested before the bare "m" prefix, otherwise
        # "months" would be read as minutes.
        if unit.startswith('mo'):
            delta = timedelta(days=30 * amount)
        elif unit.startswith('y'):
            delta = timedelta(days=365 * amount)
        elif unit.startswith('h'):
            delta = timedelta(hours=amount)
        elif unit.startswith('m'):
            delta = timedelta(minutes=amount)
        elif unit.startswith('d'):
            delta = timedelta(days=amount)
        elif unit.startswith('w'):
            delta = timedelta(weeks=amount)
        elif unit.startswith('s'):
            delta = timedelta(seconds=amount)
        else:
            return None
        return datetime.now() - delta
    except OverflowError:
        # Absurdly large amounts cannot be represented as a date.
        return None

def check_cutoff_reached(parsed_date):
    """Return True when *parsed_date* is strictly earlier than CUTOFF_DATE.

    A missing date (``None``) never counts as having reached the cutoff.
    """
    if parsed_date is not None:
        # Drop any tzinfo first so the comparison with the naive cutoff works.
        return normalize_datetime(parsed_date) < CUTOFF_DATE
    return False

def handle_cookie_consent(driver):
    """Try to accept the cookie consent dialog on the current page.

    First waits up to 5 seconds for the "accept all" button. If that fails,
    clicks the "scroll-down-btn" element (waiting up to 3 seconds) and then
    retries the accept button with a 3 second wait.

    Args:
        driver: Selenium WebDriver positioned on the page to handle.

    Returns:
        True if the accept button was clicked, False if neither attempt
        succeeded (a message is printed in both cases).
    """
    accept_locator = (By.CSS_SELECTOR, "button[name='agree'][value='agree'].accept-all")

    def _click_accept(timeout):
        """Wait for the accept button, click it and report success."""
        accept_button = WebDriverWait(driver, timeout).until(
            EC.element_to_be_clickable(accept_locator)
        )
        accept_button.click()
        sleep(1)
        print("‚úì Cookie consent accepted!\n")

    # Best-effort: any Selenium failure means "try the next strategy", but
    # KeyboardInterrupt / SystemExit are deliberately not swallowed.
    try:
        _click_accept(5)
        return True
    except Exception:
        pass

    try:
        scroll_button = WebDriverWait(driver, 3).until(
            EC.element_to_be_clickable((By.ID, "scroll-down-btn"))
        )
        scroll_button.click()
        sleep(1)
        _click_accept(3)
        return True
    except Exception:
        print("‚ö† No consent dialog found (may be already accepted)\n")
        return False

def smart_scroll_and_load(driver, max_scroll_attempts=200):
    """Scroll the current page repeatedly and collect article links.

    On every pass the page is scrolled to the bottom, all
    ``section[data-testid="storyitem"]`` elements are inspected, and each new
    article URL is stored together with the relative date text shown next to
    it (or "N/A"). Scrolling stops after ``max_scroll_attempts`` passes, or
    earlier once at least 10 passes in a row found no new article and the
    page height has stopped changing for at least 3 passes.

    Args:
        driver: Selenium WebDriver already positioned on the listing page.
        max_scroll_attempts: Upper bound on the number of scroll passes.

    Returns:
        A list of dicts with keys 'url', 'relative_date' and 'parsed_date'
        (result of parse_relative_time, possibly None), sorted by
        'parsed_date' descending; entries without a parsed date sort last.
    """
    print(f"\nüîÑ Smart scrolling to load articles with dates...")

    last_height = driver.execute_script("return document.body.scrollHeight")
    articles_found = {}  # url -> relative date text ("N/A" when not found)
    scroll_count = 0
    no_new_content_count = 0  # passes in a row that found no new article
    no_height_change_count = 0  # passes in a row with unchanged page height

    while scroll_count < max_scroll_attempts:
        # Jump to the bottom, then scroll a little further, pausing after each step.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        sleep(2)
        driver.execute_script("window.scrollBy(0, 500);")
        sleep(1.5)

        try:
            article_sections = driver.find_elements(By.CSS_SELECTOR, 'section[data-testid="storyitem"]')
            new_articles_count = 0

            for section in article_sections:
                try:
                    url = None
                    # Prefer a "/news/" title link, fall back to a "/m/" link;
                    # sections with neither are skipped.
                    try:
                        link = section.find_element(By.CSS_SELECTOR, 'a.subtle-link.titles[href*="/news/"]')
                        url = link.get_attribute('href')
                    except:
                        try:
                            link = section.find_element(By.CSS_SELECTOR, 'a.subtle-link.titles[href*="/m/"]')
                            url = link.get_attribute('href')
                        except:
                            continue
                    
                    if url and (('/news/' in url) or ('/m/' in url)):
                        last_part = url.split('/')[-1]
                        # Keep only URLs whose last path segment contains ".html" or a digit.
                        if '.html' in last_part or any(c.isdigit() for c in last_part):
                            if url not in articles_found:
                                relative_date = "N/A"
                                try:
                                    publishing_div = section.find_element(By.CSS_SELECTOR, 'div.publishing.yf-m1e6lz')
                                    text = publishing_div.text.strip()
                                    if 'ago' in text.lower():
                                        # The text is split on a separator and the
                                        # second piece is taken as the relative date.
                                        parts = text.split('‚Ä¢')
                                        if len(parts) > 1:
                                            relative_date = parts[1].strip()
                                except:
                                    pass
                                
                                articles_found[url] = relative_date
                                new_articles_count += 1
                except:
                    continue

            if new_articles_count > 0:
                no_new_content_count = 0
                no_height_change_count = 0  # Reset both counters when we find new articles
                print(f"  üìä Scroll {scroll_count + 1}: {len(articles_found)} articles found (+{new_articles_count} new)")
            else:
                no_new_content_count += 1
                print(f"  ‚è∏ Scroll {scroll_count + 1}: No new articles (attempt {no_new_content_count}/10)")

            # Check if we've exhausted scrolling (both no new articles AND page height not changing)
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                no_height_change_count += 1
                print(f"    ‚ö† Page height not changing (attempt {no_height_change_count}/3)")
                
                # Only stop if BOTH conditions are true: no new articles AND no height change
                if no_new_content_count >= 10 and no_height_change_count >= 3:
                    print(f"  ‚ö† No new content AND page not loading more. Stopping.")
                    break
            else:
                no_height_change_count = 0  # Reset if page is still growing
                print(f"    ‚úì Page height increased (new height: {new_height})")

            last_height = new_height
            scroll_count += 1

        except Exception as e:
            # A failed pass still counts towards max_scroll_attempts.
            print(f"  ‚ö† Error during scroll: {e}")
            scroll_count += 1
            continue

    print(f"\nüîÑ Parsing relative times and sorting articles...")
    articles_with_dates = []
    
    for url, relative_str in articles_found.items():
        parsed_date = parse_relative_time(relative_str)
        articles_with_dates.append({
            'url': url,
            'relative_date': relative_str,
            'parsed_date': parsed_date
        })
    
    # Newest first; entries without a parsed date use datetime.min and end up last.
    articles_with_dates.sort(
        key=lambda x: x['parsed_date'] if x['parsed_date'] is not None else datetime.min, 
        reverse=True
    )
    
    print(f"‚úì Sorted articles preview:")
    for i, article in enumerate(articles_with_dates[:3]):
        date_str = article['relative_date'] if article['relative_date'] != "N/A" else 'N/A'
        print(f"  [{i+1}] {date_str}")
    if len(articles_with_dates) > 3:
        print(f"  ... ({len(articles_with_dates) - 3} more articles)")
    
    print(f"\n‚úì Scrolling complete: {len(articles_with_dates)} unique articles found and sorted by date\n")
    return articles_with_dates

def scrape_article_details(driver, article_url):
    """Load one article page and extract its metadata and text.

    Args:
        driver: Selenium WebDriver used to load the page.
        article_url: URL of the article to scrape.

    Returns:
        A dict with keys 'url', 'title', 'author', 'date', 'datetime',
        'parsed_date', 'text', 'article_type' ('video' or 'text'),
        'stock_tickers' and 'cutoff_reached'. Title, author, date and
        datetime are "N/A" when the element is not found. If the parsed
        publication date is before the cutoff, the dict is returned early
        with 'cutoff_reached' set to True and without tickers or text.
        Returns None when the page could not be processed at all.
    """
    try:
        driver.get(article_url)
        sleep(1.5)

        article_data = {
            'url': article_url,
            'title': '',
            'author': '',
            'date': '',
            'datetime': '',
            'parsed_date': None,
            'text': '',
            'article_type': '',
            'stock_tickers': [],
            'cutoff_reached': False
        }

        # --- Title ---
        try:
            title_elem = WebDriverWait(driver, 3).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, 'h1.cover-title'))
            )
            article_data['title'] = title_elem.text.strip()
        except Exception:
            article_data['title'] = "N/A"

        # --- Article type ---
        # Every article is 'text' unless the publishing line marks it as a
        # video; the type is always filled in, also when the publishing line
        # exists but does not mention a video.
        is_video = False
        try:
            publishing_div = driver.find_element(By.CSS_SELECTOR, 'div.publishing.yf-m1e6lz')
            is_video = 'Yahoo Finance Video' in publishing_div.text
        except Exception:
            pass
        article_data['article_type'] = 'video' if is_video else 'text'

        # --- Author: three selectors tried from most to least specific ---
        try:
            author_elem = driver.find_element(By.CSS_SELECTOR, 'div.byline-attr-author a.primary-link')
            article_data['author'] = author_elem.text.strip()
        except Exception:
            try:
                author_elem = driver.find_element(By.CSS_SELECTOR, 'a.primary-link[data-ylk*="author"]')
                article_data['author'] = author_elem.text.strip()
            except Exception:
                try:
                    author_div = driver.find_element(By.CSS_SELECTOR, 'div.byline-attr-author')
                    try:
                        author_link = author_div.find_element(By.TAG_NAME, 'a')
                        article_data['author'] = author_link.text.strip()
                    except Exception:
                        # No link inside the byline: use its text, keeping
                        # only the part before the separator when present.
                        full_text = author_div.text.strip()
                        if '¬∑' in full_text:
                            article_data['author'] = full_text.split('¬∑')[0].strip()
                        else:
                            article_data['author'] = full_text
                except Exception:
                    article_data['author'] = "N/A"

        # --- Publication date and cutoff check ---
        try:
            date_elem = driver.find_element(By.CSS_SELECTOR, 'time.byline-attr-meta-time, time[datetime]')
            article_data['date'] = date_elem.text.strip()
            article_data['datetime'] = date_elem.get_attribute('datetime') or "N/A"

            parsed_date = parse_date(article_data['date'])
            # Normalize to naive datetime for consistency
            article_data['parsed_date'] = normalize_datetime(parsed_date)

            if article_data['parsed_date'] is not None and check_cutoff_reached(article_data['parsed_date']):
                print(f"    ‚õî CUTOFF REACHED! Date: {article_data['parsed_date'].strftime('%d/%m/%Y %H:%M')}")
                article_data['cutoff_reached'] = True
                # Too old: skip the (slow) ticker and text extraction.
                return article_data
        except Exception:
            article_data['date'] = "N/A"
            article_data['datetime'] = "N/A"

        # --- Tickers mentioned on the page ---
        try:
            ticker_elements = driver.find_elements(By.CSS_SELECTOR, 'span.symbol.yf-90gdtp')
            tickers = [ticker.text.strip() for ticker in ticker_elements if ticker.text.strip()]
            article_data['stock_tickers'] = tickers if tickers else []
        except Exception:
            article_data['stock_tickers'] = []

        # --- Body text (transcript for videos, paragraphs otherwise) ---
        if is_video:
            try:
                transcript_button = WebDriverWait(driver, 5).until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, 'button[data-testid="accordionItem"][aria-controls*="acccon"]'))
                )
                if 'transcript' in transcript_button.text.lower():
                    transcript_button.click()
                    sleep(1)
                    transcript_paragraphs = driver.find_elements(By.CSS_SELECTOR, 'div.transcript-content p.type-body-md-reg')
                    if transcript_paragraphs:
                        transcript_parts = [p.text.strip() for p in transcript_paragraphs if p.text.strip()]
                        article_data['text'] = '\n\n'.join(transcript_parts)
            except Exception:
                pass
        else:
            try:
                # Expand the article first when a "read more" button exists.
                try:
                    read_more_button = WebDriverWait(driver, 3).until(
                        EC.element_to_be_clickable((By.CSS_SELECTOR, 'button.readmore-button[data-ylk*="readmore"]'))
                    )
                    read_more_button.click()
                    sleep(1)
                except Exception:
                    pass

                paragraphs = driver.find_elements(By.CSS_SELECTOR, 'div.body[data-testid="article-body"] p.yf-1090901')
                # Paragraphs of 20 characters or fewer are dropped.
                text_parts = [p.text.strip() for p in paragraphs if p.text.strip() and len(p.text.strip()) > 20]

                if text_parts:
                    article_data['text'] = '\n\n'.join(text_parts)
            except Exception:
                pass

        return article_data
    except Exception as e:
        print(f"    ‚úó Error: {e}")
        return None

def scrape_category(driver, category_name, category_url):
    """Scrape the articles listed on one category page.

    Loads the category page, collects article URLs with
    smart_scroll_and_load, then scrapes each article in the returned order.
    Articles dated before CUTOFF_DATE are skipped; scraping stops once 3
    such articles are met in a row, so a single out-of-order old article
    does not end the whole category.

    Args:
        driver: Selenium WebDriver to use.
        category_name: Label stored in the 'category' column.
        category_url: URL of the category listing page.

    Returns:
        A DataFrame with columns category, title, author, date, datetime,
        parsed_date, text, url, stock_tickers; empty when nothing was
        scraped or an error occurred.
    """
    print("\n" + "=" * 80)
    print(f"üìÇ SCRAPING CATEGORY: {category_name}")
    print(f"‚õî CUTOFF DATE: {CUTOFF_DATE.strftime('%d/%m/%Y %H:%M')}")
    print("=" * 80)

    start_time = time.time()

    try:
        print(f"\nüåê Loading: {category_url}")
        driver.get(category_url)
        sleep(1.2)
        handle_cookie_consent(driver)

        articles_data = smart_scroll_and_load(driver)

        if not articles_data:
            print(f"‚ö† No articles found for {category_name}!")
            return pd.DataFrame()

        print(f"‚úì Found {len(articles_data)} articles, starting scraping until cutoff date...\n")
        print(f"üîç Scraping articles:\n")

        all_articles = []
        cutoff_reached = False
        failed_count = 0
        consecutive_before_cutoff = 0

        for idx, article_info in enumerate(articles_data, 1):
            article_url = article_info['url']

            if idx % 10 == 0 or idx == 1:
                elapsed = time.time() - start_time
                print(f"\nüìà Progress: {idx}/{len(articles_data)} | Elapsed: {elapsed:.1f}s | Articles scraped: {len(all_articles)}")

            if not article_url or not isinstance(article_url, str):
                print(f"  ‚ö† Skipping invalid URL at index {idx}")
                continue

            data = scrape_article_details(driver, article_url)

            if data:
                normalized_date = normalize_datetime(data['parsed_date'])
                before_cutoff = data['cutoff_reached'] or (
                    normalized_date is not None and normalized_date < CUTOFF_DATE
                )

                if before_cutoff:
                    # Old article: never kept, and counted towards the
                    # consecutive-old threshold that ends the category.
                    consecutive_before_cutoff += 1
                    if normalized_date is not None:
                        print(f"\n  ‚õî Article #{idx} is before cutoff ({normalized_date.strftime('%d/%m/%Y %H:%M')} < {CUTOFF_DATE.strftime('%d/%m/%Y %H:%M')})")
                    if consecutive_before_cutoff >= 3:
                        print(f"  ‚õî {consecutive_before_cutoff} consecutive articles before cutoff. Stopping scraping.")
                        cutoff_reached = True
                        break
                    sleep(1.2)
                    continue

                consecutive_before_cutoff = 0

                if data['title'] != "N/A":
                    data['category'] = category_name
                    all_articles.append(data)
                    if idx <= 3:
                        date_str = data['parsed_date'].strftime('%d/%m/%Y %H:%M') if data['parsed_date'] is not None else 'N/A'
                        print(f"  ‚úì [{idx}] {data['title'][:50]}... ({date_str})")

                    if len(all_articles) % 20 == 0:
                        current_date = data['parsed_date'].strftime('%d/%m/%Y %H:%M') if data['parsed_date'] is not None else 'N/A'
                        print(f"\n  ‚úÖ [{len(all_articles)} articles] - Most recent: {current_date}")
                else:
                    failed_count += 1

            # Pause between article page loads.
            sleep(1.2)

        if cutoff_reached:
            print(f"\n‚õî CUTOFF REACHED! Stopping scraping for category {category_name}")

        if not all_articles:
            print(f"\n‚ö† No articles extracted for {category_name}")
            return pd.DataFrame()

        df = pd.DataFrame(all_articles)
        expected_cols = ['category', 'title', 'author', 'date', 'datetime', 'parsed_date', 'text', 'url', 'stock_tickers']
        for col in expected_cols:
            if col not in df.columns:
                # One empty list per row: assigning a bare [] to a non-empty
                # frame would fail with a length mismatch.
                df[col] = [[] for _ in range(len(df))] if col == 'stock_tickers' else ""
        df = df[expected_cols]

        total_time = time.time() - start_time
        print(f"\n‚è± Category scraping time: {total_time:.1f} seconds")
        print(f"üìä Articles kept: {len(all_articles)}")
        if failed_count > 0:
            print(f"‚ö† Failed: {failed_count} articles")

        return df

    except Exception as e:
        print(f"\n‚ùå Error scraping category {category_name}: {e}")
        import traceback
        traceback.print_exc()
        return pd.DataFrame()

def scrape_all_categories():
    """Scrape every category in CATEGORIES with a single Chrome session.

    A Chrome WebDriver is created with the options below, each category is
    scraped with scrape_category (a failure in one category is printed and
    the loop moves on), and the per-category frames are concatenated.
    'text_length' and 'word_count' columns are added and the result is
    sorted by 'parsed_date' descending when that sort succeeds.

    Returns:
        The combined DataFrame, or an empty DataFrame when no category
        produced any article. The driver is always quit before returning.
    """
    chrome_options = Options()
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument('--disable-blink-features=AutomationControlled')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--window-size=1920,1080')
    chrome_options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36')
    
    driver = webdriver.Chrome(options=chrome_options)
    driver.set_page_load_timeout(30)
    
    all_categories_data = []  # one non-empty DataFrame per scraped category
    overall_start = time.time()
    
    try:
        print("=" * 80)
        print("üöÄ YAHOO FINANCE SCRAPER - WITH DATE CUTOFF (FIXED)")
        print("=" * 80)
        print(f"\nüìã Categories to scrape: {len(CATEGORIES)}")
        print(f"‚õî CUTOFF DATE: {CUTOFF_DATE.strftime('%d/%m/%Y %H:%M')}\n")
        
        for idx, (category_name, category_url) in enumerate(CATEGORIES.items(), 1):
            print(f"\n{'='*80}")
            print(f"üîÑ CATEGORY {idx}/{len(CATEGORIES)}")
            print(f"{'='*80}")
            
            try:
                category_df = scrape_category(driver, category_name, category_url)
                if not category_df.empty:
                    all_categories_data.append(category_df)
                    print(f"\n‚úÖ {category_name}: {len(category_df)} articles scraped")
                else:
                    print(f"\n‚ö†Ô∏è {category_name}: No articles scraped")
            except Exception as e:
                print(f"\n‚ùå Error: {e}")
                import traceback
                traceback.print_exc()
                continue
            
            # Pause between categories (not after the last one).
            if idx < len(CATEGORIES):
                sleep(3)
        
        if all_categories_data:
            final_df = pd.concat(all_categories_data, ignore_index=True)
            final_df['text_length'] = final_df['text'].apply(len)
            final_df['word_count'] = final_df['text'].apply(lambda x: len(x.split()))
            
            # Sorting is best-effort: on failure the concatenation order is kept.
            try:
                final_df = final_df.sort_values(by='parsed_date', ascending=False).reset_index(drop=True)
            except:
                pass
            
            total_time = time.time() - overall_start
            
            print("\n" + "=" * 80)
            print("‚úÖ SCRAPING COMPLETED!")
            print("=" * 80)
            print(f"‚è± Total time: {total_time:.1f}s ({total_time/60:.1f}m)")
            print(f"üìä Total articles: {len(final_df)}")
            print(f"üìã Articles per category:")
            for cat in CATEGORIES.keys():
                count = len(final_df[final_df['category'] == cat])
                print(f"  ‚Ä¢ {cat}: {count}")
            
            return final_df
        else:
            print("\n‚ö†Ô∏è No data scraped")
            return pd.DataFrame()
    
    finally:
        # Always release the browser, ignoring errors raised while closing it.
        try:
            driver.quit()
        except:
            pass

# Main execution: run the scraper, show a summary and export CSV + JSON.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 200)
pd.set_option('display.max_colwidth', 100)

df_all = scrape_all_categories()

if not df_all.empty:
    print("\n" + "=" * 80)
    print("üìä FINAL RESULTS")
    print("=" * 80)

    print("\nüìã DataFrame Info:")
    df_all.info()
    print("\nüìÑ First 5 Articles:")
    print(df_all[['title', 'category', 'parsed_date']].head())

    # Prepare for export (df_all itself is left untouched)
    df_export = df_all.copy()
    # pd.notna rejects None as well as NaT/NaN, so a missing date is exported
    # as an empty value instead of the literal string "NaT".
    df_export['parsed_date'] = df_export['parsed_date'].apply(
        lambda x: x.isoformat() if pd.notna(x) else None
    )
    # Ticker lists become a comma-separated string ('' when there are none).
    df_export['stock_tickers'] = df_export['stock_tickers'].apply(
        lambda x: ','.join(x) if isinstance(x, list) and x else ''
    )

    # Save CSV
    df_export.to_csv('yahoo_finance_articles.csv', index=False, encoding='utf-8')
    print("\nüíæ Saved to 'yahoo_finance_articles.csv'")

    # Save JSON
    class CustomEncoder(json.JSONEncoder):
        """JSON encoder for values the standard encoder cannot serialize."""

        def default(self, obj):
            # default() is only called for objects json cannot encode itself:
            # dates become ISO strings, anything else falls back to str().
            if isinstance(obj, (pd.Timestamp, datetime)):
                return obj.isoformat()
            return str(obj)

    with open('yahoo_finance_articles.json', 'w', encoding='utf-8') as f:
        json.dump(df_export.to_dict(orient='records'), f, cls=CustomEncoder, indent=2)
    print("üíæ Saved to 'yahoo_finance_articles.json'")
else:
    print("\n‚ùå No data to display")

### Set dataframe

In [None]:
# 1. Count the records whose title appears more than once
#    (keep=False marks every occurrence, not only the repeats)
duplicati = df_all[df_all.duplicated(subset='title', keep=False)]
print(f"Numero di record con titoli duplicati: {len(duplicati)}")

In [None]:
# 2. Show how many unique titles we have
num_unici = df_all['title'].nunique()
print(f"\nNumero di titoli unici: {num_unici}")

# 3. Delete duplicates, keeping the first row for each title
df_all = df_all.drop_duplicates(subset='title', keep='first')

# 4. Final check: number of rows left after removing duplicates
print(f"\nNumero di righe dopo la rimozione duplicati: {len(df_all)}")


In [None]:
# Display the list of column names in df_all
print(df_all.columns.tolist())

In [None]:
# Print the first 3 rows of df_all
print(df_all.head(3))

In [None]:
# Remove the 'datetime' and 'date' columns from df_all ('parsed_date' is kept)
df_all = df_all.drop(columns=['datetime','date'])

In [None]:
# Convert each value to its string form, e.g. a list becomes "['A', 'B']"
df_all['stock_tickers'] = df_all['stock_tickers'].astype(str)

# Strip the list formatting step by step:
#   1) remove square brackets and double quotes
#   2) turn the "', '" separators into plain commas
#   3) remove the remaining single quotes
df_all['stock_tickers'] = df_all['stock_tickers'].str.replace(r'[\[\]"]', '', regex=True)
df_all['stock_tickers'] = df_all['stock_tickers'].str.replace(r"', '", ',', regex=True)
df_all['stock_tickers'] = df_all['stock_tickers'].str.replace("'", '', regex=True)

# Rename column
df_all = df_all.rename(columns={'stock_tickers': 'tickers'})

# Check the result
print(df_all['tickers'].head(5))

In [None]:
# Print the first 3 rows of df_all again to check the changes made so far
print(df_all.head(3))

### Create a copy of 'df_all'

In [None]:
# Create an independent copy of df_all to use for the export
df_export = df_all.copy()

### Remove problematic article and export to csv

In [None]:
import pandas as pd
import re
import csv

# Drop the problematic article, identified by its exact title
df_export = df_export[df_export['title'] != "Huntington (HBAN) Q3 2025 Earnings Call Transcript"]

# Clean every object-dtype column: tabs and line breaks become spaces,
# runs of whitespace collapse to one space, and the ends are trimmed
df_export = df_export.apply(
    lambda x: x.str.replace('\t|\r|\n', ' ', regex=True).str.replace(r'\s+', ' ', regex=True).str.strip() 
    if x.dtype == 'object' else x
)

# Export as a semicolon-separated CSV, quoting every non-numeric field
df_export.to_csv(
    'output.csv', 
    sep=';', 
    index=False, 
    encoding='utf-8',
    quoting=csv.QUOTE_NONNUMERIC
)

### Check code

In [None]:
# =====================================================
# Show the most recent news items of one category
# =====================================================

# Category to inspect (compared case-insensitively) and number of rows to show.
# The printed messages are built from these values so they always describe
# what is actually filtered and displayed.
category_to_check = 'crypto'
top_n = 10

# Keep only the rows of the chosen category. .copy() gives an independent
# frame, so the column assignment below does not write into a slice of
# df_export (which triggers pandas' SettingWithCopyWarning).
df_stocks = df_export[df_export['category'].str.lower() == category_to_check].copy()

# Check whether the category contains any article
if not df_stocks.empty:
    # Convert to datetime if not already done (unparseable values become NaT)
    df_stocks['parsed_date'] = pd.to_datetime(df_stocks['parsed_date'], errors='coerce')

    # Sort by date, newest first
    df_sorted = df_stocks.sort_values(by='parsed_date', ascending=False)

    # Add a formatted column for a more readable display
    df_sorted['formatted_date'] = df_sorted['parsed_date'].dt.strftime("%d %B %Y, %H:%M")

    # Show the first top_n rows (the most recent ones)
    print(f"\nüìà Le {top_n} news pi√π recenti nella categoria '{category_to_check}':")
    print(df_sorted[['formatted_date', 'title', 'category', 'url']].head(top_n))
else:
    print(f"‚ùå Nessuna notizia trovata nella categoria '{category_to_check}'.")

In [None]:
## Show extracted text from first article

import pandas as pd

# Make sure pandas does not truncate long strings
pd.set_option('display.max_colwidth', None)

# Print the title and full text of the first row of df_sorted (head(1));
# df_sorted is defined by the previous cell only when it found articles
for i, row in enumerate(df_sorted.head(1).itertuples(), 1):
    print(f"\nüì∞ Articolo {i}")
    print(f"Titolo: {row.title}")
    print(f"Testo completo:\n{row.text}")


## Text cleaning, lemmatization, vectorization

In [None]:
"""
Complete NLP Text Processing Pipeline
- Text cleaning
- Collocation extraction (POS-based and PMI-based)
- Lemmatization with stopword removal
- Term-Document Matrix creation
"""

# ============================================================================
# REQUIRED LIBRARIES
# ============================================================================
import re
import pandas as pd
import string
import math
import os
from collections import Counter
from tqdm import tqdm
import spacy
import nltk
from nltk.corpus import stopwords as nltk_stopwords
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
import seaborn as sns

# Download the NLTK stopwords corpus only when it is not available yet:
# accessing the word list raises LookupError if the corpus is missing.
try:
    nltk_stopwords.words('english')
except LookupError:
    nltk.download('stopwords')

# ============================================================================
# 1. TEXT CLEANING FUNCTION
# ============================================================================

def clean_text(x=None,
               hashtag=True,
               mention=True,
               numbers=False,
               punctuation=False,
               lowercase=True):
    """Clean a text, or each text of a list, for further NLP processing.

    Steps, in order: remove URLs and retweet/"via" prefixes, blank the listed
    HTML entities, optionally blank punctuation, convert typographic quotes
    to ASCII quotes, blank control and non-ASCII characters, optionally
    remove hashtags, mentions and digits, collapse whitespace, trim, and
    optionally lowercase.

    Args:
        x: A string or a list of strings. Non-string list items are returned
            unchanged in their position.
        hashtag: Remove ``#word`` tokens.
        mention: Remove ``@word`` tokens.
        numbers: Remove all digits.
        punctuation: Replace punctuation with spaces ('#' and '@' are kept).
        lowercase: Lowercase the result.

    Returns:
        The cleaned string when ``x`` is a string, otherwise a list with one
        cleaned entry per input item.

    Raises:
        ValueError: If ``x`` is not a string or list, or a flag is not a bool.
    """
    if x is None or not isinstance(x, (str, list)):
        raise ValueError("Invalid text input - must be string or list of strings")

    return_single = isinstance(x, str)
    if return_single:
        x = [x]

    for arg in (hashtag, mention, numbers, punctuation, lowercase):
        if not isinstance(arg, bool):
            raise ValueError("All flags must be boolean (True/False)")

    html_symbols = [
        "&copy;", "&reg;", "&trade;", "&ldquo;", "&lsquo;", "&rsquo;", "&bull;",
        "&middot;", "&sdot;", "&ndash;", "&mdash;", "&cent;", "&pound;", "&euro;",
        "&ne;", "&frac12;", "&frac14;", "&frac34;", "&deg;", "&larr;", "&rarr;",
        "&hellip;", "&nbsp;", "&lt;", "&gt;", "&amp;", "&quot;"
    ]
    html_symbols_re = re.compile("|".join(re.escape(sym) for sym in html_symbols))

    # Typographic quotes -> ASCII quotes. This must run before the non-ASCII
    # strip below, otherwise "don’t" would turn into "don t".
    quote_table = str.maketrans({
        "\u201c": '"',   # left double quotation mark
        "\u201d": '"',   # right double quotation mark
        "\u201e": '"',   # double low-9 quotation mark
        "\u2018": "'",   # left single quotation mark
        "\u2019": "'",   # right single quotation mark / apostrophe
    })

    cleaned_texts = []

    for text in x:
        if not isinstance(text, str):
            # Pass non-string items through untouched.
            cleaned_texts.append(text)
            continue

        xtxt = text
        # URLs (http, https, ftp)
        xtxt = re.sub(r"(f|ht)(tp)(s?)(://)(.\S+)[.|/](.\S+)", " ", xtxt)
        # Retweet / "via" prefixes followed by one or more @handles
        xtxt = re.sub(r"(RT|via)((?:\b\W*@\w+)+)", " ", xtxt)
        xtxt = re.sub(r"(rt|via)((?:\b\W*@\w+)+)", " ", xtxt)
        xtxt = html_symbols_re.sub(" ", xtxt)

        if punctuation:
            # Punctuation becomes a space; '#' and '@' are kept (preceded by
            # a space) so the hashtag/mention steps below can still see them.
            xtxt = re.sub(r"([#@])|[^\w\s]", r" \1", xtxt)

        xtxt = xtxt.translate(quote_table)

        # Control characters, then anything outside printable ASCII
        xtxt = re.sub(r'[\x00-\x1F\x7F]', ' ', xtxt)
        xtxt = re.sub(r'[^\x20-\x7E]', ' ', xtxt)

        if hashtag:
            xtxt = re.sub(r"#\S+", " ", xtxt)

        if mention:
            xtxt = re.sub(r"@\S+", " ", xtxt)

        if numbers:
            xtxt = re.sub(r"[0-9]", "", xtxt)

        # Collapse every whitespace run into a single space and trim.
        xtxt = re.sub(r"\s+", " ", xtxt).strip()

        if lowercase:
            xtxt = xtxt.lower()

        cleaned_texts.append(xtxt)

    return cleaned_texts[0] if return_single else cleaned_texts


# ============================================================================
# 2. SPACY UTILITIES & COLLOCATIONS
# ============================================================================

def load_spacy_model(model_name):
    """Load and return the spaCy pipeline called *model_name*.

    Args:
        model_name: Name passed to ``spacy.load``.

    Returns:
        The object returned by ``spacy.load(model_name)``.

    Raises:
        FileNotFoundError: If ``spacy.load`` raises ``OSError``; the message
            includes the download command and the original error is kept as
            the exception's cause.
    """
    try:
        return spacy.load(model_name)
    except OSError as err:
        # Chain the original error so the real reason stays in the traceback.
        raise FileNotFoundError(f"Cannot load spaCy model: {model_name}. "
                                f"Run: python -m spacy download {model_name}") from err


def annotate_text(nlp, text):
    """Run *nlp* on one text and describe every token as a dict.

    A falsy *text* (``None`` or ``""``) is processed as the empty string.

    Returns:
        A list with one ``{'form', 'lemma', 'upos'}`` dict per token, taken
        from the token's ``text``, ``lemma_`` and ``pos_`` attributes.
    """
    return [
        {'form': tok.text, 'lemma': tok.lemma_, 'upos': tok.pos_}
        for tok in nlp(text or "")
    ]


def annotate_texts(nlp, texts, show_progress=True):
    """Annotate every text in *texts* and return the results as a list.

    Missing values (as judged by ``pd.notna``) are annotated as the empty
    string; every other value is converted with ``str`` first. With
    ``show_progress`` the iteration is wrapped in a tqdm progress bar.
    """
    source = tqdm(texts, desc="Annotating") if show_progress else texts
    return [
        annotate_text(nlp, str(item) if pd.notna(item) else "")
        for item in source
    ]


def my_collocations_POS(texts, nlp, pos_patterns=(('ADJ','NOUN'), ('NOUN','NOUN'),
                                                  ('NOUN','PROPN'), ('PROPN','PROPN')),
                        min_freq=2, xlsx_save=True, xlsx_name="colloc_POS.xlsx", verbose=True):
    """Extract adjacent-token collocations whose POS pair is in ``pos_patterns``.

    Parameters
    ----------
    texts : iterable
        Documents to annotate (passed to ``annotate_texts``).
    nlp : callable
        spaCy pipeline used for annotation.
    pos_patterns : iterable of 2-item sequences
        Accepted ``(upos_first, upos_second)`` pairs. Lists and tuples are
        both accepted.
    min_freq : int
        Minimum count a collocation needs to be kept.
    xlsx_save : bool
        When true, write the result to ``xlsx_name`` with ``DataFrame.to_excel``.
    xlsx_name : str
        Output path for the Excel file.
    verbose : bool
        Print progress messages and show the annotation progress bar.

    Returns
    -------
    pandas.DataFrame
        Columns ``collocation``, ``freq``, ``pos_pattern``, sorted by
        descending frequency then collocation text. Empty (same columns) when
        nothing reaches ``min_freq``.
    """
    # Normalise once to a set of tuples: O(1) membership tests, and patterns
    # supplied as lists (e.g. loaded from JSON) still match the tuple key below.
    wanted_patterns = {tuple(pattern) for pattern in pos_patterns}

    if verbose:
        print("Annotating texts for POS collocations...")
    annotated = annotate_texts(nlp, texts, show_progress=verbose)
    counts = Counter()
    pattern_map = {}

    if verbose:
        print("Counting adjacent bigrams filtered by POS...")
    for doc in annotated:
        for t1, t2 in zip(doc, doc[1:]):
            p = (t1['upos'], t2['upos'])
            if p not in wanted_patterns:
                continue
            # Fall back to the surface form when the lemma is empty or the '_' placeholder.
            w1 = t1['lemma'] if t1['lemma'] and t1['lemma'] != '_' else t1['form']
            w2 = t2['lemma'] if t2['lemma'] and t2['lemma'] != '_' else t2['form']
            coll = f"{w1.lower()} {w2.lower()}"
            counts[coll] += 1
            # If the same word pair occurs under several POS patterns, the last one seen wins.
            pattern_map[coll] = f"{p[0]} {p[1]}"

    rows = [{'collocation': c, 'freq': f, 'pos_pattern': pattern_map.get(c, '')}
            for c, f in counts.items() if f >= min_freq]

    if not rows:
        print(f"‚ö† Warning: No collocations found with min_freq={min_freq}")
        df = pd.DataFrame(columns=['collocation', 'freq', 'pos_pattern'])
    else:
        df = pd.DataFrame(rows).sort_values(['freq','collocation'], ascending=[False,True]).reset_index(drop=True)

    if xlsx_save:
        df.to_excel(xlsx_name, index=False)
        if verbose:
            print(f"Saved {xlsx_name} ({len(df)} rows)")

    return df


def my_collocations(texts, nlp, nn=200, sort_ord="pmi", min_freq=2,
                    xlsx_save=True, xlsx_name="colloc_PMI.xlsx", verbose=True):
    """Extract adjacent-lemma collocations scored by pointwise mutual information.

    Parameters
    ----------
    texts : iterable
        Documents to annotate (passed to ``annotate_texts``).
    nlp : callable
        spaCy pipeline used for annotation.
    nn : int or None
        Keep only the first ``nn`` rows after sorting; ``None`` keeps all rows.
    sort_ord : str
        ``"pmi"`` / ``"pmi_desc"`` (case-insensitive) sorts by PMI then
        frequency; any other value sorts by frequency then PMI. Both descending.
    min_freq : int
        Minimum bigram count needed to be scored.
    xlsx_save : bool
        When true, write the result to ``xlsx_name`` with ``DataFrame.to_excel``.
    xlsx_name : str
        Output path for the Excel file.
    verbose : bool
        Print progress messages and show the annotation progress bar.

    Returns
    -------
    pandas.DataFrame
        Columns ``collocation``, ``freq``, ``pmi``.
    """
    if verbose:
        print("Annotating texts for PMI collocations...")
    annotated = annotate_texts(nlp, texts, show_progress=verbose)
    unigram = Counter()
    # Bigrams are keyed by (w1, w2) tuples. Joining into a string and splitting
    # on the first space again breaks when a lemma is or contains whitespace.
    bigram = Counter()

    for doc in annotated:
        lemmas = []
        for tok in doc:
            if tok['upos'] == 'PUNCT':
                continue
            # Fall back to the surface form when the lemma is empty or the '_' placeholder.
            lemma = tok['lemma'] if tok['lemma'] and tok['lemma'] != '_' else tok['form']
            lemmas.append(lemma.lower())
        unigram.update(lemmas)
        bigram.update(zip(lemmas, lemmas[1:]))

    total_unigrams = sum(unigram.values())
    N = total_unigrams if total_unigrams > 0 else 1
    rows = []

    for (w1, w2), freq in bigram.items():
        if freq < min_freq:
            continue
        p_w1 = unigram[w1] / N
        p_w2 = unigram[w2] / N
        # The bigram probability uses N-1 as the denominator (kept from the original).
        p_w1w2 = freq / max(1, N-1)
        # Every counted bigram has both members in `unigram`, so all three
        # probabilities are strictly positive and the log is always defined.
        pmi = math.log2(p_w1w2 / (p_w1 * p_w2))
        rows.append({'collocation': f"{w1} {w2}", 'freq': freq, 'pmi': pmi})

    if not rows:
        print(f"‚ö† Warning: No collocations found with min_freq={min_freq}")
        df = pd.DataFrame(columns=['collocation', 'freq', 'pmi'])
    else:
        df = pd.DataFrame(rows)

        if sort_ord.lower() in ['pmi', 'pmi_desc']:
            df = df.sort_values(['pmi', 'freq'], ascending=[False, False])
        else:
            df = df.sort_values(['freq', 'pmi'], ascending=[False, False])

    if nn is not None:
        df = df.head(nn).reset_index(drop=True)

    if xlsx_save:
        df.to_excel(xlsx_name, index=False)
        if verbose:
            print(f"Saved {xlsx_name} ({len(df)} rows)")

    return df


def corMultWord_xlsx(texts, xlsx_file, verbose=True):
    """Join known multi-word collocations in ``texts`` with underscores.

    The collocations are read from the ``collocation`` column of
    ``xlsx_file``. They are applied one after another, those with the most
    words first, matched case-insensitively on word boundaries with any
    whitespace run between the words. Missing values pass through unchanged.

    Returns a ``pandas.Series`` of the processed texts. If the file does not
    exist or lists no collocations, ``texts`` is returned wrapped in a Series.
    Raises ``ValueError`` when the file lacks a ``collocation`` column.
    """
    if not os.path.exists(xlsx_file):
        if verbose:
            print(f"‚ö† Warning: File not found: {xlsx_file}. Skipping replacement.")
        return pd.Series(texts, index=None)

    table = pd.read_excel(xlsx_file, engine='openpyxl')
    if 'collocation' not in table.columns:
        raise ValueError("Excel file missing 'collocation' column")

    distinct = table['collocation'].dropna().astype(str).unique()
    colls = sorted(distinct, key=lambda s: len(s.split()), reverse=True)

    if not colls:
        if verbose:
            print(f"‚ö† No collocations found in {xlsx_file}")
        return pd.Series(texts, index=None)

    # One (compiled regex, underscore-joined replacement) pair per collocation.
    substitutions = []
    for phrase in colls:
        words = phrase.split()
        escaped = r'\s+'.join(re.escape(word) for word in words)
        regex = re.compile(rf'\b{escaped}\b', flags=re.IGNORECASE)
        substitutions.append((regex, "_".join(words)))

    def _rewrite(value):
        # Leave missing values exactly as they came in.
        if pd.isna(value):
            return value
        result = str(value)
        for regex, joined in substitutions:
            result = regex.sub(joined, result)
        return result

    if verbose:
        iterator = tqdm(texts, desc=f"Applying {os.path.basename(xlsx_file)}")
    else:
        iterator = texts

    return pd.Series([_rewrite(item) for item in iterator], index=None)


def apply_pipeline_on_df(df_export, text_col='text_cleaned', verbose=True):
    """Run the four-step collocation pipeline on ``df_export[text_col]``.

    Steps: extract POS-based collocations, substitute them in the text,
    extract PMI-based collocations from the substituted text, substitute
    those too. ``df_export`` is modified in place (its ``text_col`` column is
    overwritten) and also returned, together with the two collocation tables.

    Raises ``KeyError`` when ``text_col`` is not a column of ``df_export``.
    """
    def _log(message):
        # Progress output is entirely optional.
        if verbose:
            print(message)

    if text_col not in df_export.columns:
        raise KeyError(f"Column {text_col} not found in df_export")

    _log("Loading spaCy model...")
    nlp = load_spacy_model("en_core_web_sm")

    _log("Step 1: POS-based collocations")
    out_pos = my_collocations_POS(
        df_export[text_col], nlp, min_freq=1,
        xlsx_save=True, xlsx_name="colloc_POS.xlsx", verbose=verbose,
    )

    _log("Step 2: Applying POS substitutions")
    pos_replaced = corMultWord_xlsx(df_export[text_col], xlsx_file="colloc_POS.xlsx", verbose=verbose)
    df_export[text_col] = pos_replaced.values

    _log("Step 3: PMI-based collocations")
    out_pmi = my_collocations(
        df_export[text_col], nlp, nn=200, sort_ord="pmi", min_freq=2,
        xlsx_save=True, xlsx_name="colloc_PMI.xlsx", verbose=verbose,
    )

    _log("Step 4: Applying PMI substitutions")
    pmi_replaced = corMultWord_xlsx(df_export[text_col], xlsx_file="colloc_PMI.xlsx", verbose=verbose)
    df_export[text_col] = pmi_replaced.values

    if verbose:
        print("Pipeline complete.")
        print(f"POS collocations saved: colloc_POS.xlsx ({len(out_pos)} rows)")
        print(f"PMI collocations saved: colloc_PMI.xlsx ({len(out_pmi)} rows)")

    return df_export, out_pos, out_pmi


def check_collocation_files(xlsx_pos="colloc_POS.xlsx", xlsx_pmi="colloc_PMI.xlsx"):
    """Load both collocation workbooks and print summary statistics for each.

    Returns ``(df_pos, df_pmi)``. An entry is ``None`` when the corresponding
    file raised ``FileNotFoundError`` on load.
    """
    banner = "=" * 80
    print(banner)
    print("COLLOCATION FILES VERIFICATION")
    print(banner)

    # POS-based workbook
    pos_table = None
    try:
        pos_table = pd.read_excel(xlsx_pos)
        print(f"\n‚úì POS-based file loaded: {xlsx_pos}")
        print(f"  - Number of collocations: {len(pos_table)}")
        print(f"  - Columns: {list(pos_table.columns)}")
        pos_freq = pos_table['freq']
        print(f"  - Average frequency: {pos_freq.mean():.2f}")
        print(f"  - Max frequency: {pos_freq.max()}")
        print(f"  - Min frequency: {pos_freq.min()}")
        print("\n  Top 10 POS collocations:")
        print(pos_table.head(10)[['collocation', 'freq', 'pos_pattern']].to_string(index=False))
        print("\n  POS pattern distribution:")
        print(pos_table['pos_pattern'].value_counts())
    except FileNotFoundError:
        print(f"\n‚úó File not found: {xlsx_pos}")
        pos_table = None

    # PMI-based workbook
    pmi_table = None
    try:
        pmi_table = pd.read_excel(xlsx_pmi)
        print(f"\n‚úì PMI-based file loaded: {xlsx_pmi}")
        print(f"  - Number of collocations: {len(pmi_table)}")
        print(f"  - Columns: {list(pmi_table.columns)}")
        pmi_scores = pmi_table['pmi']
        print(f"  - Average PMI: {pmi_scores.mean():.2f}")
        print(f"  - Max PMI: {pmi_scores.max():.2f}")
        print(f"  - Min PMI: {pmi_scores.min():.2f}")
        print("\n  Top 10 PMI collocations:")
        print(pmi_table.head(10)[['collocation', 'freq', 'pmi']].to_string(index=False))
    except FileNotFoundError:
        print(f"\n‚úó File not found: {xlsx_pmi}")
        pmi_table = None

    return pos_table, pmi_table


# ============================================================================
# 3. LEMMATIZATION WITH STOPWORDS
# ============================================================================

def lemmatize_spacy_en(x, model, stopwords_list=None, doc_id=None, verbose=False):
    """Tokenise and lemmatise documents, flagging stopwords.

    Parameters
    ----------
    x : sequence
        Documents; missing values (per ``pd.notna``) are processed as ``""``.
    model : callable
        spaCy pipeline applied to each document.
    stopwords_list : iterable of str, optional
        Stopwords, compared case-insensitively against both the token text
        and its lemma. Defaults to the NLTK English list.
    doc_id : sequence, optional
        One identifier per document; defaults to ``docid0``, ``docid1``, ...
    verbose : bool
        Show a tqdm progress bar over the documents.

    Returns
    -------
    pandas.DataFrame
        One row per non-punctuation, non-space token with columns ``doc_id``,
        ``token_id``, ``token``, ``lemma``, ``upos``, ``dep``,
        ``head_token_id`` and ``STOP``. Token ids are 1-based positions in the
        parsed document. The columns are present even when there are no rows.

    Raises
    ------
    ValueError
        If ``x`` or ``model`` is ``None``, or ``doc_id`` and ``x`` differ in length.
    """
    if x is None:
        raise ValueError("missing text (x is None)")
    if model is None:
        raise ValueError("missing language model (model is None)")

    if stopwords_list is None:
        stopwords_list = nltk_stopwords.words('english')

    # A set gives O(1) membership tests; this lookup runs twice per token.
    stopwords_lower = {w.lower() for w in stopwords_list}

    if doc_id is None:
        doc_id = [f"docid{i}" for i in range(len(x))]

    if len(doc_id) != len(x):
        raise ValueError("doc_id length must match length(x)")

    columns = ['doc_id', 'token_id', 'token', 'lemma', 'upos', 'dep',
               'head_token_id', 'STOP']

    pairs = zip(x, doc_id)
    if verbose:
        # Pass the total explicitly instead of materialising the zip into a list.
        pairs = tqdm(pairs, total=len(x), desc="Lemmatizing documents")

    results = []
    for text, d_id in pairs:
        doc = model(str(text) if pd.notna(text) else "")

        for token_idx, token in enumerate(doc):
            if token.is_punct or token.is_space:
                continue

            token_text = token.text
            lemma_text = token.lemma_

            is_stopword = (token_text.lower() in stopwords_lower or
                           lemma_text.lower() in stopwords_lower)

            results.append({
                'doc_id': d_id,
                'token_id': token_idx + 1,
                'token': token_text,
                'lemma': lemma_text,
                'upos': token.pos_,
                'dep': token.dep_,
                'head_token_id': token.head.i + 1,
                'STOP': is_stopword
            })

    # Passing the column list keeps the schema stable when no token was
    # extracted, so callers can still index result_df['doc_id'] and friends.
    result_df = pd.DataFrame(results, columns=columns)

    if result_df.empty:
        print("Warning: No tokens extracted!")
        return result_df

    print(f"‚úì Lemmatization complete: {len(result_df)} tokens from {len(doc_id)} documents")
    return result_df


def apply_lemmatization_to_df(df_export, text_column='text_cleaned', verbose=True):
    """Lemmatise ``df_export[text_column]`` and return the token-level table.

    Documents are labelled ``doc_0``, ``doc_1``, ... in row order; missing
    texts are replaced by ``""``. Uses the ``en_core_web_sm`` spaCy model and
    the NLTK English stopword list.

    Raises ``KeyError`` when ``text_column`` is not a column of ``df_export``.
    """
    if text_column not in df_export.columns:
        raise KeyError(f"Column '{text_column}' not found")

    def _log(message):
        if verbose:
            print(message)

    _log("Loading spaCy model...")
    nlp = load_spacy_model("en_core_web_sm")

    _log("Loading NLTK stopwords...")
    stopwords_list = list(nltk_stopwords.words('english'))

    n_docs = len(df_export)
    doc_ids = [f"doc_{position}" for position in range(n_docs)]
    texts = df_export[text_column].fillna("").astype(str).tolist()

    _log(f"Lemmatizing {len(texts)} documents...")

    return lemmatize_spacy_en(
        x=texts,
        model=nlp,
        stopwords_list=stopwords_list,
        doc_id=doc_ids,
        verbose=verbose,
    )

# ============================================================================
# 4. TERM-DOCUMENT MATRIX
# ============================================================================

def create_term_document_matrix(texts, remove_punctuation=False, min_word_length=1, lowercase=True):
    """
    Build a term-by-document count matrix with scikit-learn's CountVectorizer.

    Parameters:
    -----------
    texts : list
        List of text documents
    remove_punctuation : bool
        If True, only word tokens are kept. If False (default), every single
        non-word, non-space character is also kept as its own token.
    min_word_length : int
        Minimum length of a word token (default: 1). Applied in both modes;
        values below 1 are treated as 1.
    lowercase : bool
        Convert tokens to lowercase (default: True)

    Returns:
    --------
    tuple
        (dtm_df: DataFrame with terms as rows and ``doc_<i>`` columns,
         vectorizer: the fitted CountVectorizer)
    """

    print("\n" + "=" * 80)
    print("CREATING DOCUMENT-TERM MATRIX (DTM)")
    print("=" * 80)

    # The word part of the pattern honours min_word_length in both modes.
    # Previously the parameter was ignored when punctuation was kept.
    min_len = max(1, int(min_word_length))
    word_pattern = r'\b\w{' + str(min_len) + r',}\b'
    if remove_punctuation:
        token_pattern = word_pattern
    else:
        # Additionally keep each punctuation character as a separate token.
        token_pattern = r'(?u)' + word_pattern + r'|[^\w\s]'

    vectorizer = CountVectorizer(
        lowercase=lowercase,
        token_pattern=token_pattern,
        analyzer='word',
        ngram_range=(1, 1),  # Only unigrams
        min_df=1,  # Include all terms
        max_df=1.0
    )

    # fit_transform returns a documents x terms sparse matrix.
    dtm_sparse = vectorizer.fit_transform(texts)

    # Transpose so that rows are terms and columns are documents.
    dtm_df = pd.DataFrame(
        dtm_sparse.toarray().T,
        index=vectorizer.get_feature_names_out(),
        columns=[f'doc_{i}' for i in range(dtm_sparse.shape[0])]
    )

    # Sparsity is the share of ZERO cells. The original printed nnz / cells,
    # which is the density, under the "Sparsity" label.
    total_cells = dtm_sparse.shape[0] * dtm_sparse.shape[1]
    density = dtm_sparse.nnz / total_cells if total_cells else 0.0
    sparsity_pct = (1.0 - density) * 100

    print(f"\nDTM Shape: {dtm_df.shape} (terms √ó documents)")
    print(f"  ‚Ä¢ Total unique terms (vocabulary): {dtm_df.shape[0]}")
    print(f"  ‚Ä¢ Total documents: {dtm_df.shape[1]}")
    print(f"  ‚Ä¢ Sparsity: {sparsity_pct:.2f}%")

    print(f"\nFirst 10 terms (rows) √ó all documents (columns):")
    print(dtm_df.head(10))

    print(f"\nTerm frequency statistics:")
    term_freq = dtm_df.sum(axis=1)
    print(f"  ‚Ä¢ Mean term frequency: {term_freq.mean():.2f}")
    print(f"  ‚Ä¢ Median term frequency: {term_freq.median():.2f}")
    print(f"  ‚Ä¢ Max term frequency: {term_freq.max():.0f}")
    print(f"  ‚Ä¢ Min term frequency: {term_freq.min():.0f}")

    return dtm_df, vectorizer


# ============================================================================
# MAIN EXECUTION
# ============================================================================

if __name__ == "__main__":
    # End-to-end driver: load the scraped CSV, clean the text, extract and
    # apply collocations, lemmatise, strip stopwords, build the DTM, then
    # print a summary of everything produced.

    print("=" * 80)
    print("NLP TEXT PROCESSING PIPELINE")
    print("=" * 80)

    df_export = pd.read_csv('output.csv', sep=';')

    print(f"\nLoaded {len(df_export)} documents from df_export['text']")

    # ---- STEP 1: TEXT CLEANING ----
    print("\n" + "=" * 80)
    print("STEP 1: TEXT CLEANING")
    print("=" * 80)

    df_export['text_cleaned'] = df_export['text'].apply(
        lambda x: clean_text(x, hashtag=True, mention=True, numbers=False,
                             punctuation=False, lowercase=True)
    )

    print(f"\n‚úì Cleaned {len(df_export)} documents")
    print(f"Average text length before: {df_export['text'].str.len().mean():.0f} chars")
    print(f"Average text length after: {df_export['text_cleaned'].str.len().mean():.0f} chars")

    # Show first 3 examples
    print("\nFirst 3 Before/After Examples:")
    for i, row in df_export.head(3).iterrows():
        print(f"\nüìÑ ROW {i} BEFORE:\n{row['text'][:100]}...")
        print(f"‚ú® AFTER:\n{row['text_cleaned'][:100]}...")

    # ---- STEP 2: COLLOCATIONS ----
    print("\n" + "=" * 80)
    print("STEP 2: EXTRACTING COLLOCATIONS")
    print("=" * 80)

    df_export, out_pos, out_pmi = apply_pipeline_on_df(df_export, text_col='text_cleaned')

    # Verify generated files
    check_collocation_files("colloc_POS.xlsx", "colloc_PMI.xlsx")

    # ---- STEP 3: LEMMATIZATION ----
    print("\n" + "=" * 80)
    print("STEP 3: LEMMATIZATION WITH STOPWORD REMOVAL")
    print("=" * 80)

    lemmatized_tokens = apply_lemmatization_to_df(df_export, text_column='text_cleaned')

    print(f"\nTotal tokens extracted: {len(lemmatized_tokens)}")
    print(f"Unique documents: {lemmatized_tokens['doc_id'].nunique()}")
    print(f"Stopwords found: {lemmatized_tokens['STOP'].sum()}")
    print(f"Content words: {(~lemmatized_tokens['STOP']).sum()}")

    first_doc = lemmatized_tokens['doc_id'].iloc[0]
    sample_tokens = lemmatized_tokens[lemmatized_tokens['doc_id'] == first_doc].head(15)

    print(f"\nSample tokens from {first_doc}:")
    print(sample_tokens[['token', 'lemma', 'upos', 'STOP']].to_string(index=False))

    print("\nMost common content words:")
    print(lemmatized_tokens[~lemmatized_tokens['STOP']]['lemma'].value_counts().head(10))

    lemmatized_tokens.to_csv('lemmatized_tokens.csv', index=False, encoding='utf-8')
    print(f"\nüíæ Lemmatized tokens saved to 'lemmatized_tokens.csv'")

    # ---- STEP 3.5: CREATE text_nostop COLUMN (NO STOPWORDS) ----
    print("\n" + "=" * 80)
    print("STEP 3.5: CREATING text_nostop COLUMN (STOPWORDS REMOVED)")
    print("=" * 80)

    def reconstruct_text_without_stopwords(doc_id, lemmatized_df):
        """Rebuild one document's text from its non-stopword lemmas only.

        Kept for single-document lookups. It filters the whole token table on
        each call, so the column below is built with one groupby pass instead.
        """
        content_lemmas = lemmatized_df[
            (lemmatized_df['doc_id'] == doc_id) &
            (~lemmatized_df['STOP'])
        ]['lemma'].tolist()
        return ' '.join(content_lemmas)

    # Build the text_nostop column: group the content lemmas by document once
    # (groupby keeps the original row order inside each group) rather than
    # re-scanning every token for every document.
    print("Building text_nostop column...")
    content_rows = lemmatized_tokens[~lemmatized_tokens['STOP']]
    nostop_by_doc = (
        content_rows.groupby('doc_id', sort=False)['lemma'].agg(' '.join).to_dict()
    )
    # Documents with no content lemmas (or no tokens at all) get an empty string.
    df_export['text_nostop'] = [
        nostop_by_doc.get(f'doc_{i}', '')
        for i in range(len(df_export))
    ]

    print(f"\n‚úì Created 'text_nostop' column in df_export")

    # Statistics
    avg_len_with = df_export['text_cleaned'].str.split().str.len().mean()
    avg_len_without = df_export['text_nostop'].str.split().str.len().mean()
    reduction = ((avg_len_with - avg_len_without) / avg_len_with * 100)

    print(f"\nüìä Statistics:")
    print(f"  Average tokens WITH stopwords (text_cleaned): {avg_len_with:.1f}")
    print(f"  Average tokens WITHOUT stopwords (text_nostop): {avg_len_without:.1f}")
    print(f"  Reduction: {reduction:.1f}%")

    # Show examples
    print("\nüìÑ First 3 Examples (text_cleaned vs text_nostop):")
    for i in range(min(3, len(df_export))):
        print(f"\n--- Document {i} ---")
        print(f"WITH stopwords:    {df_export.iloc[i]['text_cleaned'][:120]}...")
        print(f"WITHOUT stopwords: {df_export.iloc[i]['text_nostop'][:120]}...")

    # ---- STEP 4: DOCUMENT-TERM MATRIX (FROM text_nostop) ----
    print("\n" + "=" * 80)
    print("STEP 4: DOCUMENT-TERM MATRIX (DTM) - WITHOUT STOPWORDS")
    print("=" * 80)

    # Use text_nostop (no stopwords) instead of text_cleaned
    texts = df_export['text_nostop'].tolist()

    dtm_df, vectorizer = create_term_document_matrix(
        texts,
        remove_punctuation=False,
        min_word_length=1,
        lowercase=True
    )
    dtm_df.to_csv('document_term_matrix.csv')
    print(f"\nüíæ Document-Term Matrix (NO STOPWORDS) saved to 'document_term_matrix.csv'")

    # ---- FINAL SUMMARY ----
    print("\n" + "=" * 80)
    print("PIPELINE COMPLETE!")
    print("=" * 80)
    print(f"\nüìä PROCESSING STATISTICS:")
    print(f"  Total documents processed: {len(df_export)}")
    print(f"  Total tokens extracted: {len(lemmatized_tokens)}")
    print(f"  Average tokens per document: {len(lemmatized_tokens) / lemmatized_tokens['doc_id'].nunique():.1f}")
    print(f"  Content words (non-stopwords): {(~lemmatized_tokens['STOP']).sum()}")
    print(f"  Stopwords filtered: {lemmatized_tokens['STOP'].sum()}")
    print(f"  POS-based collocations found: {len(out_pos)}")
    print(f"  PMI-based collocations found: {len(out_pmi)}")
    # The disabled version of this line referenced a misspelled `tdm_df`.
    print(f"  Unique terms in vocabulary: {dtm_df.shape[0]}")

    print(f"\nüíæ GENERATED FILES:")
    print(f"  ‚úì colloc_POS.xlsx - POS-based collocations ({len(out_pos)} rows)")
    print(f"  ‚úì colloc_PMI.xlsx - PMI-based collocations ({len(out_pmi)} rows)")
    print(f"  ‚úì lemmatized_tokens.csv - All lemmatized tokens with stopword flags")
    print(f"  ‚úì document_term_matrix.csv - DTM Matrix ({dtm_df.shape[0]} terms √ó {dtm_df.shape[1]} docs)")

    print(f"\nüìà TOP INSIGHTS:")

    print(f"\n  Top 10 Most Frequent Content Words:")
    top_words = lemmatized_tokens[~lemmatized_tokens['STOP']]['lemma'].value_counts().head(10)
    for word, freq in top_words.items():
        print(f"    ‚Ä¢ {word}: {freq}")

    print(f"\n  Top 5 POS Collocations by Frequency:")
    if len(out_pos) > 0:
        for _, row in out_pos.head(5).iterrows():
            print(f"    ‚Ä¢ {row['collocation']} ({row['pos_pattern']}): freq={row['freq']}")

    print(f"\n  Top 5 PMI Collocations:")
    if len(out_pmi) > 0:
        for _, row in out_pmi.head(5).iterrows():
            # Negative infinity is labelled "-inf"; the original printed "inf",
            # which reads as the opposite extreme.
            pmi_val = f"{row['pmi']:.2f}" if row['pmi'] != float('-inf') else "-inf"
            print(f"    ‚Ä¢ {row['collocation']}: PMI={pmi_val}, freq={row['freq']}")

    print(f"\n  Document Statistics:")
    doc_token_counts = lemmatized_tokens.groupby('doc_id').size()
    print(f"    ‚Ä¢ Average tokens per document: {doc_token_counts.mean():.1f}")
    print(f"    ‚Ä¢ Max tokens in a document: {doc_token_counts.max()}")
    print(f"    ‚Ä¢ Min tokens in a document: {doc_token_counts.min()}")

    print("\n" + "=" * 80)

In [None]:
# Quick visual check of the first five stopword-free texts.
print(df_export['text_nostop'].head(5))

In [None]:
# Work on a copy so that df_export itself is left untouched.
df_clean = df_export.copy()


def _flatten_whitespace(column):
    """Replace tabs/newlines with spaces, squeeze whitespace runs and trim.

    Only object-dtype columns are touched; every other column is returned as is.
    """
    if column.dtype == 'object':
        without_breaks = column.str.replace('\t|\r|\n', ' ', regex=True)
        squeezed = without_breaks.str.replace(r'\s+', ' ', regex=True)
        return squeezed.str.strip()
    return column


# Clean every object/string column.
df_clean = df_clean.apply(_flatten_whitespace)

# Export as a semicolon-separated file, quoting every non-numeric field.
df_clean.to_csv(
    'df_clean.csv',
    sep=';',
    index=False,
    encoding='utf-8',
    quoting=csv.QUOTE_NONNUMERIC,
)

## Exploratory data analysis

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt

#### Remove duplicate records

In [None]:
df = pd.read_csv('df_clean.csv', sep=';')

# Sequential id, so the smallest id within a group is its first occurrence.
df['id'] = range(len(df))

# One row per distinct text: the id of its first occurrence and how many
# times it appears, most repeated texts first.
dfControl = (
    df.groupby('text_nostop')
      .agg(id=('id', 'min'), n=('id', 'size'))
      .reset_index()
      .sort_values('n', ascending=False)
)

# Keep only the first occurrence of every text.
is_first_occurrence = df['id'].isin(dfControl['id'])
df = df[is_first_occurrence].reset_index(drop=True)

#### Remove uninformative frequent stopwords and commas

In [None]:
# Drop a few very frequent, uninformative verbs/adverbs (with an optional
# trailing "s", case-insensitive), then remove every comma.
df['text_nostop'] = (
    df['text_nostop']
    .str.replace(r"(?i)\b(also|say|could|make)s?\b", "", regex=True)
    .str.replace(",", "", regex=False)
)


#### Which is the most frequent ticker?

In [None]:
# Expand the comma-separated ticker lists once (one row per ticker). Symbols
# are stripped and empty entries dropped, so " NVDA" and "NVDA" are counted
# as the same ticker, matching the normalisation used by the later cells.
df_exploded = df.assign(ticker=df['tickers'].str.split(',')).explode('ticker')
df_exploded['ticker'] = df_exploded['ticker'].str.strip()
df_exploded = df_exploded[df_exploded['ticker'].notna() & (df_exploded['ticker'] != '')]

# Frequency of every ticker, reused for both the table and the chart below.
ticker_frequency = df_exploded['ticker'].value_counts()

# Top 10 tickers
top_tickers = ticker_frequency.head(10).index

# Keep only rows belonging to the top 10
df_top = df_exploded[df_exploded['ticker'].isin(top_tickers)]

# Per ticker: number of news items, first and last date
ticker_stats = df_top.groupby('ticker').agg(
    n_news=('ticker', 'count'),
    first_date=('parsed_date', 'min'),
    last_date=('parsed_date', 'max')
).reset_index()

# Sort by number of news items, descending
ticker_stats = ticker_stats.sort_values(by='n_news', ascending=False)

# Show the result
print(ticker_stats)

# Get top 30 tickers
ticker_counts = ticker_frequency.head(30)

# Create bar chart
plt.figure(figsize=(10, 6))
ticker_counts.plot(kind='barh', color='steelblue')
plt.xlabel('Frequency')
plt.ylabel('Ticker')
plt.title('30 Most Frequent Tickers')
plt.tight_layout()
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# -------------------------
# Parameters
# -------------------------
tickers_of_interest = ['BTC-USD', 'NVDA', 'OPAI.PVT']
start_date = "2025-10-01"
end_date   = "2025-10-20"

# -------------------------
# 1. Make sure parsed_date is datetime (unparseable values become NaT)
# -------------------------
df['parsed_date'] = pd.to_datetime(df['parsed_date'], errors='coerce')

# -------------------------
# 2. Filter on the date range (calendar day only)
# -------------------------
start, end = (pd.to_datetime(bound).date() for bound in (start_date, end_date))
mask = df['parsed_date'].dt.date.between(start, end)
df_period = df.loc[mask].copy()

# -------------------------
# 3. Expand the comma-separated tickers and normalise them
# -------------------------
# Turn NaN into empty strings so the split below cannot fail
df_period['tickers'] = df_period['tickers'].fillna('')

# split + explode: one row per ticker
ticker_lists = df_period['tickers'].str.split(',')
df_exploded = df_period.assign(ticker=ticker_lists).explode('ticker')

# strip whitespace and drop empty tickers
df_exploded['ticker'] = df_exploded['ticker'].astype(str).str.strip()
df_exploded = df_exploded.loc[df_exploded['ticker'] != ''].copy()

# -------------------------
# 4. Group by ticker and day (date without time)
# -------------------------
# date_only is a midnight datetime so it can be reindexed against a date_range
df_exploded['date_only'] = pd.to_datetime(df_exploded['parsed_date'].dt.date)

daily_counts = (
    df_exploded
    .groupby(['ticker', 'date_only'])
    .size()
    .reset_index(name='n_articles')
)

# -------------------------
# 5. Pivot and reindex
# -------------------------
daily_counts_pivot = (
    daily_counts
    .pivot(index='date_only', columns='ticker', values='n_articles')
    .fillna(0)
)

# complete daily index, so that every day of the range is present
full_index = pd.date_range(start=start_date, end=end_date, freq='D')
daily_counts_pivot = daily_counts_pivot.reindex(full_index, fill_value=0)
daily_counts_pivot.index.name = 'date'

# a requested ticker with no articles gets an all-zero column (avoids KeyError)
absent = [t for t in tickers_of_interest if t not in daily_counts_pivot.columns]
for t in absent:
    daily_counts_pivot[t] = 0

# order the columns as in the requested list
daily_counts_pivot = daily_counts_pivot[tickers_of_interest]

# -------------------------
# 6. Plot
# -------------------------
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import sys

fig, ax = plt.subplots(figsize=(12,6))

# Three shades of blue, cycled if there are more tickers than colours
colors = ["#1f77b4", "#869fb8", "#0b3d91"]

for i, ticker in enumerate(tickers_of_interest):
    line_color = colors[i % len(colors)]
    ax.plot(
        daily_counts_pivot.index,
        daily_counts_pivot[ticker],
        color=line_color,
        linewidth=2,
        label=ticker,
    )

ax.set_title(f"Daily number of articles for {', '.join(tickers_of_interest)}", fontsize=14, weight='bold')
ax.set_xlabel("Date", fontsize=12)
ax.set_ylabel("Number of articles", fontsize=12)

# X axis: day of month without zero padding + abbreviated month name.
# The "no padding" strftime flag differs between Windows and macOS/Linux.
day_month_format = '%#d %b' if sys.platform.startswith("win") else '%-d %b'
ax.xaxis.set_major_formatter(mdates.DateFormatter(day_month_format))

plt.xticks(rotation=0, fontsize=10)
ax.legend(frameon=False, fontsize=11)
ax.grid(alpha=0.2)
plt.tight_layout()
plt.show()


#### Lexicometric measures (N, V, H)

In [None]:
# Prepare the texts: NaN becomes the empty string, everything is cast to str
texts = df['text_nostop'].fillna("").astype(str).tolist()

# --- 1) Document-Term Matrix of raw counts ---
# \w{2,} keeps only tokens of at least two characters
vectorizer = CountVectorizer(token_pattern=r'\b\w{2,}\b', lowercase=True)
dtm = vectorizer.fit_transform(texts)   # sparse matrix, n_docs x n_terms
terms = vectorizer.get_feature_names_out()

# --- 2) Terms with their corpus-wide frequencies, most frequent first ---
term_freq = np.asarray(dtm.sum(axis=0)).ravel()   # column sums = total count per term
dfTerms2 = (
    pd.DataFrame({'term': terms, 'Freq': term_freq})
    .sort_values('Freq', ascending=False)
    .reset_index(drop=True)
)

# --- 3) Lexicometric measures ---
N = int(dfTerms2['Freq'].sum())          # corpus size: total occurrences
V = int(len(dfTerms2))                   # vocabulary size: distinct terms
H = int(dfTerms2['Freq'].eq(1).sum())    # hapax: terms occurring exactly once

TTR = V / N if N > 0 else np.nan           # Type-Token Ratio
HTR = H / V if V > 0 else np.nan           # Hapax-Type Ratio
TMF = N / V if V > 0 else np.nan           # Type Mean Frequency
G = V / np.sqrt(N) if N > 0 else np.nan    # Guiraud Index

# Single-row table of all measures
metrics = pd.DataFrame([{
    'N': N,
    'V': V,
    'H': H,
    'TTR': TTR,
    'HTR': HTR,
    'TMF': TMF,
    'G': G,
}])
print(metrics.to_string(index=False))

# --- 4) Number of tokens/lemmas per document (nlem2) ---
# "Lemmas per document" is taken as the number of counted tokens in the
# already pre-processed text, i.e. the row sums of the DTM.
nlem2 = np.array(dtm.sum(axis=1)).flatten()

# Descriptive statistics (min, quartiles, median, mean, max)
desc = {
    'Min': nlem2.min(),
    '1st Qu.': np.percentile(nlem2, 25),
    'Median': np.median(nlem2),
    'Mean': nlem2.mean(),
    '3rd Qu.': np.percentile(nlem2, 75),
    'Max': nlem2.max()
}
print("\nSummary of number of lemmas (tokens) per news:")
for k, v in desc.items():
    # Quartiles and the median can be fractional (e.g. 12.5); int() would
    # silently truncate them, so only whole values are printed as integers.
    # The mean is always shown with two decimals.
    if k != 'Mean' and float(v).is_integer():
        print(f"{k:8s}: {int(v)}")
    else:
        print(f"{k:8s}: {v:.2f}")

# --- 5) Add the per-document count to df as column n_lemma ---
df = df.copy()
df['n_lemma'] = nlem2

# --- 6) Histogram of the number of lemmas per document ---
max_n = int(nlem2.max())
bins = np.arange(0, max_n + 10, 10)  # bin edges every 10 (0, 10, 20, ...)

fig, ax = plt.subplots(figsize=(9,5))
ax.hist(nlem2, bins=bins, edgecolor='gray', linewidth=0.5)
ax.set_title("Distribution of the number of lemmas per news article")
ax.set_xlabel("Number of lemmas (tokens) per news")
ax.set_ylabel("Count of news")
ax.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

tickers_of_interest = ["BTC-USD", "NVDA"]

# Make sure 'tickers' holds no NaN before it is split
df_plot = df.copy()
df_plot['tickers'] = df_plot['tickers'].fillna("")


def _tickers_present(raw_tickers):
    """Return the tickers of interest listed as whole symbols in ``raw_tickers``.

    The comma-separated string is split and stripped first. A plain substring
    test would also match e.g. "BTC-USD" inside "WBTC-USD".
    """
    symbols = {part.strip() for part in raw_tickers.split(',')}
    return [t for t in tickers_of_interest if t in symbols]


# ticker_filtered: one row per (article, matching ticker of interest)
df_plot['ticker_filtered'] = df_plot['tickers'].apply(_tickers_present)
df_plot = df_plot.explode('ticker_filtered')
# Exploding an empty list yields NaN: those articles mention none of the tickers
df_plot = df_plot.dropna(subset=['ticker_filtered'])

# One subplot per ticker, side by side, sharing the y axis
fig, axes = plt.subplots(1, len(tickers_of_interest), figsize=(14,5),
                         sharey=True, squeeze=False)
axes = axes.ravel()

colors = ["#1f77b4", "#869fb8"]

for i, ticker in enumerate(tickers_of_interest):
    sns.histplot(
        data=df_plot[df_plot['ticker_filtered'] == ticker],
        x='n_lemma',
        bins=30,
        color=colors[i % len(colors)],  # cycle if there are more tickers than colours
        edgecolor='gray',
        alpha=0.7,
        ax=axes[i]
    )
    axes[i].set_title(f"Distribution of lemmas for {ticker}", fontsize=12, weight='bold')
    axes[i].set_xlabel("Number of lemmas per news")
    axes[i].set_ylabel("Count of news")
    axes[i].grid(axis='y', alpha=0.2)

plt.tight_layout()
plt.show()


#### Top 10 most frequent lemmas

In [None]:
# Ten most frequent terms; dfTerms2 is already sorted by descending frequency.
top10_terms = dfTerms2.head(10)
print("\nTop 10 terms by total frequency:")
print(top10_terms.to_string(index=False))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# First 20 rows = the 20 most frequent terms
top_terms = dfTerms2.head(20)

plt.figure(figsize=(10,6))
ax = sns.barplot(data=top_terms, y='term', x='Freq', color="#4c9dd6")

# Left-aligned title with some extra padding above the axes
ax.set_title("20 most frequent lemmas", fontsize=13, pad=20, loc='left')

ax.set_ylabel("")
ax.set_xlabel("Freq", fontsize=10, weight='bold')
ax.grid(axis='x', alpha=0.2)
plt.tight_layout()
plt.show()


In [None]:
# Full term/frequency table, most frequent terms first.
print(dfTerms2)

### New Wordcloud for News Article

In [None]:
import numpy as np
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import matplotlib.cm as cm

# --- Build a circular mask for the word cloud ---
# WordCloud treats pure-white (255) mask entries as "masked out", so the final
# array must be 0 inside the circle (drawable) and 255 outside.
size = 1200
x, y = np.ogrid[:size, :size]
radius = size // 2
mask = (x - radius)**2 + (y - radius)**2 <= radius**2  # True inside the circle
mask = 255 * mask.astype(int)   # 255 inside the circle, 0 outside
mask = 255 - mask                # invert: 0 inside (words go here), 255 outside

# --- Custom colouring function for the word cloud ---
def color_func(word, font_size, position, orientation, random_state=None, **kwargs):
    """Return an ``rgb(r, g, b)`` string shading *word* by its frequency.

    The frequency is looked up in the notebook-level ``word_freq_dict`` and
    rescaled with ``min_freq`` / ``max_freq`` to the range [0.3, 1.0] (starting
    at 0.3 avoids colours that are too light); the rescaled value is then mapped
    through Matplotlib's 'Blues' colormap. If all frequencies are equal, the
    fixed value 0.65 is used.

    The remaining parameters are part of the signature WordCloud calls colour
    functions with and are not used here.

    Raises:
        KeyError: if *word* is not a key of ``word_freq_dict``.
    """
    freq = word_freq_dict[word]
    if max_freq != min_freq:
        normalized = 0.3 + 0.7 * ((freq - min_freq) / (max_freq - min_freq))
    else:
        normalized = 0.65

    # plt.get_cmap replaces matplotlib.cm.get_cmap, which was deprecated in
    # Matplotlib 3.7 and removed in 3.9.
    blues = plt.get_cmap('Blues')
    rgb = blues(normalized)[:3]  # keep RGB only, drop alpha

    return f"rgb({int(rgb[0]*255)}, {int(rgb[1]*255)}, {int(rgb[2]*255)})"

# --- Build the term -> frequency dictionary and the range used by color_func ---
word_freq_dict = dict(zip(dfTerms2['term'], dfTerms2['Freq']))
max_freq = dfTerms2['Freq'].max()
min_freq = dfTerms2['Freq'].min()

# --- WordCloud configured with the circular mask, at most 150 words ---
wc = WordCloud(
    background_color="white",
    max_words=150,
    mask=mask,
    contour_width=0,
    width=size,
    height=size
)

# --- Lay out the cloud from the term/frequency pairs ---
wc.generate_from_frequencies(word_freq_dict)

# --- Re-colour the laid-out words with the frequency-based colour function ---
wc.recolor(color_func=color_func)

# --- Display and save to disk (savefig is called before show) ---
plt.figure(figsize=(8,8))
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.tight_layout()
plt.savefig('wordcloud_bitcoin.png', dpi=300, bbox_inches='tight', facecolor='white')
plt.show()

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
import seaborn as sns
import math

# -----------------------------
# 1) FILTERING AND PREPARATION
# -----------------------------

# Tell whether an article's ticker string mentions a given ticker
def contains_ticker(tickers_str, ticker):
    """Return True if *ticker* is one of the comma-separated entries of *tickers_str*.

    Entries are compared after stripping surrounding whitespace. A missing
    value (None / NaN, as detected by ``pd.isna``) yields False.
    """
    if pd.isna(tickers_str):
        return False
    entries = (piece.strip() for piece in tickers_str.split(','))
    return any(entry == ticker for entry in entries)

# Keep only the articles tagged with at least one of the two tickers.
# NOTE: this rebinds the notebook-level `df`, so cells run afterwards see the filtered frame.
df = df[df['tickers'].apply(lambda x: contains_ticker(x, 'BTC-USD') or contains_ticker(x, 'NVDA'))].copy()

# Classify an article by which of the two tickers (BTC-USD / NVDA) it is tagged with
def assign_group(tickers_str):
    """Return the comparison group for a comma-separated ticker string.

    Returns:
        'BTC-USD' if only BTC-USD is listed, 'NVDA' if only NVDA is listed,
        'BOTH' if both are listed, and None if neither is listed or if
        *tickers_str* is not a string (e.g. None / NaN), so the function can be
        applied to a column that still contains missing values.
    """
    if not isinstance(tickers_str, str):
        return None

    tickers = {t.strip() for t in tickers_str.split(',')}
    has_btc = 'BTC-USD' in tickers
    has_nvda = 'NVDA' in tickers

    if has_btc and has_nvda:
        return 'BOTH'
    if has_btc:
        return 'BTC-USD'
    if has_nvda:
        return 'NVDA'
    return None

# Label every article with its group and keep only the unambiguous ones.
df['group'] = df['tickers'].apply(assign_group)
df = df[df['group'].isin(['BTC-USD', 'NVDA'])]  # drop the 'BOTH' rows to avoid ambiguity

# Concatenate the texts of each group into one string per group (like summarise in R)
texts = df.groupby('group')['text_nostop'].apply(lambda x: ' '.join(x)).to_dict()

# -----------------------------
# 2) TERM-DOCUMENT MATRIX
# -----------------------------

# NOTE(review): only two documents are fitted (one per group), so min_df=2 keeps
# only the terms that occur in BOTH groups — confirm this is intended.
vectorizer = CountVectorizer(min_df=2, token_pattern=r'\b[a-zA-Z]{2,}\b')  # words of at least 2 letters
X = vectorizer.fit_transform([texts['BTC-USD'], texts['NVDA']])
terms = vectorizer.get_feature_names_out()

# Term-document matrix: one row per term, one count column per group
tdm = pd.DataFrame(X.toarray().T, index=terms, columns=['BTC-USD', 'NVDA'])

# -----------------------------
# 3) RELATIVE FREQUENCIES
# -----------------------------

# Share of each term within the total count of its group
tdm['rfBTC'] = tdm['BTC-USD'] / tdm['BTC-USD'].sum()
tdm['rfNVDA'] = tdm['NVDA'] / tdm['NVDA'].sum()

# -----------------------------
# 4) SCATTER PLOT OF RELATIVE FREQUENCIES (with labels)
# -----------------------------

plt.figure(figsize=(10, 8))
sns.scatterplot(data=tdm, x='rfBTC', y='rfNVDA', alpha=0.6, s=30)

# Diagonal reference line (equal relative frequency in both groups)
max_val = max(tdm['rfBTC'].max(), tdm['rfNVDA'].max())
plt.plot([0, max_val], [0, max_val], color='red', linestyle='--', linewidth=1)

# Word labels
# (only the first 100 rows after sorting by rfBTC, then rfNVDA, to limit overlap)
tdm_sorted = tdm.sort_values(by=['rfBTC', 'rfNVDA'], ascending=False)
subset = tdm_sorted.head(100)  # label at most 100 words
for i, row in subset.iterrows():
    plt.text(row['rfBTC'], row['rfNVDA'], i, fontsize=8, alpha=0.7)  # i is the term (row index)

# Log scale on both axes for readability
plt.xscale('log')
plt.yscale('log')

plt.xlabel('Frequenza relativa - BTC-USD')
plt.ylabel('Frequenza relativa - NVDA')
plt.title('Confronto delle distribuzioni delle parole (BTC vs NVDA)')
plt.grid(True, which="both", ls="--", lw=0.5, alpha=0.4)
plt.tight_layout()
plt.show()

# -----------------------------
# 5) LOG ODDS RATIO
# -----------------------------

# 1 is added to each count and to each group total before taking the ratio;
# positive log_ratio = relatively more frequent in BTC-USD, negative = in NVDA.
tdm['pBTC'] = (tdm['BTC-USD'] + 1) / (tdm['BTC-USD'].sum() + 1)
tdm['pNVDA'] = (tdm['NVDA'] + 1) / (tdm['NVDA'].sum() + 1)
tdm['log_ratio'] = np.log(tdm['pBTC'] / tdm['pNVDA'])

# -----------------------------
# 6) LOG ODDS RATIO CHART
# -----------------------------

# Select the 30 terms with the largest absolute log ratio
top_words = (
    tdm.reindex(tdm['log_ratio'].abs().sort_values(ascending=False).index)
    .head(30)
    .assign(word=lambda d: d.index)
)

plt.figure(figsize=(8, 10))
sns.barplot(
    data=top_words,
    y='word',
    x='log_ratio',
    hue=(top_words['log_ratio'] < 0),  # True for terms leaning towards NVDA
    dodge=False,
    palette={False: 'steelblue', True: 'seagreen'}  # False -> BTC-USD side, True -> NVDA side
)
plt.axvline(0, color='black', linewidth=1)
plt.ylabel('Parole')
plt.xlabel('Log Odds Ratio (BTC-USD / NVDA)')
plt.title('Parole pi√π caratteristiche tra BTC e NVDA')
# NOTE(review): the labels are assigned positionally to the legend handles —
# verify that the rendered legend order matches (BTC-USD first, NVDA second).
plt.legend(title='', labels=['BTC-USD', 'NVDA'])
plt.tight_layout()
plt.show()

# -----------------------------
# 7) OPTIONAL: OUTPUT FOR THE WORD CLOUD
# -----------------------------

# Flat table (term plus counts, relative frequencies and log ratio).
# NOTE: this rebinds dfTerms2, and the new frame has no 'Freq' column.
dfTerms2 = (
    tdm[['BTC-USD', 'NVDA', 'rfBTC', 'rfNVDA', 'log_ratio']]
    .reset_index()
    .rename(columns={'index': 'term'})
)

# Example: term/'Freq' table for a BTC-only word cloud
dfTerms2_btc = dfTerms2[['term', 'BTC-USD']].rename(columns={'BTC-USD': 'Freq'})


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# -----------------------------
# DATA: assumes that `df` has already been prepared
# -----------------------------

def contains_ticker(tickers_str, ticker):
    """Return True if *ticker* is one of the comma-separated entries of *tickers_str*.

    Entries are compared after stripping surrounding whitespace. A missing
    value (None / NaN, as detected by ``pd.isna``) yields False.
    """
    if pd.isna(tickers_str):
        return False
    entries = (piece.strip() for piece in tickers_str.split(','))
    return any(entry == ticker for entry in entries)

# Keep only the articles tagged with BTC-USD or NVDA (rebinds the notebook-level `df`).
df = df[df['tickers'].apply(lambda x: contains_ticker(x, 'BTC-USD') or contains_ticker(x, 'NVDA'))].copy()

def assign_group(tickers_str):
    """Return the comparison group for a comma-separated ticker string.

    Returns:
        'BTC-USD' if only BTC-USD is listed, 'NVDA' if only NVDA is listed,
        'BOTH' if both are listed, and None if neither is listed or if
        *tickers_str* is not a string (e.g. None / NaN), so the function can be
        applied to a column that still contains missing values.
    """
    if not isinstance(tickers_str, str):
        return None

    tickers = {t.strip() for t in tickers_str.split(',')}
    has_btc = 'BTC-USD' in tickers
    has_nvda = 'NVDA' in tickers

    if has_btc and has_nvda:
        return 'BOTH'
    if has_btc:
        return 'BTC-USD'
    if has_nvda:
        return 'NVDA'
    return None

# Label every article with its group and keep only the unambiguous ones ('BOTH' is dropped).
df['group'] = df['tickers'].apply(assign_group)
df = df[df['group'].isin(['BTC-USD', 'NVDA'])]

# One concatenated text per group
texts = df.groupby('group')['text_nostop'].apply(lambda x: ' '.join(x)).to_dict()

# -----------------------------
# TERM-DOCUMENT MATRIX
# -----------------------------
from sklearn.feature_extraction.text import CountVectorizer

# NOTE(review): only two documents are fitted (one per group), so min_df=2 keeps
# only the terms that occur in BOTH groups — confirm this is intended.
vectorizer = CountVectorizer(min_df=2, token_pattern=r'\b[a-zA-Z]{2,}\b')
X = vectorizer.fit_transform([texts['BTC-USD'], texts['NVDA']])
terms = vectorizer.get_feature_names_out()

# One row per term, one count column per group, plus within-group relative frequencies
tdm = pd.DataFrame(X.toarray().T, index=terms, columns=['BTC-USD', 'NVDA'])
tdm['rfBTC'] = tdm['BTC-USD'] / tdm['BTC-USD'].sum()
tdm['rfNVDA'] = tdm['NVDA'] / tdm['NVDA'].sum()

# -----------------------------
# SCATTER PLOT (relative frequencies)
# -----------------------------
# Absolute difference of the log relative frequencies (1e-10 guards against log(0));
# this column is not used further in this cell.
tdm['distance_from_diagonal'] = np.abs(np.log(tdm['rfBTC'] + 1e-10) - np.log(tdm['rfNVDA'] + 1e-10))
tdm['total_freq'] = tdm['rfBTC'] + tdm['rfNVDA']

# Words to label: the 15 most frequent terms that are at least 1.5x more frequent
# in one group than in the other, plus the 10 most frequent terms whose
# BTC/NVDA frequency ratio lies between 0.7 and 1.3.
btc_words = tdm[tdm['rfBTC'] > tdm['rfNVDA'] * 1.5].nlargest(15, 'rfBTC')
nvda_words = tdm[tdm['rfNVDA'] > tdm['rfBTC'] * 1.5].nlargest(15, 'rfNVDA')
common_words = tdm[
    (tdm['rfBTC'] / (tdm['rfNVDA'] + 1e-10) > 0.7) & 
    (tdm['rfBTC'] / (tdm['rfNVDA'] + 1e-10) < 1.3)
].nlargest(10, 'total_freq')

words_to_label = pd.concat([btc_words, nvda_words, common_words])

# All terms in light gray, then the selected terms on top in black
plt.figure(figsize=(10, 8))
plt.scatter(tdm['rfBTC'], tdm['rfNVDA'], alpha=0.3, s=30, color='lightgray')
plt.scatter(btc_words['rfBTC'], btc_words['rfNVDA'], color='black', s=50, alpha=0.8, edgecolors='black')
plt.scatter(nvda_words['rfBTC'], nvda_words['rfNVDA'], color='black', s=50, alpha=0.8, edgecolors='black')
plt.scatter(common_words['rfBTC'], common_words['rfNVDA'], color='black', s=50, alpha=0.8, edgecolors='black')

# Diagonal reference line (equal relative frequency in both groups)
max_val = max(tdm['rfBTC'].max(), tdm['rfNVDA'].max())
plt.plot([0, max_val], [0, max_val], color='red', linestyle='-', linewidth=2, alpha=0.5)

# Text label next to each selected term (the row index is the term)
for word, row in words_to_label.iterrows():
    plt.text(row['rfBTC'], row['rfNVDA'], word, fontsize=10, color='black', ha='left', va='bottom')

plt.xscale('log')
plt.yscale('log')
plt.xlabel('Frequenza relativa - BTC-USD (scala logaritmica)', fontsize=13, fontweight='bold')
plt.ylabel('Frequenza relativa - NVDA (scala logaritmica)', fontsize=13, fontweight='bold')
plt.title('Confronto Distribuzioni Parole: BTC-USD vs NVDA', fontsize=15, fontweight='bold', pad=15)
plt.grid(True, which="both", ls="-", lw=0.5, alpha=0.3, color='lightgray')
plt.tight_layout()
plt.savefig('scatter_plot_frequenze.png', dpi=300, bbox_inches='tight')
plt.show()


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.patches import Patch

# -----------------------------
# LOG ODDS RATIO
# -----------------------------
# 1 is added to each count and to each group total before taking the ratio;
# positive log_ratio = relatively more frequent in BTC-USD, negative = in NVDA.
tdm['pBTC'] = (tdm['BTC-USD'] + 1) / (tdm['BTC-USD'].sum() + 1)
tdm['pNVDA'] = (tdm['NVDA'] + 1) / (tdm['NVDA'].sum() + 1)
tdm['log_ratio'] = np.log(tdm['pBTC'] / tdm['pNVDA'])

# The 40 terms with the largest absolute log ratio, ordered by signed value
# so the bars run from most-NVDA (bottom) to most-BTC (top).
top_words = (
    tdm.reindex(tdm['log_ratio'].abs().sort_values(ascending=False).index)
    .head(40)
    .assign(word=lambda d: d.index)
    .sort_values('log_ratio')
)

# -----------------------------
# MINIMAL COLORS
# -----------------------------
color_btc = "#1f77b4"   # blue
color_nvda = "#869fb8"  # light blue
colors = [color_btc if x > 0 else color_nvda for x in top_words['log_ratio']]

# -----------------------------
# PLOT
# -----------------------------
plt.figure(figsize=(10, 10))
bars = plt.barh(
    range(len(top_words)),
    top_words['log_ratio'],
    color=colors,
    edgecolor='none'
)

plt.yticks(range(len(top_words)), top_words['word'], fontsize=11)
plt.axvline(0, color='black', linewidth=1.2)

# Value labels just inside the outer end of each bar
for i, value in enumerate(top_words['log_ratio']):
    # The offset is signed like the bar, so subtracting it always moves the
    # text from the bar tip towards zero (i.e. inside the bar).
    offset = 0.03 if value > 0 else -0.03
    ha = 'right' if value > 0 else 'left'
    plt.text(
        value - offset, i, f'{value:.2f}',
        va='center', ha=ha,
        fontsize=8, 
        color='black'
    )

plt.xlabel('Log Odds Ratio (BTC-USD / NVDA)', fontsize=13, fontweight='bold')
plt.ylabel('Words', fontsize=13, fontweight='bold')
plt.title('Characteristic Words: BTC-USD vs NVDA\n(Positive = more BTC | Negative = more NVDA)',
          fontsize=14, fontweight='bold', pad=20)

# Hand-built legend: the bars are coloured per bar, so there are no labelled artists
legend_elements = [
    Patch(facecolor=color_btc, label='More characteristic of BTC-USD'),
    Patch(facecolor=color_nvda, label='More characteristic of NVDA')
]
plt.legend(handles=legend_elements, loc='lower right', fontsize=11)

plt.grid(axis='x', alpha=0.25, linestyle='--')
plt.tight_layout()
plt.savefig('log_odds_ratio_inside_edge_labels.png', dpi=300, bbox_inches='tight')
plt.show()


### Clustering Terms

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from scipy.cluster.hierarchy import dendrogram, linkage, cut_tree, fcluster
from scipy.spatial.distance import pdist, squareform
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

In [None]:
# Count tokens of two or more word characters in every article, then transpose
# so that rows are terms and columns are documents.
# NOTE: `tdm` is rebound here to a sparse matrix (earlier cells used the same
# name for a DataFrame).
vectorizer = CountVectorizer(token_pattern=r'\b\w{2,}\b')
tdm_matrix = vectorizer.fit_transform(df['text_nostop'])
tdm = tdm_matrix.T
print(f"Term Document Matrix: {tdm.shape[0]} terms, {tdm.shape[1]} documents")


In [None]:
# Drop sparse terms: keep only the terms that occur in at least 1% of the documents.
min_docs = int(np.ceil(0.01 * tdm.shape[1]))
term_counts = np.array((tdm > 0).sum(axis=1)).flatten()  # number of documents containing each term
mask = term_counts >= min_docs  # NOTE: rebinds the name `mask` used by the word-cloud cell
tdm_rs = tdm[mask]
terms = np.array(vectorizer.get_feature_names_out())[mask]

print(f"Reduced TDM: {tdm_rs.shape[0]} terms, {tdm_rs.shape[1]} documents")

In [None]:
# Pairwise cosine distances between terms (rows of the reduced TDM).
# pdist needs a dense array and returns the distances in condensed form.
tdm_rs_dense = tdm_rs.toarray()
dst_cs_rs = pdist(tdm_rs_dense, metric='cosine')

In [None]:
# Hierarchical clustering of the terms with Ward linkage on the cosine distances.
# NOTE(review): SciPy documents Ward linkage as correctly defined only for
# Euclidean distances; here it is fed cosine distances — confirm this is the
# intended replication of the R workflow.
h_cl_cs_rs = linkage(dst_cs_rs, method='ward')

# Plot the dendrogram with every branch in a single dark colour (as in R)
plt.figure(figsize=(10, 6))
plt.title("Hierarchical Cluster\ncosine - Ward", fontsize=12)
dendrogram(h_cl_cs_rs, 
           labels=terms, 
           leaf_font_size=2,
           color_threshold=0,  
           above_threshold_color="#111111")  
plt.xlabel("")
plt.xticks(fontsize=6)
plt.yticks(fontsize=8)
plt.tight_layout()
plt.show()

In [None]:
# Silhouette score for every number of clusters from 2 to 20
lim_clu = range(2, 21)
vSilIn = []

# silhouette_score with metric='precomputed' needs the square distance matrix
dst_cs_rs_matrix = squareform(dst_cs_rs)

for k in lim_clu:
    # Cut the term tree into at most k flat clusters and score that partition
    clusters = fcluster(h_cl_cs_rs, k, criterion='maxclust')
    sil_score = silhouette_score(dst_cs_rs_matrix, clusters, metric='precomputed')
    vSilIn.append(sil_score)

# Plot the silhouette score against the number of clusters
df_sil = pd.DataFrame({'n_clust': list(lim_clu), 'silhouette': vSilIn})
plt.figure(figsize=(10, 6))
plt.plot(df_sil['n_clust'], df_sil['silhouette'], color='blue', linewidth=2, marker='o', markersize=6)
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.title('Silhouette score')
plt.xticks(lim_clu)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
from scipy.cluster.hierarchy import dendrogram, fcluster
import matplotlib.pyplot as plt
import numpy as np

# Dendrogram of the terms, to be overlaid with rectangles for k=6 clusters
fig, ax = plt.subplots(figsize=(15, 8))

# Draw the dendrogram with ALL branches in black
dend = dendrogram(h_cl_cs_rs, 
                  labels=terms, 
                  leaf_font_size=6,
                  color_threshold=0,  # no per-cluster colouring
                  above_threshold_color='black',  # colour used for every branch
                  ax=ax)

# Flat cluster labels for k=6
clu6 = fcluster(h_cl_cs_rs, 6, criterion='maxclust')

# Cut height for k=6 clusters: the midpoint between the merge heights
# (column 2 of the linkage matrix) of the 5th- and 6th-from-last merges.
n = len(h_cl_cs_rs) + 1  # number of clustered terms (not used below)
# For a monotone linkage, a cut between these two merges leaves 6 clusters.
cut_height = (h_cl_cs_rs[-(6-1), 2] + h_cl_cs_rs[-(6), 2]) / 2

# Draw one rectangle per cluster on a dendrogram, in the spirit of R's rect.hclust
def rect_hclust_r_style(linkage_matrix, k, dend_dict, cut_height, ax, color='red', linewidth=1.2):
    """
    Outline each of the *k* flat clusters with an unfilled rectangle on *ax*.

    Each rectangle spans horizontally from the leftmost to the rightmost leaf
    of its cluster (plus half a leaf slot on either side) and vertically from
    0 up to *cut_height*.

    Parameters:
        linkage_matrix: linkage matrix passed to ``fcluster``.
        k: number of clusters (``criterion='maxclust'``).
        dend_dict: dict returned by ``dendrogram``; only ``'leaves'`` is read.
        cut_height: top edge of every rectangle.
        ax: Matplotlib axes the dendrogram was drawn on.
        color, linewidth: rectangle edge style.
    """
    labels_by_obs = fcluster(linkage_matrix, k, criterion='maxclust')

    # Assumes the dendrogram layout where the i-th leaf (left to right) is
    # centred at x = 10*i + 5.
    leaf_order = np.array(dend_dict['leaves'])
    x_of_leaf = {obs: 10 * slot + 5 for slot, obs in enumerate(leaf_order)}

    for cluster_id in range(1, k + 1):
        members = np.where(labels_by_obs == cluster_id)[0]
        xs = [x_of_leaf[obs] for obs in members if obs in x_of_leaf]
        if not xs:
            # Empty cluster, or none of its observations is a drawn leaf
            continue

        left, right = min(xs), max(xs)

        # Extend by half a leaf slot (5) on each side so the outer leaves are enclosed
        box = plt.Rectangle((left - 5, 0), right - left + 10, cut_height,
                            fill=False,
                            edgecolor=color,
                            linewidth=linewidth,
                            zorder=10)  # high zorder: drawn above the dendrogram lines
        ax.add_patch(box)

# Add the red cluster rectangles (linewidth=1.2) on top of the dendrogram
rect_hclust_r_style(h_cl_cs_rs, 6, dend, cut_height, ax, color='red', linewidth=1.2)

plt.title("Hierarchical Cluster\ncosine - Ward", fontsize=14, fontweight='bold')
plt.xlabel("")
plt.xticks(rotation=90, fontsize=5)
plt.yticks(fontsize=10)
plt.tight_layout()
plt.show()

# Table of cluster assignments for k=6, summarised as one row per cluster:
# the number of terms and the terms themselves joined with '; '
dfclu6 = pd.DataFrame({'term': terms, 'clu6': clu6})
dfclu6_summary = dfclu6.groupby('clu6').agg(
    n_terms=('term', 'count'),
    terms=('term', lambda x: '; '.join(x))
).reset_index()

print(dfclu6_summary)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, fcluster, to_tree
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import linkage

def plot_dendrogram_2d_layout(linkage_matrix, labels, k=6, figsize=(16, 12),
                              max_labels=80, label_fontsize=8):
    """
    Draw the cluster tree in Cartesian coordinates with straight edges.

    Leaves sit at y=0 in the left-to-right order computed by
    ``scipy.cluster.hierarchy.dendrogram``; every internal node sits at the
    midpoint of its two children on x and at its merge distance on y. Leaf
    markers and labels are coloured by flat cluster.

    Parameters:
        linkage_matrix: SciPy linkage matrix.
        labels: sequence of leaf labels, indexed by original observation id.
        k: number of flat clusters (``fcluster`` with ``criterion='maxclust'``).
        figsize: figure size passed to ``plt.subplots``.
        max_labels: if there are more leaves than this, only the first
            ``max_labels // k`` observations of each cluster get a text label.
        label_fontsize: font size of the leaf labels.

    Returns:
        (fig, ax): the Matplotlib figure and axes.
    """

    # Flat cluster label of every observation
    clusters = fcluster(linkage_matrix, k, criterion='maxclust')

    # Linkage matrix as a tree of ClusterNode objects
    tree = to_tree(linkage_matrix, rd=False)

    # Only the left-to-right leaf order of the standard dendrogram is needed
    leaf_order = dendrogram(linkage_matrix, no_plot=True)['leaves']

    # observation id -> x coordinate of its leaf; a dict gives O(1) lookups
    # where repeated list.index() calls would make the layout quadratic
    leaf_x = {obs: slot * 10 + 5 for slot, obs in enumerate(leaf_order)}

    # Node coordinates, keyed by id(node)
    x_positions = {}
    y_positions = {}

    def get_node_positions(node):
        """Recursively record and return the (x, y) position of *node*."""
        if node.is_leaf():
            x, y = leaf_x[node.id], 0
        else:
            left_x, _ = get_node_positions(node.left)
            right_x, _ = get_node_positions(node.right)
            # Internal node: centred above its children, at the merge distance
            x, y = (left_x + right_x) / 2, node.dist

        x_positions[id(node)] = x
        y_positions[id(node)] = y
        return x, y

    get_node_positions(tree)

    # Choose which leaves get a text label
    if len(labels) > max_labels:
        labels_per_cluster = max_labels // k
        indices_to_show = set()

        for cluster_id in range(1, k+1):
            cluster_indices = np.where(clusters == cluster_id)[0]
            indices_to_show.update(cluster_indices[:labels_per_cluster].tolist())
    else:
        indices_to_show = set(range(len(labels)))

    # One colour per cluster
    colors = plt.cm.Set3(np.linspace(0, 1, k))

    fig, ax = plt.subplots(figsize=figsize)

    def draw_tree_edges(node, parent_pos=None):
        """Draw a straight line from each parent to each child, recursively."""
        x = x_positions[id(node)]
        y = y_positions[id(node)]

        if parent_pos is not None:
            ax.plot([parent_pos[0], x], [parent_pos[1], y], 
                   'k-', linewidth=0.5, alpha=0.3)

        if not node.is_leaf():
            draw_tree_edges(node.left, (x, y))
            draw_tree_edges(node.right, (x, y))

    draw_tree_edges(tree)

    # Leaf markers and (optionally) their labels
    for i, label in enumerate(labels):
        if i in leaf_x:
            x = leaf_x[i]
            y = 0

            cluster_id = clusters[i]
            color = colors[cluster_id - 1]  # cluster ids start at 1

            ax.plot(x, y, 'o', color=color, markersize=4, alpha=0.8)

            if i in indices_to_show:
                ax.text(x, y, label, fontsize=label_fontsize, 
                       color=color, rotation=90, ha='right', va='bottom',
                       fontweight='bold')

    # Axis labels and title
    ax.set_xlabel('Terms', fontsize=12)
    ax.set_ylabel('Distance', fontsize=12)
    ax.set_title(f'Hierarchical Clustering Dendrogram - {k} Clusters', 
                fontsize=16, fontweight='bold', pad=20)

    # Legend: one patch per cluster
    from matplotlib.patches import Patch
    legend_elements = [Patch(facecolor=colors[i], label=f'Cluster {i+1}') 
                      for i in range(k)]
    ax.legend(handles=legend_elements, loc='upper left', 
             fontsize=10, title='Clusters', title_fontsize=12, framealpha=0.95)

    plt.tight_layout()
    return fig, ax


def plot_dendrogram_network_style(linkage_matrix, labels, k=6, figsize=(18, 12),
                                  max_labels=70, label_fontsize=9, seed=42):
    """
    Draw the cluster tree as a graph positioned by a spring layout.

    The linkage tree is converted into a NetworkX graph (one node per tree
    node, one edge per parent-child link) and laid out in 2D with
    ``nx.spring_layout``. Leaves are coloured by flat cluster; internal nodes
    are small light-gray dots.

    Parameters:
        linkage_matrix: SciPy linkage matrix.
        labels: sequence of leaf labels, indexed by original observation id.
        k: number of flat clusters (``fcluster`` with ``criterion='maxclust'``).
        figsize: figure size passed to ``plt.subplots``.
        max_labels: if there are more leaves than this, only the first
            ``max_labels // k`` leaves of each cluster get a text label.
        label_fontsize: font size of the leaf labels.
        seed: seed passed to ``nx.spring_layout``.

    Returns:
        (fig, ax): the Matplotlib figure and axes.
    """
    
    # Flat cluster label of every observation
    clusters = fcluster(linkage_matrix, k, criterion='maxclust')
    
    # Linkage matrix as a tree of ClusterNode objects
    tree = to_tree(linkage_matrix, rd=False)
    
    # Build a NetworkX graph from the tree (imported here, not at the top of the cell)
    import networkx as nx
    
    G = nx.Graph()
    node_labels_map = {}
    node_clusters = {}
    
    def add_edges(node, parent=None):
        # Graph nodes are keyed by id(node); nodes are added in pre-order
        node_id = id(node)
        
        if node.is_leaf():
            leaf_label = labels[node.id]
            node_labels_map[node_id] = leaf_label
            node_clusters[node_id] = clusters[node.id]
            G.add_node(node_id, is_leaf=True, label=leaf_label, 
                      cluster=clusters[node.id])
        else:
            node_labels_map[node_id] = ""
            G.add_node(node_id, is_leaf=False, label="")
            add_edges(node.left, node_id)
            add_edges(node.right, node_id)
        
        if parent is not None:
            G.add_edge(parent, node_id)
    
    add_edges(tree)
    
    # Position the nodes with NetworkX's spring layout, seeded by `seed`.
    # The k=3 here is spring_layout's own spacing parameter, not the number
    # of clusters.
    pos = nx.spring_layout(G, k=3, iterations=100, seed=seed, scale=30000)
    
    # One colour per cluster
    colors = plt.cm.Set3(np.linspace(0, 1, k))
    
    # Choose which leaves get a text label
    leaf_nodes = [n for n in G.nodes() if G.nodes[n].get('is_leaf', False)]
    
    if len(leaf_nodes) > max_labels:
        labels_per_cluster = max_labels // k
        labels_to_show = set()
        
        for cluster_id in range(1, k+1):
            cluster_nodes = [n for n in leaf_nodes 
                           if node_clusters.get(n) == cluster_id]
            selected = cluster_nodes[:labels_per_cluster]
            labels_to_show.update(selected)
    else:
        labels_to_show = set(leaf_nodes)
    
    # Create the plot
    fig, ax = plt.subplots(figsize=figsize)
    
    # Draw the edges
    nx.draw_networkx_edges(G, pos, alpha=0.15, width=0.5, edge_color='black', ax=ax)
    
    # Draw the leaf nodes cluster by cluster
    for cluster_id in range(1, k+1):
        cluster_nodes = [n for n in leaf_nodes if node_clusters.get(n) == cluster_id]
        
        # Labelled leaves are drawn larger and more opaque than unlabelled ones
        labeled = [n for n in cluster_nodes if n in labels_to_show]
        unlabeled = [n for n in cluster_nodes if n not in labels_to_show]
        
        if labeled:
            nx.draw_networkx_nodes(G, pos, nodelist=labeled,
                                  node_color=[colors[cluster_id-1]],
                                  node_size=50, alpha=0.9, ax=ax)
        
        if unlabeled:
            nx.draw_networkx_nodes(G, pos, nodelist=unlabeled,
                                  node_color=[colors[cluster_id-1]],
                                  node_size=20, alpha=0.5, ax=ax)
    
    # Draw the internal (non-leaf) nodes
    internal = [n for n in G.nodes() if not G.nodes[n].get('is_leaf', False)]
    nx.draw_networkx_nodes(G, pos, nodelist=internal,
                          node_color='lightgray', node_size=10, alpha=0.3, ax=ax)
    
    # Add the text labels on a semi-transparent white box
    for node in labels_to_show:
        x, y = pos[node]
        label = node_labels_map[node]
        cluster_id = node_clusters[node]
        
        ax.text(x, y, label, fontsize=label_fontsize,
               color=colors[cluster_id-1], ha='center', va='center',
               fontweight='bold',
               bbox=dict(boxstyle='round,pad=0.3', facecolor='white',
                        edgecolor='none', alpha=0.7))
    
    ax.set_title(f'Hierarchical Clustering Dendrogram - {k} Clusters',
                fontsize=16, fontweight='bold', pad=20)
    ax.axis('off')
    
    # Legend: one patch per cluster
    from matplotlib.patches import Patch
    legend_elements = [Patch(facecolor=colors[i], label=f'Cluster {i+1}')
                      for i in range(k)]
    ax.legend(handles=legend_elements, loc='upper right',
             fontsize=10, title='Clusters', title_fontsize=12, framealpha=0.95)
    
    plt.tight_layout()
    return fig, ax


# ============================================
# Dendrogram (graph layout) of the term clustering, k=6
# ============================================
print("Creazione dendrogramma con layout a grafo...")
fig, ax = plot_dendrogram_network_style(
    h_cl_cs_rs,
    terms,
    k=6,
    figsize=(20, 14),
    max_labels=70,
    label_fontsize=8,
    seed=42
)
plt.show()

### Clustering documents

In [None]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from scipy.cluster.hierarchy import dendrogram, linkage, cut_tree, fcluster
from scipy.spatial.distance import pdist, squareform
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import matplotlib.patches as mpatches

In [None]:
# Build the Document-Term Matrix (DTM: documents as rows, terms as columns)
vectorizer = CountVectorizer(token_pattern=r'\b\w{2,}\b')
dtm_matrix = vectorizer.fit_transform(df['text_nostop'])

print(f"Document Term Matrix: {dtm_matrix.shape[0]} documents, {dtm_matrix.shape[1]} terms")

# Remove sparse terms: keep the terms present in at least 1% of the documents
# (meant as the equivalent of R's removeSparseTerms with sparse=0.99)
min_docs = int(np.ceil(0.01 * dtm_matrix.shape[0]))
term_counts = np.array((dtm_matrix > 0).sum(axis=0)).flatten()  # number of documents containing each term
mask = term_counts >= min_docs
dtm_rs = dtm_matrix[:, mask]
terms = np.array(vectorizer.get_feature_names_out())[mask]

print(f"Reduced DTM: {dtm_rs.shape[0]} documents, {dtm_rs.shape[1]} terms")

In [None]:
# Pairwise cosine distances between DOCUMENTS (rows of the reduced DTM)
dtm_rs_dense = dtm_rs.toarray()
dst_cs_rs_d = pdist(dtm_rs_dense, metric='cosine')

# Hierarchical clustering with Ward linkage.
# NOTE(review): SciPy documents Ward linkage as correctly defined only for
# Euclidean distances; here it is fed cosine distances — confirm this is intended.
h_cl_cs_rs_d = linkage(dst_cs_rs_d, method='ward')

In [None]:
# Plot the dendrogram of the documents, all branches in black
plt.figure(figsize=(12, 6))
plt.title("Hierarchical Cluster\ndocuments\ncosine - Ward", fontsize=12)
dendrogram(h_cl_cs_rs_d, 
           leaf_font_size=5,
           color_threshold=0,
           above_threshold_color='black')
plt.xlabel("")
plt.xticks(fontsize=6)
plt.yticks(fontsize=8)
plt.tight_layout()
plt.show()

In [None]:
# Silhouette score for every number of clusters from 2 to 20 (document clustering)
lim_clu = range(2, 21)
vSilIn = []

# silhouette_score with metric='precomputed' needs the square distance matrix
dst_cs_rs_d_matrix = squareform(dst_cs_rs_d)

for k in lim_clu:
    # Cut the document tree into at most k flat clusters and score that partition
    clusters = fcluster(h_cl_cs_rs_d, k, criterion='maxclust')
    sil_score = silhouette_score(dst_cs_rs_d_matrix, clusters, metric='precomputed')
    vSilIn.append(sil_score)

# Plot the silhouette score against the number of clusters
df_sil = pd.DataFrame({'n_clust': list(lim_clu), 'silhouette': vSilIn})
plt.figure(figsize=(10, 6))
plt.plot(df_sil['n_clust'], df_sil['silhouette'], 
         color='blue', linewidth=2, marker='o', markersize=6)
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.title('Silhouette score')
plt.xticks(lim_clu)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.patches import Wedge

def comparison_cloud_sectorial(dfclu, max_words=200, words_per_cluster=None,
                               figsize=(14, 14), title="comparison cloud terms by cluster"):
    """
    Draw a sector-shaped comparison cloud, in the spirit of R's comparison.cloud().

    Every column of *dfclu* (a cluster) gets one wedge of a circle. Each word is
    assigned only to the cluster where its value is largest, so no word is
    repeated across wedges; the top ``words_per_cluster`` words of each cluster
    are then laid out inside its wedge with WordCloud.

    Parameters:
        dfclu: DataFrame with words as index, clusters as columns and
            (assumed numeric) weights as values.
        max_words: total word budget; only used to derive ``words_per_cluster``
            when that argument is None.
        words_per_cluster: maximum number of words per wedge; defaults to
            ``max_words // number_of_clusters``.
        figsize: figure size passed to ``plt.subplots``.
        title: axes title.

    Returns:
        (fig, ax): the Matplotlib figure and axes.
    """

    n_clusters = len(dfclu.columns)
    colors_map = plt.cm.Set3(np.linspace(0, 1, n_clusters))

    # If not given, split the word budget evenly between the clusters
    if words_per_cluster is None:
        words_per_cluster = max_words // n_clusters

    # STEP 1: assign every word to the cluster where it has its maximum value
    cluster_words = {clust: {} for clust in dfclu.columns}
    for word, row in dfclu.iterrows():
        max_cluster = row.idxmax()
        max_value = row.loc[max_cluster]
        if max_value > 0:
            cluster_words[max_cluster][word] = max_value

    # STEP 2: keep the top N words of each cluster
    for clust in dfclu.columns:
        ranked = sorted(cluster_words[clust].items(), key=lambda x: x[1], reverse=True)
        cluster_words[clust] = dict(ranked[:words_per_cluster])

    fig, ax = plt.subplots(figsize=figsize, subplot_kw=dict(aspect="equal"))

    # Canvas geometry (pixels)
    size = 1200
    center_x, center_y = size // 2, size // 2
    radius = 550

    # White canvas that the per-cluster clouds are composited onto
    full_image = np.ones((size, size, 3), dtype=np.uint8) * 255

    # Polar coordinates of every pixel: identical for all clusters, so they are
    # computed once instead of once per wedge
    x, y = np.ogrid[:size, :size]
    dx = x - center_x
    dy = y - center_y
    in_circle = np.sqrt(dx**2 + dy**2) <= radius
    angle = np.degrees(np.arctan2(dy, dx)) % 360

    # Angular width of every wedge
    angle_step = 360 / n_clusters

    for idx, clust in enumerate(dfclu.columns):
        frequencies = cluster_words[clust]
        if not frequencies:
            continue

        # start_angle < end_angle always holds (angle_step > 0), so no
        # wrap-around case is needed
        start_angle = idx * angle_step
        end_angle = (idx + 1) * angle_step
        in_sector = (angle >= start_angle) & (angle < end_angle)

        # WordCloud mask: 0 where words may be placed, 255 (white) elsewhere
        outside_wedge = ~(in_circle & in_sector)
        mask = (outside_wedge * 255).astype(np.uint8)

        # Single colour for the whole wedge, converted once to 0-255 integers
        rgb = tuple(int(c * 255) for c in colors_map[idx][:3])

        def color_func_single(word, rgb=rgb, **kwargs):
            # rgb is bound as a default so the colour is fixed per wedge
            return rgb

        wc = WordCloud(
            width=size,
            height=size,
            background_color='white',
            max_words=words_per_cluster,
            relative_scaling=0.4,
            min_font_size=6,
            max_font_size=60,
            prefer_horizontal=0.7,
            margin=5,
            mask=mask,
            color_func=color_func_single,
            mode='RGBA'
        ).generate_from_frequencies(frequencies)

        # Composite: copy only the non-white pixels (the words) onto the canvas
        wc_array = np.array(wc)
        non_white = (wc_array[:,:,0] < 255) | (wc_array[:,:,1] < 255) | (wc_array[:,:,2] < 255)
        full_image[non_white] = wc_array[non_white, :3]

    # Show the composited image
    ax.imshow(full_image)
    ax.set_title(title, fontsize=16, fontweight='bold', pad=15)
    ax.axis('off')

    # Legend: one coloured dot per cluster
    legend_elements = []
    for idx, clust in enumerate(dfclu.columns):
        color = colors_map[idx]
        legend_elements.append(plt.Line2D([0], [0], marker='o', color='w', 
                                         markerfacecolor=color, markersize=10, 
                                         label=clust))

    ax.legend(handles=legend_elements, loc='upper right', fontsize=10, 
             framealpha=0.9, title='Clusters', title_fontsize=12)

    plt.tight_layout()
    return fig, ax


def comparison_cloud_sectorial_balanced(
    dfclu,
    max_words=200,
    figsize=(14, 14),
    title="comparison cloud terms by cluster",
):
    """Draw a sectorial comparison cloud with an equal word budget per cluster.

    The overall ``max_words`` budget is divided evenly (integer division)
    among the columns of ``dfclu``; the drawing itself is delegated to
    ``comparison_cloud_sectorial``.

    Args:
        dfclu: Table with one column per cluster.
        max_words: Total number of words to spread over all clusters.
        figsize: Figure size forwarded to ``comparison_cloud_sectorial``.
        title: Figure title forwarded to ``comparison_cloud_sectorial``.

    Returns:
        Whatever ``comparison_cloud_sectorial`` returns.
    """
    per_cluster_budget = max_words // len(dfclu.columns)
    return comparison_cloud_sectorial(
        dfclu,
        max_words=max_words,
        words_per_cluster=per_cluster_budget,
        figsize=figsize,
        title=title,
    )


# ============================================
# Build the SECTORIAL comparison clouds
# ============================================

# Option 1: balanced sectorial comparison cloud (word budget split evenly
# across the clusters of dfclu).
print("Creazione comparison cloud settoriale bilanciata...")
fig, ax = comparison_cloud_sectorial_balanced(dfclu, max_words=200, figsize=(14, 14))
plt.show()

# Option 2: same balanced layout, fed with the dfclu_tfidf table instead.
print("Creazione comparison cloud settoriale con TF-IDF...")
fig, ax = comparison_cloud_sectorial_balanced(dfclu_tfidf, max_words=200, 
                                             figsize=(14, 14),
                                             title="comparison cloud terms by cluster (TF-IDF)")
plt.show()

# Option 3: sectorial cloud with an explicit number of words per cluster
# (30 here) instead of the even split.
print("Creazione comparison cloud settoriale (30 parole/cluster)...")
fig, ax = comparison_cloud_sectorial(dfclu, max_words=240, words_per_cluster=30,
                                    figsize=(14, 14))
plt.show()

## Tickers Networks

In [None]:
import pandas as pd
import numpy as np

# Load the scraped articles (semicolon-separated CSV).
articles_df = pd.read_csv("output.csv", sep=';')

# One-hot encode the comma-separated 'tickers' column: one row per article,
# one 0/1 column per distinct ticker string.
# NOTE(review): the split is on ',' only; if the CSV stores "AAPL, MSFT" with a
# space after the comma the column names keep that space - confirm the format.
tickers_df = articles_df['tickers'].str.get_dummies(sep=',')
tickers_df.head()

# Ticker labels, in the same order as the one-hot columns.
ticker_names = list(tickers_df.columns)

print(f"‚úÖ Estratti {len(ticker_names)} ticker unici")
print(f"üìã Primi 10 ticker: {ticker_names[:10]}")

In [None]:
# Multiply the transposed one-hot matrix by the matrix itself: entry (i, j) is
# the number of articles in which ticker i and ticker j both appear.
co_occurrence_matrix = tickers_df.T.dot(tickers_df)

# Zero the diagonal (a ticker "co-occurring" with itself is just its mention
# count). The diagonal is cleared on an explicit writable copy and the frame is
# rebuilt: writing through `DataFrame.values` only works when pandas hands back
# a writable view, which is not the case with Copy-on-Write enabled.
import numpy as np
co_values = co_occurrence_matrix.to_numpy(copy=True)
np.fill_diagonal(co_values, 0)
co_occurrence_matrix = pd.DataFrame(
    co_values,
    index=co_occurrence_matrix.index,
    columns=co_occurrence_matrix.columns,
)

co_occurrence_matrix.head()

In [None]:
# Convert the DataFrames into plain NumPy arrays for the analysis functions.
# np.asarray is used instead of `.values` so the cell can be re-run safely:
# once the names are rebound to ndarrays, `.values` would raise AttributeError.
tickers_df = np.asarray(tickers_df)
co_occurrence_matrix = np.asarray(co_occurrence_matrix)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.cluster.hierarchy import linkage, dendrogram
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx

# ============================================
# 1️⃣ ANALISI MATRICE ONE-HOT (tickers_adj_m)
# ============================================

def analyze_onehot(tickers_df, ticker_names):
    """Print summary statistics of the binary article-by-ticker matrix.

    Args:
        tickers_df: 2-D array-like with one row per article and one column per
            ticker (a NumPy array or a DataFrame; it is converted with
            ``np.asarray`` so indexing is always positional).
        ticker_names: Sequence of ticker labels, aligned with the columns.

    Returns:
        tuple: ``(df_top, tickers_per_article)`` where ``df_top`` is a
        DataFrame with the (up to) 15 most mentioned tickers (columns
        ``'Ticker'``, ``'Menzioni'``, ``'% Articoli'``) and
        ``tickers_per_article`` is the row-wise sum of the matrix.

    Raises:
        ValueError: If the input is not two-dimensional.
    """
    matrix = np.asarray(tickers_df)
    if matrix.ndim != 2:
        raise ValueError("tickers_df must be a 2-D article-by-ticker matrix")

    print("=" * 60)
    print("üìä ANALISI MATRICE ONE-HOT")
    print("=" * 60)

    n_articles = matrix.shape[0]
    tickers_per_article = matrix.sum(axis=1)

    # An empty matrix has no meaningful statistics; bail out instead of
    # dividing by zero or calling max() on an empty array further down.
    if matrix.size == 0:
        print("\nMatrice vuota: nessuna statistica disponibile")
        empty_top = pd.DataFrame(columns=['Ticker', 'Menzioni', '% Articoli'])
        return empty_top, tickers_per_article

    # Most frequent tickers: column sums, sorted in descending order.
    freq = matrix.sum(axis=0)
    top_idx = np.argsort(freq)[::-1][:15]
    top_freq = freq[top_idx]

    df_top = pd.DataFrame({
        'Ticker': [ticker_names[i] for i in top_idx],
        'Menzioni': top_freq,
        '% Articoli': (top_freq / n_articles * 100).round(1)
    })

    print("\nüîù TOP 15 TICKER PER FREQUENZA:")
    print(df_top.to_string(index=False))

    # Distribution of the number of tickers mentioned per article.
    single = (tickers_per_article == 1).sum()
    three_plus = (tickers_per_article >= 3).sum()

    print(f"\nüìà DISTRIBUZIONE TICKER PER ARTICOLO:")
    print(f"  Media: {tickers_per_article.mean():.2f}")
    print(f"  Mediana: {np.median(tickers_per_article):.0f}")
    print(f"  Max: {tickers_per_article.max():.0f}")
    print(f"  Articoli con 1 solo ticker: {single} ({single/n_articles*100:.1f}%)")
    print(f"  Articoli con 3+ ticker: {three_plus} ({three_plus/n_articles*100:.1f}%)")

    # Sparsity: share of zero cells in the matrix, as a percentage.
    sparsity = 100 * (1 - np.count_nonzero(matrix) / matrix.size)
    print(f"\nüí≠ SPARSIT√Ä MATRICE: {sparsity:.1f}%")

    return df_top, tickers_per_article


# ============================================
# 2️⃣ ANALISI CO-OCCURRENCE MATRIX
# ============================================

def analyze_cooccurrence(co_matrix, ticker_names, threshold=5):
    """Print the strongest ticker pairs and basic metrics of their network.

    Args:
        co_matrix: Square ticker-by-ticker co-occurrence matrix (array-like;
            it is copied, so the caller's data is never modified).
        ticker_names: Sequence of ticker labels aligned with the matrix axes.
        threshold: Minimum co-occurrence count for a pair to become an edge.

    Returns:
        tuple: ``(G, edges_sorted)`` where ``G`` is a ``networkx.Graph``
        containing every pair at or above ``threshold`` (edge attribute
        ``weight``) and ``edges_sorted`` is the list of the (up to) 20
        heaviest ``(ticker_a, ticker_b, count)`` tuples.
    """
    print("\n" + "=" * 60)
    print("üï∏Ô∏è  ANALISI NETWORK CO-OCCURRENCE")
    print("=" * 60)

    # Work on a private copy with the diagonal (self co-occurrence) cleared.
    co_clean = np.array(co_matrix, copy=True)
    np.fill_diagonal(co_clean, 0)

    # Only the upper triangle is scanned so each unordered pair appears once.
    rows, cols = np.triu_indices_from(co_clean, k=1)
    edges = [(ticker_names[i], ticker_names[j], co_clean[i, j])
             for i, j in zip(rows, cols) if co_clean[i, j] >= threshold]
    edges_sorted = sorted(edges, key=lambda x: x[2], reverse=True)[:20]

    # The header announces 20 rows, so print the whole (already truncated)
    # list rather than only its first 10 entries.
    print(f"\nüîó TOP 20 CO-OCCORRENZE (min {threshold} articoli):")
    for t1, t2, count in edges_sorted:
        print(f"  {t1:6s} ‚Üî {t2:6s}: {count:3.0f} articoli")

    # Network metrics are computed on all edges, not just the printed ones.
    G = nx.Graph()
    G.add_weighted_edges_from(edges)

    if G.number_of_nodes() > 0:
        print(f"\nüåê METRICHE NETWORK:")
        print(f"  Nodi (ticker connessi): {G.number_of_nodes()}")
        print(f"  Archi (relazioni): {G.number_of_edges()}")
        print(f"  Densit√†: {nx.density(G):.3f}")

        # Degree centrality, ten highest values.
        centrality = nx.degree_centrality(G)
        top_central = sorted(centrality.items(), key=lambda x: x[1], reverse=True)[:10]

        print(f"\n‚≠ê TOP 10 TICKER PER CENTRALIT√Ä (pi√π connessi):")
        for ticker, cent in top_central:
            print(f"  {ticker:6s}: {cent:.3f}")

    return G, edges_sorted


# ============================================
# 3️⃣ CLUSTERING & COMMUNITIES
# ============================================

def find_communities(co_matrix, ticker_names, min_cooccur=3):
    """Group tickers with similar co-occurrence profiles by hierarchical clustering.

    Tickers whose total co-occurrence with other tickers is below
    ``min_cooccur`` are dropped; the remaining rows are compared with cosine
    similarity, linked with SciPy's ``linkage`` and cut at distance 0.7.

    Args:
        co_matrix: Square ticker-by-ticker co-occurrence matrix (array-like).
        ticker_names: Sequence of ticker labels aligned with the matrix axes.
        min_cooccur: Minimum off-diagonal co-occurrence total for a ticker to
            be kept.

    Returns:
        tuple: ``(cluster_dict, linkage_matrix, tickers_filtered)``.
        ``cluster_dict`` maps cluster id -> list of tickers. When fewer than
        three tickers survive the filter the function returns
        ``(None, None, None)`` so that callers unpacking three values keep
        working.
    """
    print("\n" + "=" * 60)
    print("üéØ COMMUNITY DETECTION")
    print("=" * 60)

    co_matrix = np.asarray(co_matrix)

    # Keep tickers with at least min_cooccur co-occurrences; the diagonal is
    # subtracted so self co-occurrence never counts.
    mask = (co_matrix.sum(axis=0) - np.diag(co_matrix)) >= min_cooccur
    co_filtered = co_matrix[mask][:, mask]
    tickers_filtered = [name for name, keep in zip(ticker_names, mask) if keep]

    if len(tickers_filtered) < 3:
        print("‚ö†Ô∏è  Troppi pochi ticker per clustering")
        return None, None, None

    # Cosine similarity between the co-occurrence rows of the kept tickers.
    sim_matrix = cosine_similarity(co_filtered)
    np.fill_diagonal(sim_matrix, 0)

    # Condensed (upper-triangle) distance vector for linkage(). Rounding can
    # push a similarity slightly above 1, so distances are clipped at 0.
    upper = np.triu_indices_from(sim_matrix, k=1)
    condensed_dist = np.clip(1 - sim_matrix[upper], 0, None)
    # NOTE(review): SciPy documents the 'ward' method as correctly defined only
    # for Euclidean distances; here it receives cosine distances, so the 0.7
    # cut below is purely empirical. Left unchanged to keep results stable.
    linkage_matrix = linkage(condensed_dist, method='ward')

    # Cut the dendrogram at an empirical distance threshold.
    from scipy.cluster.hierarchy import fcluster
    clusters = fcluster(linkage_matrix, t=0.7, criterion='distance')

    # Collect the tickers of each cluster id.
    cluster_dict = {}
    for ticker, cluster_id in zip(tickers_filtered, clusters):
        cluster_dict.setdefault(cluster_id, []).append(ticker)

    # Only clusters with at least two members are printed, largest first.
    print(f"\nüîç Trovati {len(cluster_dict)} cluster:")
    for cid, members in sorted(cluster_dict.items(), key=lambda x: len(x[1]), reverse=True):
        if len(members) >= 2:
            print(f"\n  Cluster {cid} ({len(members)} ticker):")
            print(f"    {', '.join(members)}")

    return cluster_dict, linkage_matrix, tickers_filtered


# ============================================
# 4️⃣ VISUALIZZAZIONI COMPATTE
# ============================================

def plot_summary(tickers_adj_m, co_matrix, ticker_names, df_top,
                 top_n=15, edge_threshold=8):
    """Show a 2x2 dashboard summarising ticker mentions and co-occurrences.

    Panels: (1) horizontal bars for the first ten rows of ``df_top``,
    (2) histogram of tickers per article, (3) co-occurrence heatmap of the
    ``top_n`` most mentioned tickers, (4) spring-layout graph of the pairs
    co-occurring in at least ``edge_threshold`` articles.

    Args:
        tickers_adj_m: 2-D article-by-ticker matrix (array-like).
        co_matrix: Square ticker-by-ticker co-occurrence matrix (array-like).
        ticker_names: Sequence of ticker labels aligned with the columns.
        df_top: DataFrame with ``'Ticker'`` and ``'Menzioni'`` columns, as
            returned by ``analyze_onehot``.
        top_n: Number of tickers shown in the heatmap (default 15).
        edge_threshold: Minimum co-occurrence count for an edge in the
            network panel (default 8).

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Positional indexing below requires plain arrays.
    tickers_adj_m = np.asarray(tickers_adj_m)
    co_matrix = np.asarray(co_matrix)

    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    fig.suptitle('üìä Ticker Analysis Dashboard', fontsize=16, fontweight='bold')

    # 1. Most mentioned tickers (largest bar on top thanks to invert_yaxis).
    ax = axes[0, 0]
    ax.barh(df_top['Ticker'][:10], df_top['Menzioni'][:10], color='steelblue')
    ax.set_xlabel('Numero di Menzioni')
    ax.set_title('üîù Top 10 Ticker per Frequenza')
    ax.invert_yaxis()
    ax.grid(axis='x', alpha=0.3)

    # 2. Distribution of the number of tickers per article (one bin per count).
    ax = axes[0, 1]
    tickers_per_article = tickers_adj_m.sum(axis=1)
    ax.hist(tickers_per_article, bins=range(0, int(tickers_per_article.max())+2),
            color='coral', edgecolor='black', alpha=0.7)
    ax.set_xlabel('Numero di Ticker per Articolo')
    ax.set_ylabel('Frequenza')
    ax.set_title('üìà Distribuzione Ticker/Articolo')
    ax.grid(axis='y', alpha=0.3)

    # 3. Co-occurrence heatmap restricted to the top_n most mentioned tickers.
    ax = axes[1, 0]
    top_idx = np.argsort(tickers_adj_m.sum(axis=0))[::-1][:top_n]
    top_labels = [ticker_names[i] for i in top_idx]
    co_sub = co_matrix[top_idx][:, top_idx]
    sns.heatmap(co_sub, xticklabels=top_labels, yticklabels=top_labels,
                cmap='YlOrRd', annot=False, fmt='g', ax=ax, cbar_kws={'label': 'Co-occorrenze'})
    ax.set_title(f'üî• Heatmap Co-occurrence (Top {top_n})')
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

    # 4. Network of the strongest pairs (diagonal cleared on a private copy).
    ax = axes[1, 1]
    co_clean = co_matrix.copy()
    np.fill_diagonal(co_clean, 0)
    rows, cols = np.triu_indices_from(co_clean, k=1)
    edges = [(ticker_names[i], ticker_names[j], co_clean[i, j])
             for i, j in zip(rows, cols) if co_clean[i, j] >= edge_threshold]

    G = nx.Graph()
    G.add_weighted_edges_from(edges)

    if len(G.nodes()) > 0:
        pos = nx.spring_layout(G, k=0.5, seed=42)
        weights = [G[u][v]['weight'] for u, v in G.edges()]
        nx.draw_networkx_nodes(G, pos, node_color='lightblue', node_size=500, ax=ax)
        nx.draw_networkx_labels(G, pos, font_size=8, ax=ax)
        # Edge width is proportional to the co-occurrence count.
        nx.draw_networkx_edges(G, pos, width=[w/3 for w in weights], alpha=0.5, ax=ax)
        ax.set_title(f'üï∏Ô∏è  Network Co-occurrence (‚â•{edge_threshold} articoli)')
    else:
        ax.text(0.5, 0.5, 'Nessuna co-occorrenza forte', ha='center', va='center')
        ax.set_title('üï∏Ô∏è  Network Co-occurrence')

    ax.axis('off')

    plt.tight_layout()
    plt.show()


# ============================================
# 🚀 ESECUZIONE COMPLETA
# ============================================

# Prerequisites (built by the previous cells):
# - tickers_df: one-hot matrix (num_articles x num_tickers), already converted
#   to a NumPy array
# - co_occurrence_matrix: matrix (num_tickers x num_tickers)
# - ticker_names: list of ticker labels

# Example usage:

# Run the four analysis steps in order; plot_summary reuses df_top from the
# first step.
df_top, tickers_per_art = analyze_onehot(tickers_df, ticker_names)
G, edges = analyze_cooccurrence(co_occurrence_matrix, ticker_names, threshold=5)
clusters, linkage_m, tickers_filt = find_communities(co_occurrence_matrix, ticker_names, min_cooccur=3)
plot_summary(tickers_df, co_occurrence_matrix, ticker_names, df_top)


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from matplotlib.patches import Polygon
import networkx as nx
from scipy.spatial import ConvexHull
from itertools import cycle

# ============================================
# 🕸️ NETWORK CON COMMUNITY DETECTION
# ============================================

def create_network_with_communities(co_matrix, ticker_names, threshold=5):
    """Build the co-occurrence graph and detect communities with Louvain.

    Args:
        co_matrix: Square ticker-by-ticker co-occurrence matrix (array-like;
            copied before the diagonal is cleared).
        ticker_names: Sequence of ticker labels aligned with the matrix axes.
        threshold: Minimum co-occurrence count for a pair to become an edge.

    Returns:
        tuple: ``(G, communities)`` - the weighted ``networkx.Graph`` and the
        list of node sets returned by ``nx.community.louvain_communities``
        (seeded with 42). ``(None, None)`` when no pair reaches ``threshold``.
    """
    # Private copy with self co-occurrences removed.
    co_clean = np.array(co_matrix, copy=True)
    np.fill_diagonal(co_clean, 0)

    # Upper triangle only: each unordered pair is considered once.
    rows, cols = np.triu_indices_from(co_clean, k=1)
    edges = [(ticker_names[i], ticker_names[j], co_clean[i, j])
             for i, j in zip(rows, cols) if co_clean[i, j] >= threshold]

    G = nx.Graph()
    G.add_weighted_edges_from(edges)

    if len(G.nodes()) == 0:
        print("‚ö†Ô∏è  Nessun nodo nel grafo. Abbassa la threshold.")
        return None, None

    # Community detection (Louvain), seeded for reproducibility.
    communities = nx.community.louvain_communities(G, seed=42)

    # Report the ten largest communities; the printed number is the size rank,
    # not the position in the returned list.
    print(f"\nüéØ COMMUNITY DETECTION:")
    print(f"  Comunit√† trovate: {len(communities)}")
    for idx, comm in enumerate(sorted(communities, key=len, reverse=True)[:10]):
        print(f"  Community {idx+1}: {len(comm)} nodi - {list(comm)[:5]}{'...' if len(comm) > 5 else ''}")

    return G, communities


# ============================================
# 🎨 VISUALIZZAZIONE CON ELLISSI/HULLS (FIXED)
# ============================================

def plot_network_with_communities(G, communities, highlight_top=30, figsize=(18, 14)):
    """Draw the graph with community colours and convex hulls.

    Nodes are coloured by community, sized by degree centrality, and each
    community with at least three positioned nodes gets a slightly enlarged
    convex hull in the same colour. Only the ``highlight_top`` most central
    nodes are labelled.

    Args:
        G: ``networkx.Graph`` with a ``weight`` attribute on every edge.
        communities: Iterable-of-node-sets partition of ``G`` (indexable list).
        highlight_top: Number of most central nodes to label.
        figsize: Matplotlib figure size.

    Returns:
        dict | None: The spring-layout positions (node -> coordinates), or
        ``None`` when the graph is missing or empty.
    """
    if G is None or len(G.nodes()) == 0:
        print("‚ö†Ô∏è  Grafo vuoto, impossibile visualizzare")
        return

    fig, ax = plt.subplots(figsize=figsize)

    # Spring layout, seeded so the picture is reproducible.
    pos = nx.spring_layout(G, k=0.8, iterations=50, seed=42)

    print(f"\nüîç DEBUG INFO:")
    print(f"   Nodi nel grafo G: {G.number_of_nodes()}")
    print(f"   Nodi con posizioni: {len(pos)}")

    # One colour per community, indexed by the community's position in
    # `communities`. Nodes missing from every community fall back to colour 0.
    colors = plt.cm.tab20(np.linspace(0, 1, len(communities)))
    node_to_comm = {node: idx for idx, comm in enumerate(communities) for node in comm}
    node_colors = [colors[node_to_comm.get(node, 0)] for node in G.nodes()]

    # Convex hull per community (a hull needs at least three points).
    for idx, comm in enumerate(communities):
        if len(comm) < 3:
            continue

        points = np.array([pos[node] for node in comm if node in pos])
        if len(points) < 3:
            continue

        try:
            hull = ConvexHull(points)
        except Exception:
            # Degenerate input (e.g. collinear points): skip this hull only.
            continue

        # Scale the hull 20% away from the community centroid for readability.
        hull_points = points[hull.vertices]
        center = points.mean(axis=0)
        expanded = center + 1.2 * (hull_points - center)

        polygon = Polygon(expanded, alpha=0.15, color=colors[idx],
                          ec=colors[idx], linewidth=2, linestyle='--')
        ax.add_patch(polygon)

    # Node size grows with degree centrality.
    centrality = nx.degree_centrality(G)
    node_sizes = [500 + 3000 * centrality.get(node, 0) for node in G.nodes()]

    # Most central nodes that actually have a position.
    top_nodes = sorted(centrality.items(), key=lambda x: x[1], reverse=True)[:highlight_top]
    top_node_names = [name for name, _ in top_nodes if name in pos]

    print(f"   Top nodes selezionati: {len(top_node_names)}")

    # Edge width is scaled relative to the heaviest edge.
    weights = [G[u][v]['weight'] for u, v in G.edges()]
    max_weight = max(weights) if weights else 1
    edge_widths = [1 + 3 * (w / max_weight) for w in weights]

    nx.draw_networkx_edges(G, pos, alpha=0.3, width=edge_widths,
                           edge_color='gray', ax=ax)

    nx.draw_networkx_nodes(G, pos, node_color=node_colors,
                           node_size=node_sizes, alpha=0.8,
                           edgecolors='black', linewidths=1.5, ax=ax)

    labels_dict = {node: node for node in top_node_names}
    if labels_dict:
        nx.draw_networkx_labels(G, pos, labels=labels_dict,
                                font_size=8, font_weight='bold', ax=ax)

    # Legend for the five largest communities. The original community index is
    # carried through the sort so each legend patch uses the same colour as the
    # nodes and hull of that community (indexing by size rank would mismatch).
    ranked = sorted(enumerate(communities), key=lambda item: len(item[1]), reverse=True)[:5]
    legend_elements = []
    for rank, (comm_idx, comm) in enumerate(ranked, start=1):
        sample = list(comm)[:3]
        label = f"Comm {rank} ({len(comm)} nodi): {', '.join(sample)}..."
        legend_elements.append(mpatches.Patch(color=colors[comm_idx], label=label))

    ax.legend(handles=legend_elements, loc='upper left', fontsize=9)

    ax.set_title(f'Network Analysis: {len(communities)} Communities Detection\n(Hulls = aree di community, dimensione nodo = centralit√†)',
                 fontsize=14, fontweight='bold')
    ax.axis('off')
    plt.tight_layout()
    plt.show()

    return pos


# ============================================
# 📊 LAYOUT ALTERNATIVI (FIXED)
# ============================================

def plot_circular_by_community(G, communities, figsize=(14, 14)):
    """Draw the graph with every community laid out on its own concentric ring.

    Communities are ordered from largest to smallest; the largest sits on the
    innermost ring (radius 3) and each following one on a ring 2 units further
    out. Nodes are coloured by ring, sized by degree centrality, and the 15
    most central nodes are labelled.

    Args:
        G: ``networkx.Graph`` to draw.
        communities: Collection of node sets covering the nodes of ``G``.
        figsize: Matplotlib figure size.

    Returns:
        None. Nothing is drawn when ``G`` is ``None`` or has no nodes.
    """
    if G is None or len(G.nodes()) == 0:
        return

    fig, ax = plt.subplots(figsize=figsize)

    # Largest community first -> innermost ring.
    ranked = sorted(communities, key=len, reverse=True)

    ring_gap = 2
    pos = {}
    node_to_comm = {}
    for ring, members in enumerate(ranked):
        ring_radius = 3 + ring * ring_gap
        count = len(members)
        # Spread the members evenly around the ring.
        for slot, node in enumerate(members):
            theta = 2 * np.pi * slot / count
            pos[node] = (ring_radius * np.cos(theta), ring_radius * np.sin(theta))
            node_to_comm[node] = ring

    # One colour per ring; unknown nodes fall back to the first colour.
    palette = plt.cm.tab20(np.linspace(0, 1, len(communities)))
    node_colors = [palette[node_to_comm.get(node, 0)] for node in G.nodes()]

    centrality = nx.degree_centrality(G)
    node_sizes = [300 + 1500 * centrality.get(node, 0) for node in G.nodes()]

    nx.draw_networkx_edges(G, pos, alpha=0.2, width=0.5, edge_color='gray', ax=ax)
    nx.draw_networkx_nodes(
        G,
        pos,
        node_color=node_colors,
        node_size=node_sizes,
        alpha=0.8,
        edgecolors='black',
        linewidths=1,
        ax=ax,
    )

    # Label only the 15 most central nodes that have a position.
    most_central = sorted(centrality.items(), key=lambda kv: kv[1], reverse=True)[:15]
    labels_dict = {
        name: name
        for name, _ in most_central
        if name in pos and name in G.nodes()
    }
    if labels_dict:
        nx.draw_networkx_labels(G, pos, labels=labels_dict, font_size=8, ax=ax)

    ax.set_title('Network Layout Circolare (per Community)', fontsize=14, fontweight='bold')
    ax.axis('off')
    plt.tight_layout()
    plt.show()


# ============================================
# 📈 COMMUNITY STATISTICS
# ============================================

def analyze_communities(G, communities):
    """Print and return per-community statistics plus the modularity score.

    For each community the induced subgraph is inspected for node count, edge
    count, density, mean degree and the node with the highest degree
    centrality inside the community.

    Args:
        G: ``networkx.Graph`` the communities were detected on.
        communities: Sequence of node sets partitioning ``G``.

    Returns:
        pandas.DataFrame: One row per community, sorted by size (descending).
    """
    print("\n" + "="*60)
    print("üìä STATISTICHE COMMUNITY")
    print("="*60)

    def _describe(number, members):
        """Build the statistics row of a single community."""
        sub = G.subgraph(members)
        size = len(members)

        # Density is only computed with at least two nodes.
        density = 0 if size <= 1 else nx.density(sub)

        if size > 0:
            avg_degree = np.mean([deg for _, deg in sub.degree()])
            sub_centrality = nx.degree_centrality(sub)
            # Most central node inside the community (first one on ties).
            if sub_centrality:
                top_node = max(sub_centrality, key=sub_centrality.get)
            else:
                top_node = "N/A"
        else:
            avg_degree = 0
            top_node = "N/A"

        preview = ', '.join(list(members)[:5])
        if size > 5:
            preview += '...'

        return {
            'Community': number,
            'Nodi': size,
            'Archi': sub.number_of_edges(),
            'Densit√†': round(density, 3),
            'Grado Medio': round(avg_degree, 2),
            'Nodo Centrale': top_node,
            'Membri': preview
        }

    comm_stats = [_describe(idx + 1, comm) for idx, comm in enumerate(communities)]
    df_stats = pd.DataFrame(comm_stats).sort_values('Nodi', ascending=False)

    print("\nüîù TOP COMMUNITIES:")
    print(df_stats.head(10).to_string(index=False))

    # Modularity measures the quality of the partition.
    modularity = nx.community.modularity(G, communities)
    print(f"\nüìê MODULARITY SCORE: {modularity:.3f}")
    print("   (>0.3 = buona separazione delle community)")

    return df_stats


# ============================================
# 🚀 ESECUZIONE COMPLETA
# ============================================

def run_advanced_network_analysis(co_occurrence_matrix, ticker_names, threshold=5):
    """Run the whole advanced network pipeline in one call.

    Steps: build the graph and detect communities, compute the community
    statistics, then draw the hull visualisation (a plotting failure is
    reported but does not abort the run).

    Args:
        co_occurrence_matrix: Square ticker-by-ticker co-occurrence matrix.
        ticker_names: Ticker labels aligned with the matrix axes.
        threshold: Minimum co-occurrence count for an edge.

    Returns:
        tuple: ``(G, communities, df_stats)``, or ``(None, None, None)`` when
        no graph could be built.
    """
    print("üöÄ AVVIO ANALISI NETWORK AVANZATA")
    print("="*60)

    # Step 1: graph construction + community detection.
    graph, found_communities = create_network_with_communities(
        co_occurrence_matrix,
        ticker_names,
        threshold=threshold,
    )
    if graph is None:
        print("\n‚ö†Ô∏è  Analisi interrotta: grafo vuoto")
        return None, None, None

    # Step 2: per-community statistics.
    stats_table = analyze_communities(graph, found_communities)

    # Step 3: hull visualisation, labelling the 50 most central nodes.
    print("\nüìä Generazione visualizzazioni...")
    try:
        plot_network_with_communities(graph, found_communities, highlight_top=50)
    except Exception as e:
        print(f"‚ö†Ô∏è  Errore nella visualizzazione con hulls: {e}")

    print("\n‚úÖ ANALISI COMPLETATA")

    return graph, found_communities, stats_table


# ============================================
# 💡 ESEMPIO DI UTILIZZO
# ============================================

if __name__ == "__main__":
    # Prerequisites (built by earlier cells):
    # - co_occurrence_matrix (numpy array)
    # - ticker_names (list of ticker labels)
    
    # Run the full pipeline
    G, communities, df_stats = run_advanced_network_analysis(
        co_occurrence_matrix, 
        ticker_names, 
        threshold=5  # Lower for more nodes, raise for a cleaner network
    )
    
    # Persist the statistics; df_stats is None when the graph was empty
    if df_stats is not None:
        df_stats.to_csv('community_statistics.csv', index=False)
        print("\nüíæ Statistiche salvate in: community_statistics.csv")

# Tweets

In [None]:
import time
import pandas as pd
import networkx as nx
from networkx.algorithms import community as nx_comm
import matplotlib.pyplot as plt
from datetime import datetime
import community as community_louvain
import numpy as np
from scipy import stats
import seaborn as sns
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import re

## Bitcoin

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import NoSuchElementException
from datetime import datetime
import time
import re

def _parse_tweet(tweet_elem, argument):
    """Convert one Nitter ``timeline-item`` element into a tweet record.

    Args:
        tweet_elem: Selenium element of a single timeline item.
        argument: Topic label stored in the record's ``'argument'`` field.

    Returns:
        dict | None: The tweet record, or ``None`` when the text starts with
        ``'RT @'`` (retweets are skipped).

    Raises:
        Exception: Any Selenium lookup error or date-parsing ``ValueError``
            propagates to the caller, which logs it and skips the item.
    """
    username = tweet_elem.find_element(By.CLASS_NAME, "username").text.strip().replace('@', '')
    tweet_text = tweet_elem.find_element(By.CLASS_NAME, "tweet-content").text.strip()

    # Reply target, when the tweet is a reply.
    reply = None
    try:
        reply_elem = tweet_elem.find_element(By.CLASS_NAME, "replying-to")
        reply = reply_elem.text.strip().replace('Replying to @', '').replace('@', '')
    except NoSuchElementException:
        pass

    # Skip retweets.
    if tweet_text.startswith('RT @'):
        return None

    # Engagement metrics: each icon container holds one counter, identified by
    # the icon class it contains (checked in this order, first match wins).
    stats_container = tweet_elem.find_element(By.CLASS_NAME, "tweet-stats")
    counts = {"icon-comment": 0, "icon-retweet": 0, "icon-quote": 0, "icon-heart": 0}
    for container in stats_container.find_elements(By.CLASS_NAME, "icon-container"):
        number_parts = [part for part in container.text.strip().split()
                        if part.replace(',', '').isdigit()]
        num = int(number_parts[0].replace(',', '')) if number_parts else 0
        for icon_class in counts:
            if container.find_elements(By.CLASS_NAME, icon_class):
                counts[icon_class] = num
                break

    # Mentions, excluding the author's own handle.
    mentions = [m for m in re.findall(r'@(\w+)', tweet_text) if m != username]

    # Hashtags plus cashtags ($ followed by a letter, so prices like $100 are
    # ignored), de-duplicated while keeping first-seen order so the output is
    # deterministic.
    hashtags = re.findall(r'#(\w+)', tweet_text)
    cashtags = re.findall(r'\$([A-Za-z][A-Za-z0-9_]*)', tweet_text)
    hashtags = list(dict.fromkeys(hashtags + cashtags))

    # The reply target counts as a mention (first position) if not listed yet.
    if reply and reply not in mentions:
        mentions.insert(0, reply)

    # Timestamp is read from the title attribute of the date link.
    date_elem = tweet_elem.find_element(By.CLASS_NAME, "tweet-date")
    date_str = date_elem.find_element(By.TAG_NAME, "a").get_attribute('title')
    tweet_date = datetime.strptime(date_str, '%b %d, %Y ¬∑ %I:%M %p %Z')

    return {
        'thread_title': None,
        'thread_author': None,
        'thread_score': None,
        'thread_num_comments': None,
        # NOTE(review): the id is author + first 10 characters of the text, so
        # two different tweets by one author sharing that prefix collide.
        'text_id': f"{username}_{tweet_text[:10]}",
        'comment_parent_id': None,
        'text_author': username,
        'text': tweet_text,
        'likes': counts["icon-heart"],
        'text_date': tweet_date,
        'text_num_replies': counts["icon-comment"],
        'retweets': counts["icon-retweet"],
        'comment_parent_author': None,
        'text_mentions': ' '.join(mentions) if mentions else '',
        'text_hashtags': ' '.join(hashtags) if hashtags else '',
        'argument': argument,
        'site':'Nitter'
    }


def scrape_tweets_for_date(since_date, until_date, target_count=2500,
                           query="bitcoin", argument="Bitcoin"):
    """Scrape tweets from the Nitter search page for one date range.

    Args:
        since_date: Start date (format YYYY-MM-DD).
        until_date: End date (format YYYY-MM-DD).
        target_count: Number of tweets to collect before stopping.
        query: Search term; ``" lang:en"`` is appended and the result is
            URL-encoded. The default reproduces the original hard-coded URL.
        argument: Topic label stored in each record's ``'argument'`` field.

    Returns:
        list[dict]: One record per collected tweet.
    """
    from urllib.parse import quote_plus

    # Chrome options (add '--headless' here to run without a visible window).
    chrome_options = Options()
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')

    search = quote_plus(f"{query} lang:en")
    url = f"https://nitter.net/search?f=tweets&q={search}&e-nativeretweets=on&since={since_date}&until={until_date}&near="

    tweets_data = []
    seen_ids = set()  # ids already stored, for O(1) duplicate checks
    page_count = 0

    driver = webdriver.Chrome(options=chrome_options)
    try:
        print(f"\n{'='*60}")
        print(f"Inizio scraping per periodo: {since_date} ‚Üí {until_date}")
        print(f"Target: {target_count} tweet")
        print(f"{'='*60}")

        driver.get(url)

        while len(tweets_data) < target_count:
            print(f"\n--- Pagina {page_count + 1} (Raccolti: {len(tweets_data)}/{target_count}) ---")

            # Fixed pause to let the page finish loading.
            time.sleep(5)

            all_items = driver.find_elements(By.CLASS_NAME, "timeline-item")
            # The "show more" row shares the timeline-item class; drop it.
            tweets = [item for item in all_items
                      if "show-more" not in (item.get_attribute("class") or "")]
            print(f"Trovati {len(tweets)} elementi timeline-item")

            for tweet_elem in tweets:
                if len(tweets_data) >= target_count:
                    break

                try:
                    record = _parse_tweet(tweet_elem, argument)
                except Exception as e:
                    # Best effort: a malformed item must not stop the scrape.
                    print(f"Errore nell'estrazione del tweet: {e}")
                    continue

                if record is None or record['text_id'] in seen_ids:
                    continue

                seen_ids.add(record['text_id'])
                tweets_data.append(record)
                print(f"‚úì @{record['text_author']}: {record['text_date']} (Totale: {len(tweets_data)})")

            if len(tweets_data) >= target_count:
                print(f"\n‚úì Target raggiunto: {len(tweets_data)} tweet")
                break

            # Follow the "Load more" link to the next page of results.
            try:
                load_more_div = driver.find_element(By.CSS_SELECTOR, "div.show-more:not(.timeline-item)")
                load_more_link = load_more_div.find_element(By.TAG_NAME, "a")

                link_text = load_more_link.text.strip()
                if "Load more" in link_text:
                    print(f"\n‚Üí Cliccando '{link_text}'...")
                    driver.execute_script("arguments[0].scrollIntoView();", load_more_link)
                    time.sleep(1)
                    load_more_link.click()
                    page_count += 1
                else:
                    print(f"\n‚úì Fine scraping - trovato '{link_text}' invece di 'Load more'")
                    break

            except NoSuchElementException:
                print(f"\n‚ö† Fine scraping - nessun bottone 'Load more' trovato")
                print(f"‚ö† Raccolti solo {len(tweets_data)}/{target_count} tweet per questo periodo")
                break
            except Exception as e:
                print(f"\n‚ö† Fine scraping - errore: {e}")
                print(f"‚ö† Raccolti solo {len(tweets_data)}/{target_count} tweet per questo periodo")
                break
    finally:
        # Always close the browser, even when navigation or scraping raises.
        driver.quit()

    print(f"\n{'='*60}")
    print(f"‚úì SCRAPING COMPLETATO per {since_date} ‚Üí {until_date}")
    print(f"{'='*60}")
    print(f"Totale tweet raccolti: {len(tweets_data)}")
    print(f"Pagine caricate: {page_count + 1}")

    return tweets_data


# MAIN SCRIPT: scrape every configured date range and report the totals.
if __name__ == "__main__":
    # Date ranges to scrape, as (since, until) pairs in YYYY-MM-DD format
    date_ranges = [
        ("2025-10-17", "2025-10-18"),
        ("2025-10-18", "2025-10-19")
    ]
    
    # Accumulates the tweets of all ranges
    all_tweets = []
    
    # Scrape one range at a time (each call opens and closes its own browser)
    for since, until in date_ranges:
        tweets = scrape_tweets_for_date(since, until, target_count=2500)
        all_tweets.extend(tweets)
        print(f"\n‚Üí Tweet totali raccolti finora: {len(all_tweets)}")
        
        # Pause between two sessions
        if (since, until) != date_ranges[-1]:  # No wait after the last range
            print("\n‚è≥ Pausa di 10 secondi prima del prossimo periodo...")
            time.sleep(10)
    
    # Final report
    print(f"\n{'#'*60}")
    print(f"‚úì‚úì‚úì SCRAPING TOTALE COMPLETATO ‚úì‚úì‚úì")
    print(f"{'#'*60}")
    print(f"Tweet raccolti per periodo:")
    
    # Approximate per-range count: a tweet is attributed to a range when the
    # range's start date appears in the string form of its timestamp.
    for since, until in date_ranges:
        count = sum(1 for t in all_tweets if since in str(t['text_date']))
        print(f"  ‚Ä¢ {since} ‚Üí {until}: ~{count} tweet")
    
    print(f"\nTotale complessivo: {len(all_tweets)} tweet")
    
    # The data can be saved here as preferred,
    # e.g. to a CSV file or a database

In [None]:
# Turn the scraped records (one dict per tweet) into a table and export it to
# an Excel workbook, without the index column.
tweets_Bitcoin = pd.DataFrame(all_tweets)
tweets_Bitcoin.to_excel('tweets_bitcoin.xlsx', index=False)

## Nvidia

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import NoSuchElementException
from datetime import datetime
import time
import re

def scrape_tweets_for_date(since_date, until_date, target_count=2500):
    """
    Scrape English-language "nvidia" tweets from a Nitter search for one date range.

    Opens a Chrome session, loads the Nitter search page for the given
    since/until window and follows the "Load more" link page after page until
    ``target_count`` tweets have been collected or no further page is
    available. The browser is always closed before the function exits, even
    when an unexpected error interrupts the scrape.

    Args:
        since_date: start date (format YYYY-MM-DD)
        until_date: end date (format YYYY-MM-DD)
        target_count: target number of tweets to collect

    Returns:
        List of dictionaries, one per collected tweet
    """
    # Chrome options
    chrome_options = Options()
    # Uncomment the next line to run without a visible browser window
    # chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')

    # Start the browser
    driver = webdriver.Chrome(options=chrome_options)

    # Search URL with the date window filled in
    url = f"https://nitter.net/search?f=tweets&q=nvidia+lang%3Aen&e-nativeretweets=on&since={since_date}&until={until_date}&near="

    tweets_data = []
    # IDs already stored, kept in a set so the duplicate check stays O(1)
    seen_ids = set()
    page_count = 0

    # Everything that uses the browser lives inside try/finally so the Chrome
    # process is released even if loading or parsing raises.
    try:
        print(f"\n{'='*60}")
        print(f"Inizio scraping per periodo: {since_date} ‚Üí {until_date}")
        print(f"Target: {target_count} tweet")
        print(f"{'='*60}")

        # Open the page
        driver.get(url)

        while len(tweets_data) < target_count:
            print(f"\n--- Pagina {page_count + 1} (Raccolti: {len(tweets_data)}/{target_count}) ---")

            # Fixed wait to give the page time to render
            time.sleep(5)

            all_items = driver.find_elements(By.CLASS_NAME, "timeline-item")
            # Drop timeline items that are really "show-more" pagination rows
            tweets = [item for item in all_items if "show-more" not in item.get_attribute("class")]
            print(f"Trovati {len(tweets)} elementi timeline-item")

            for tweet_elem in tweets:
                # Stop as soon as the target has been reached
                if len(tweets_data) >= target_count:
                    print(f"\n‚úì Target raggiunto: {len(tweets_data)} tweet")
                    break

                try:
                    # Extract username
                    username_elem = tweet_elem.find_element(By.CLASS_NAME, "username")
                    username = username_elem.text.strip().replace('@', '')

                    # Extract tweet text
                    tweet_content = tweet_elem.find_element(By.CLASS_NAME, "tweet-content")
                    tweet_text = tweet_content.text.strip()

                    # Extract reply-to username (if present)
                    reply = None
                    try:
                        reply_elem = tweet_elem.find_element(By.CLASS_NAME, "replying-to")
                        reply = reply_elem.text.strip().replace('Replying to @', '').replace('@', '')
                    except NoSuchElementException:
                        pass

                    # Skip retweets
                    if tweet_text.startswith('RT @'):
                        continue

                    # Extract engagement metrics
                    stats_container = tweet_elem.find_element(By.CLASS_NAME, "tweet-stats")
                    icon_containers = stats_container.find_elements(By.CLASS_NAME, "icon-container")

                    # `quotes` is parsed but not stored in the output record
                    comments = retweets = likes = quotes = 0

                    for container in icon_containers:
                        container_text = container.text.strip()
                        # First whitespace-separated token that is a number (commas allowed)
                        number_parts = [part for part in container_text.split() if part.replace(',', '').isdigit()]
                        num = int(number_parts[0].replace(',', '')) if number_parts else 0

                        # The icon inside the container tells which metric the number belongs to
                        if container.find_elements(By.CLASS_NAME, "icon-comment"):
                            comments = num
                        elif container.find_elements(By.CLASS_NAME, "icon-retweet"):
                            retweets = num
                        elif container.find_elements(By.CLASS_NAME, "icon-quote"):
                            quotes = num
                        elif container.find_elements(By.CLASS_NAME, "icon-heart"):
                            likes = num

                    # Extract mentions (excluding the author themself)
                    mentions = re.findall(r'@(\w+)', tweet_text)
                    mentions = [m for m in mentions if m != username]

                    # Extract tags
                    hashtags = re.findall(r'#(\w+)', tweet_text)

                    # Words preceded by $ that start with a letter (so not dollar amounts)
                    cashtags = re.findall(r'\$([A-Za-z][A-Za-z0-9_]*)', tweet_text)

                    # Merge hashtags and cashtags, dropping duplicates
                    hashtags = list(set(hashtags + cashtags))

                    # Put the replied-to user first in the mentions (if present and not already included)
                    if reply and reply not in mentions:
                        mentions.insert(0, reply)

                    # Extract date: parsed from the title attribute of the date link
                    date_elem = tweet_elem.find_element(By.CLASS_NAME, "tweet-date")
                    date_link = date_elem.find_element(By.TAG_NAME, "a")
                    date_str = date_link.get_attribute('title')
                    tweet_date = datetime.strptime(date_str, '%b %d, %Y ¬∑ %I:%M %p %Z')

                    # Avoid duplicates
                    # NOTE(review): the ID is the username plus the first 10 characters
                    # of the text, so two different tweets by the same author that
                    # start identically are treated as one — confirm this is intended.
                    tweet_id = f"{username}_{tweet_text[:10]}"
                    if tweet_id not in seen_ids:
                        seen_ids.add(tweet_id)
                        tweets_data.append({
                            'thread_title': None,
                            'thread_author': None,
                            'thread_score': None,
                            'thread_num_comments': None,
                            'text_id': tweet_id,
                            'comment_parent_id': None,
                            'text_author': username,
                            'text': tweet_text,
                            'likes': likes,
                            'text_date': tweet_date,
                            'text_num_replies': comments,
                            'retweets': retweets,
                            'comment_parent_author': None,
                            'text_mentions': ' '.join(mentions) if mentions else '',
                            'text_hashtags': ' '.join(hashtags) if hashtags else '',
                            'argument': 'Nvidia',
                            'site': 'Nitter'
                        })
                        print(f"‚úì @{username}: {tweet_date} (Totale: {len(tweets_data)})")

                except Exception as e:
                    # Best effort: a tweet that cannot be parsed is reported and skipped
                    print(f"Errore nell'estrazione del tweet: {e}")
                    continue

            # Check whether the target was reached by the tweet loop
            if len(tweets_data) >= target_count:
                print(f"\n‚úì Target raggiunto: {len(tweets_data)} tweet")
                break

            # "Load more" button
            try:
                # Find the div.show-more that is not itself a timeline item
                load_more_div = driver.find_element(By.CSS_SELECTOR, "div.show-more:not(.timeline-item)")
                load_more_link = load_more_div.find_element(By.TAG_NAME, "a")

                link_text = load_more_link.text.strip()
                if "Load more" in link_text:
                    print(f"\n‚Üí Cliccando '{link_text}'...")
                    driver.execute_script("arguments[0].scrollIntoView();", load_more_link)
                    time.sleep(1)
                    load_more_link.click()
                    page_count += 1
                else:
                    print(f"\n‚úì Fine scraping - trovato '{link_text}' invece di 'Load more'")
                    break

            except NoSuchElementException:
                print(f"\n‚ö† Fine scraping - nessun bottone 'Load more' trovato")
                print(f"‚ö† Raccolti solo {len(tweets_data)}/{target_count} tweet per questo periodo")
                break
            except Exception as e:
                print(f"\n‚ö† Fine scraping - errore: {e}")
                print(f"‚ö† Raccolti solo {len(tweets_data)}/{target_count} tweet per questo periodo")
                break

        print(f"\n{'='*60}")
        print(f"‚úì SCRAPING COMPLETATO per {since_date} ‚Üí {until_date}")
        print(f"{'='*60}")
        print(f"Totale tweet raccolti: {len(tweets_data)}")
        print(f"Pagine caricate: {page_count + 1}")
    finally:
        # Close the driver
        driver.quit()

    return tweets_data


# MAIN SCRIPT
if __name__ == "__main__":
    # Date windows to scrape, as (since, until) pairs in YYYY-MM-DD format
    date_ranges = [
        ("2025-10-17", "2025-10-18"),
        ("2025-10-18", "2025-10-19")
    ]

    # Collects the tweets returned for every window
    all_tweets = []

    for since, until in date_ranges:
        tweets = scrape_tweets_for_date(since, until, target_count=2500)
        all_tweets += tweets
        print(f"\n‚Üí Tweet totali raccolti finora: {len(all_tweets)}")

        # Pause between sessions; nothing to wait for after the final window
        if (since, until) == date_ranges[-1]:
            continue
        print("\n‚è≥ Pausa di 10 secondi prima del prossimo periodo...")
        time.sleep(10)

    # Final report
    print("\n" + "#" * 60)
    print("‚úì‚úì‚úì SCRAPING TOTALE COMPLETATO ‚úì‚úì‚úì")
    print("#" * 60)
    print("Tweet raccolti per periodo:")

    for since, until in date_ranges:
        # Approximate count: tweets whose stringified date contains the window's start date
        count = len([t for t in all_tweets if since in str(t['text_date'])])
        print(f"  ‚Ä¢ {since} ‚Üí {until}: ~{count} tweet")

    print(f"\nTotale complessivo: {len(all_tweets)} tweet")

    # The data can be persisted here as preferred,
    # e.g. to a CSV file or a database

In [None]:
# Turn the tweets collected in `all_tweets` into a DataFrame and save it as an
# Excel file (without the index column).
tweets_Nvidia = pd.DataFrame(all_tweets)
tweets_Nvidia.to_excel('tweets_nvidia.xlsx', index=False)

## Tweets df

In [None]:
# Load the two per-topic tweet spreadsheets from disk.
tweets_Bitcoin = pd.read_excel('tweets_bitcoin.xlsx')
tweets_Nvidia = pd.read_excel('tweets_nvidia.xlsx')

In [None]:
# Stack both topic DataFrames into one (with a fresh 0..n-1 index) and save the
# combined table as an Excel file.
tweets_df = pd.concat([tweets_Bitcoin, tweets_Nvidia], ignore_index=True)
tweets_df.to_excel('tweets_df.xlsx', index=False)

## Network

In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import numpy as np
import warnings
# Suppress all warnings from this point on
warnings.filterwarnings('ignore')

# Modern color palette (original) shared by the plotting functions
COLORS = {
    'primary': '#FF6B6B',    # Red/coral
    'secondary': '#4ECDC4',  # Turquoise
    'accent': '#95E1D3',     # Aqua green
    'dark': '#34495e',       # Dark grey
    'light': '#ECF0F1'       # Light grey
}

# Global plot styling: seaborn "whitegrid" theme and a sans-serif font
sns.set_style("whitegrid")
plt.rcParams['font.family'] = 'sans-serif'

# ==========================================
# 1. NETWORK CONSTRUCTION
# ==========================================

def build_directed_weighted_network(df, argument_name):
    """Build the main mention network: directed and weighted.

    Each row of ``df`` contributes one edge ``text_author -> mentioned`` for
    every entry of its ``mentions_clean`` value; the edge ``weight`` counts
    how many times that pair occurred. Self-mentions are ignored.

    Args:
        df: DataFrame with 'text_author' and 'mentions_clean' columns.
        argument_name: label used only in the printed summary.

    Returns:
        The resulting ``nx.DiGraph``.
    """
    G = nx.DiGraph()

    for _, record in df.iterrows():
        author = record['text_author']
        for target in record['mentions_clean']:
            if author == target:
                continue  # skip self-mentions
            if not G.has_edge(author, target):
                G.add_edge(author, target, weight=1)
            else:
                G[author][target]['weight'] += 1

    print(f"\nüîµ DIRECTED WEIGHTED NETWORK - {argument_name}")
    print(f"   Nodi: {G.number_of_nodes()}, Archi: {G.number_of_edges()}")

    return G

def build_reciprocal_network(G_directed, argument_name):
    """Build the reciprocal network: undirected and weighted.

    An undirected edge is created for every pair of nodes connected in both
    directions in ``G_directed``; its weight is the sum of the two directed
    weights.

    Args:
        G_directed: directed graph whose edges carry a 'weight' attribute.
        argument_name: label used only in the printed summary.

    Returns:
        The resulting ``nx.Graph``.
    """
    G_reciprocal = nx.Graph()

    for src, dst, attrs in G_directed.edges(data=True):
        # Keep only pairs that also have the reverse edge
        if not G_directed.has_edge(dst, src):
            continue
        # The pair is visited once per direction; add it only the first time
        if G_reciprocal.has_edge(src, dst):
            continue
        combined = attrs['weight'] + G_directed[dst][src]['weight']
        G_reciprocal.add_edge(src, dst, weight=combined)

    print(f"\nüü¢ RECIPROCAL UNDIRECTED NETWORK - {argument_name}")
    print(f"   Nodi: {G_reciprocal.number_of_nodes()}, Archi: {G_reciprocal.number_of_edges()}")

    return G_reciprocal

def build_backbone_network(G_directed, argument_name, min_weight=3):
    """Build the backbone network: directed and unweighted.

    Keeps only the edges of ``G_directed`` whose weight is at least
    ``min_weight``; the kept edges carry no attributes.

    Args:
        G_directed: directed graph whose edges carry a 'weight' attribute.
        argument_name: label used only in the printed summary.
        min_weight: minimum edge weight required to keep an edge.

    Returns:
        The resulting ``nx.DiGraph``.
    """
    strong_edges = [
        (src, dst)
        for src, dst, attrs in G_directed.edges(data=True)
        if attrs['weight'] >= min_weight
    ]

    G_backbone = nx.DiGraph()
    G_backbone.add_edges_from(strong_edges)

    print(f"\nüî¥ BACKBONE NETWORK - {argument_name} (min weight={min_weight})")
    print(f"   Nodi: {G_backbone.number_of_nodes()}, Archi: {G_backbone.number_of_edges()}")

    return G_backbone

# ==========================================
# 2. NETWORK METRICS & STATISTICS
# ==========================================

def analyze_directed_network(G, argument_name):
    """Analyze the directed network and print its main rankings.

    Prints the top 10 nodes by weighted in-degree, by (unweighted)
    out-degree and by weighted PageRank, plus the top 5 HITS authorities
    and hubs when HITS can be computed.

    Args:
        G: directed graph whose edges carry a 'weight' attribute.
        argument_name: label used only in the printed header.

    Returns:
        dict with keys 'in_degree_weighted', 'pagerank' and 'out_degree',
        each mapping node -> score.
    """
    def top_k(scores, k):
        # Highest-scoring (node, score) pairs first
        return sorted(scores.items(), key=lambda x: x[1], reverse=True)[:k]

    print("\n" + "="*60)
    print(f"üìà DIRECTED NETWORK ANALYSIS - {argument_name}")
    print("="*60)

    in_degree_weighted = dict(G.in_degree(weight='weight'))

    print("\nüéØ TOP 10 MOST MENTIONED:")
    for i, (user, mentions) in enumerate(top_k(in_degree_weighted, 10), 1):
        print(f"   {i}. @{user}: {mentions} menzioni")

    # Unweighted: counts distinct mentioned users, not total mentions
    out_degree = dict(G.out_degree())

    print("\nüí¨ TOP 10 MOST ACTIVE:")
    for i, (user, mentions) in enumerate(top_k(out_degree, 10), 1):
        print(f"   {i}. @{user}: menziona {mentions} utenti")

    pagerank = nx.pagerank(G, weight='weight')

    print("\n‚≠ê TOP 10 PAGERANK:")
    for i, (user, score) in enumerate(top_k(pagerank, 10), 1):
        print(f"   {i}. @{user}: {score:.5f}")

    try:
        # nx.hits returns (hubs, authorities), in that order
        hub_scores, authority_scores = nx.hits(G, max_iter=100)
        authorities = top_k(authority_scores, 5)
        hubs = top_k(hub_scores, 5)

        print("\nüèÜ TOP 5 AUTHORITIES:")
        for i, (user, score) in enumerate(authorities, 1):
            print(f"   {i}. @{user}: {score:.5f}")

        print("\nüîó TOP 5 HUBS:")
        for i, (user, score) in enumerate(hubs, 1):
            print(f"   {i}. @{user}: {score:.5f}")
    except Exception:
        # HITS is best effort; `Exception` (rather than a bare except) lets
        # KeyboardInterrupt and SystemExit propagate
        print("\n‚ö†Ô∏è HITS algorithm non convergente")

    return {
        'in_degree_weighted': in_degree_weighted,
        'pagerank': pagerank,
        'out_degree': out_degree
    }

def analyze_reciprocal_network(G, argument_name):
    """Analyze the reciprocal network and print its main statistics.

    Prints the top 10 nodes by weighted betweenness centrality and the
    average weighted clustering coefficient. When the ``community`` module
    can be imported, also runs Louvain community detection and prints the
    five largest communities.

    Args:
        G: undirected graph whose edges carry a 'weight' attribute.
        argument_name: label used only in the printed header.

    Returns:
        Empty dict when the graph has no edges; otherwise a dict with
        'betweenness' and 'clustering', plus 'communities' when the
        ``community`` module is available.
    """
    print("\n" + "="*60)
    print(f"üìà RECIPROCAL NETWORK ANALYSIS - {argument_name}")
    print("="*60)

    if G.number_of_edges() == 0:
        print("‚ö†Ô∏è Nessuna interazione reciproca trovata!")
        return {}

    betweenness = nx.betweenness_centrality(G, weight='weight')
    bridge_ranking = sorted(betweenness.items(), key=lambda x: x[1], reverse=True)

    print("\nüåâ TOP 10 BRIDGES:")
    for i, (user, score) in enumerate(bridge_ranking[:10], 1):
        print(f"   {i}. @{user}: {score:.5f}")

    clustering = nx.clustering(G, weight='weight')
    avg_clustering = sum(clustering.values()) / len(clustering)

    print(f"\nüîó CLUSTERING COEFFICIENT: {avg_clustering:.4f}")

    result = {
        'betweenness': betweenness,
        'clustering': clustering
    }

    try:
        import community as community_louvain
        communities = community_louvain.best_partition(G, weight='weight')
        n_communities = len(set(communities.values()))

        print(f"\nüë• COMMUNITY DETECTION: {n_communities} community")

        # Five largest communities, each with up to five example members
        for comm_id, size in Counter(communities.values()).most_common(5):
            members = [u for u, c in communities.items() if c == comm_id][:5]
            print(f"   Community {comm_id}: {size} membri (es: {', '.join(members)})")

        result['communities'] = communities
    except ImportError:
        # Community detection is optional: return the other metrics without it
        pass

    return result

def analyze_backbone_network(G, argument_name):
    """Analyze the backbone network and print its structural statistics.

    Prints the number of strongly connected components and the size of the
    largest one, then the k-core decomposition of the undirected version of
    the graph (self-loops removed). When the ``community`` module can be
    imported, also runs Louvain community detection on an undirected copy
    without self-loops.

    Args:
        G: directed backbone graph.
        argument_name: label used only in the printed header.

    Returns:
        Empty dict when the graph has no edges; otherwise a dict with 'scc'
        and 'core_numbers', plus 'communities' when the ``community`` module
        is available.
    """
    print("\n" + "="*60)
    print(f"üìà BACKBONE NETWORK ANALYSIS - {argument_name}")
    print("="*60)

    if G.number_of_edges() == 0:
        print("‚ö†Ô∏è Backbone vuoto - riduci min_weight")
        return {}

    scc = list(nx.strongly_connected_components(G))
    print(f"\nüîÑ STRONGLY CONNECTED COMPONENTS: {len(scc)}")
    largest_scc = max(scc, key=len)
    print(f"   Componente pi√π grande: {len(largest_scc)} nodi")

    # Self-loops are removed before computing core numbers
    G_undirected = G.to_undirected()
    G_undirected.remove_edges_from(nx.selfloop_edges(G_undirected))

    core_numbers = nx.core_number(G_undirected)
    max_core = max(core_numbers.values())

    print(f"\nüíé K-CORE DECOMPOSITION:")
    print(f"   Max core number: {max_core}")
    k_core = [node for node, k in core_numbers.items() if k == max_core]
    print(f"   {max_core}-core: {len(k_core)} nodi")
    # List the members only when the innermost core is small
    if len(k_core) <= 10:
        print(f"   Membri: {', '.join(k_core)}")

    result = {
        'scc': scc,
        'core_numbers': core_numbers
    }

    try:
        import community as community_louvain
        # Undirected copy of the backbone without self-loops
        G_und_simple = nx.Graph()
        G_und_simple.add_edges_from((u, v) for u, v in G.edges() if u != v)

        communities = community_louvain.best_partition(G_und_simple)
        n_communities = len(set(communities.values()))

        print(f"\nüö∂ COMMUNITY DETECTION: {n_communities} community")

        result['communities'] = communities
    except ImportError:
        # Community detection is optional: return the other metrics without it
        pass

    return result

# ==========================================
# 3. VISUALIZATIONS
# ==========================================

def visualize_directed_network(G, metrics, argument_name, top_n=50):
    """Draw the directed network restricted to the top PageRank nodes.

    The ``top_n`` nodes with the highest PageRank are kept; node size is
    proportional to PageRank and edge width to edge weight. The figure is
    saved as '<argument_name lowercased>_directed_network.png' and shown.

    Args:
        G: directed graph whose edges carry a 'weight' attribute.
        metrics: dict containing a 'pagerank' mapping node -> score.
        argument_name: used in the title and the output file name.
        top_n: number of nodes to draw.
    """
    pagerank = metrics['pagerank']
    ranked = sorted(pagerank.items(), key=lambda x: x[1], reverse=True)[:top_n]
    G_sub = G.subgraph([name for name, _ in ranked]).copy()

    plt.figure(figsize=(16, 12))

    pos = nx.spring_layout(G_sub, k=2, iterations=50, seed=42)

    node_sizes = [pagerank.get(node, 0) * 50000 for node in G_sub.nodes()]
    edge_widths = [weight * 0.5 for _, _, weight in G_sub.edges(data='weight')]

    # Slightly curved arrows (small arc radius keeps them close to straight)
    nx.draw_networkx_edges(G_sub, pos, edge_color=COLORS['light'],
                          width=edge_widths, alpha=0.6,
                          arrows=True, arrowsize=10,
                          arrowstyle='-|>', connectionstyle='arc3,rad=0.1')

    nx.draw_networkx_nodes(G_sub, pos, node_size=node_sizes,
                          node_color=COLORS['secondary'], alpha=0.8,
                          edgecolors=COLORS['dark'], linewidths=2)

    nx.draw_networkx_labels(G_sub, pos, font_size=8, font_weight='bold')

    plt.title(f"{argument_name} - DIRECTED NETWORK\nTop {top_n} utenti per PageRank (Node's dimension = influence)",
              fontsize=16, fontweight='bold')
    plt.axis('off')
    plt.tight_layout()

    output_file = f'{argument_name.lower()}_directed_network.png'
    plt.savefig(output_file, dpi=300, bbox_inches='tight', facecolor='white')
    plt.show()
    print(f"\nüíæ Salvato: {output_file}")

def visualize_reciprocal_network(G, metrics, argument_name, top_n=40):
    """Draw the largest connected component of the reciprocal network.

    When ``metrics`` has 'betweenness', only the ``top_n`` nodes by
    betweenness that fall inside the largest component are drawn. When it
    has 'communities', nodes are colored by community id. The figure is
    saved as '<argument_name lowercased>_reciprocal_network.png' and shown.

    Args:
        G: undirected graph whose edges carry a 'weight' attribute.
        metrics: dict that may contain 'betweenness' and 'communities'.
        argument_name: used in the title and the output file name.
        top_n: maximum number of nodes selected by betweenness.
    """
    if G.number_of_edges() == 0:
        print("‚ö†Ô∏è Nessuna interazione reciproca da visualizzare")
        return

    giant_component = max(nx.connected_components(G), key=len)
    G_sub = G.subgraph(giant_component).copy()

    if 'betweenness' in metrics:
        ranked = sorted(metrics['betweenness'].items(), key=lambda x: x[1], reverse=True)[:top_n]
        kept = [name for name, _ in ranked if name in G_sub.nodes()]
        G_sub = G_sub.subgraph(kept).copy()

    plt.figure(figsize=(16, 12))

    pos = nx.spring_layout(G_sub, k=2, iterations=50, seed=42)

    # Color by community id when available, otherwise use a single color
    if 'communities' not in metrics:
        node_colors = COLORS['accent']
        cmap = None
    else:
        community_of = metrics['communities']
        node_colors = [community_of.get(node, 0) for node in G_sub.nodes()]
        cmap = plt.cm.Set3

    edge_widths = [G_sub[u][v]['weight'] * 0.3 for u, v in G_sub.edges()]

    nx.draw_networkx_edges(G_sub, pos, width=edge_widths,
                          alpha=0.5, edge_color=COLORS['light'])

    nx.draw_networkx_nodes(G_sub, pos, node_size=300,
                          node_color=node_colors, cmap=cmap,
                          alpha=0.9, edgecolors=COLORS['dark'], linewidths=1.5)

    nx.draw_networkx_labels(G_sub, pos, font_size=7, font_weight='bold')

    plt.title(f'{argument_name} - RECIPROCAL NETWORK\nBidirectional conversations (Colors = Community)',
              fontsize=16, fontweight='bold')
    plt.axis('off')
    plt.tight_layout()

    output_file = f'{argument_name.lower()}_reciprocal_network.png'
    plt.savefig(output_file, dpi=300, bbox_inches='tight', facecolor='white')
    plt.show()
    print(f"\nüíæ Salvato: {output_file}")

def visualize_statistics(G_dir, metrics_dir, argument_name):
    """Draw a compact 2x2 panel of statistics for the directed network.

    Panels: in/out degree histograms, edge-weight histogram, bar chart of
    the top 15 nodes by PageRank, and a mention heatmap among the top 20
    nodes by PageRank. The figure is saved as
    '<argument_name lowercased>_statistics.png' and shown.

    Args:
        G_dir: directed graph whose edges carry a 'weight' attribute.
        metrics_dir: dict containing a 'pagerank' mapping node -> score.
        argument_name: used in the title and the output file name.
    """
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    fig.patch.set_facecolor('white')

    # 1. Degree distribution
    in_degrees = [d for n, d in G_dir.in_degree()]
    out_degrees = [d for n, d in G_dir.out_degree()]

    axes[0, 0].hist(in_degrees, bins=30, alpha=0.7, label='In-degree',
                   color=COLORS['primary'], edgecolor=COLORS['dark'])
    axes[0, 0].hist(out_degrees, bins=30, alpha=0.7, label='Out-degree',
                   color=COLORS['secondary'], edgecolor=COLORS['dark'])
    axes[0, 0].set_xlabel('Degree', fontweight='bold')
    axes[0, 0].set_ylabel('Frequenza', fontweight='bold')
    axes[0, 0].set_title('Distribuzione Degree', fontweight='bold', fontsize=12)
    axes[0, 0].legend()
    axes[0, 0].set_yscale('log')
    axes[0, 0].grid(alpha=0.3)

    # 2. Weight distribution
    weights = [d['weight'] for u, v, d in G_dir.edges(data=True)]
    axes[0, 1].hist(weights, bins=30, color=COLORS['accent'], alpha=0.8,
                   edgecolor=COLORS['dark'])
    axes[0, 1].set_xlabel('Peso (# menzioni)', fontweight='bold')
    axes[0, 1].set_ylabel('Frequenza', fontweight='bold')
    axes[0, 1].set_title('Distribuzione Pesi Archi', fontweight='bold', fontsize=12)
    axes[0, 1].set_yscale('log')
    axes[0, 1].grid(alpha=0.3)

    # Rank every node once; each panel below takes its own slice so the
    # heatmap really covers 20 users instead of being capped at the bar
    # chart's 15.
    ranked = sorted(metrics_dir['pagerank'].items(), key=lambda x: x[1], reverse=True)

    # 3. Top users bar chart
    top_pr = ranked[:15]
    users = [u for u, _ in top_pr]
    scores = [s for _, s in top_pr]

    # Three color bands: ranks 1-5, 6-10 and 11-15
    colors_gradient = [COLORS['primary'] if i < 5 else COLORS['secondary'] if i < 10
                      else COLORS['accent'] for i in range(len(users))]

    axes[1, 0].barh(users, scores, color=colors_gradient, alpha=0.8, edgecolor=COLORS['dark'])
    axes[1, 0].set_xlabel('PageRank Score', fontweight='bold')
    axes[1, 0].set_title('Top 15 Utenti per PageRank', fontweight='bold', fontsize=12)
    axes[1, 0].invert_yaxis()
    axes[1, 0].grid(axis='x', alpha=0.3)

    # 4. Adjacency heatmap (rows = author, columns = mentioned user)
    top_20 = [u for u, _ in ranked[:20]]
    G_sub = G_dir.subgraph(top_20)
    adj_matrix = nx.to_numpy_array(G_sub, nodelist=top_20, weight='weight')

    sns.heatmap(adj_matrix, xticklabels=top_20, yticklabels=top_20,
                cmap='RdYlGn', ax=axes[1, 1], cbar_kws={'label': 'Menzioni'},
                linewidths=0.5, linecolor=COLORS['light'])
    axes[1, 1].set_title('Heatmap Menzioni (Top 20 utenti)', fontweight='bold', fontsize=12)
    axes[1, 1].set_xlabel('Menzionato', fontweight='bold')
    axes[1, 1].set_ylabel('Autore', fontweight='bold')

    plt.suptitle(f'{argument_name} - Network Statistics', fontsize=16, fontweight='bold', y=0.995)
    plt.tight_layout()
    plt.savefig(f'{argument_name.lower()}_statistics.png', dpi=300, bbox_inches='tight', facecolor='white')
    plt.show()
    print(f"\nüíæ Salvato: {argument_name.lower()}_statistics.png")

# ==========================================
# 4. MAIN EXECUTION FOR A SINGLE ARGUMENT
# ==========================================

def analyze_single_argument(df, argument_name, min_weight_backbone=3, top_n_viz=50):
    """
    Run the full analysis pipeline for ONE argument.

    Builds the directed, reciprocal and backbone networks, analyzes each of
    them, then produces the three visualizations.

    Parameters:
    -----------
    df : DataFrame already filtered for the argument
    argument_name : 'Bitcoin' or 'Nvidia'
    min_weight_backbone : weight threshold for the backbone network
    top_n_viz : number of nodes to visualize

    Returns:
    --------
    dict with 'networks' and 'metrics', each keyed by 'directed',
    'reciprocal' and 'backbone'
    """
    separator = "=" * 70

    print("\n" + separator)
    print(f"üöÄ ANALISI NETWORK: {argument_name.upper()}")
    print(separator)
    print(f"Dataset: {len(df)} tweets")

    # Build networks
    directed = build_directed_weighted_network(df, argument_name)
    reciprocal = build_reciprocal_network(directed, argument_name)
    backbone = build_backbone_network(directed, argument_name, min_weight=min_weight_backbone)

    # Analyze
    directed_metrics = analyze_directed_network(directed, argument_name)
    reciprocal_metrics = analyze_reciprocal_network(reciprocal, argument_name)
    backbone_metrics = analyze_backbone_network(backbone, argument_name)

    # Visualize
    print("\n" + separator)
    print(f"üé® GENERATING VISUALIZATIONS - {argument_name}")
    print(separator)

    visualize_directed_network(directed, directed_metrics, argument_name, top_n=top_n_viz)
    visualize_reciprocal_network(reciprocal, reciprocal_metrics, argument_name, top_n=top_n_viz)
    visualize_statistics(directed, directed_metrics, argument_name)

    print(f"\n‚úÖ ANALISI COMPLETATA: {argument_name}")

    networks = {
        'directed': directed,
        'reciprocal': reciprocal,
        'backbone': backbone
    }
    metrics = {
        'directed': directed_metrics,
        'reciprocal': reciprocal_metrics,
        'backbone': backbone_metrics
    }
    return {'networks': networks, 'metrics': metrics}

# ==========================================
# FULL EXECUTION
# ==========================================

if __name__ == "__main__":

    print("\n" + "üåü"*35)
    print("  TWITTER NETWORK ANALYSIS BY ARGUMENT")
    print("üåü"*35)

    # Load the combined tweets DataFrame
    df = pd.read_excel("tweets_df.xlsx")

    # Prepare mentions: split the space-separated string into a list
    # (missing values become an empty list), then keep only rows with mentions
    df['mentions_clean'] = df['text_mentions'].apply(lambda x: str(x).split() if pd.notna(x) else [])
    df = df[df['mentions_clean'].apply(len) > 0]

    print(f"\nüìä Dataset totale: {len(df)} tweets con mentions")

    # Check that the 'argument' column exists
    if 'argument' not in df.columns:
        print("\n‚ö†Ô∏è  Colonna 'argument' non trovata nel DataFrame!")
        print("    Assicurati che il file contenga la colonna 'argument' con valori 'Bitcoin' e 'Nvidia'")
        # `exit()` is a helper meant for the interactive interpreter; raise
        # SystemExit directly and report failure with a non-zero status.
        raise SystemExit(1)

    # Show the distribution of tweets per argument
    arg_counts = df['argument'].value_counts()
    print(f"\nüìà Distribuzione argument:")
    for arg, count in arg_counts.items():
        print(f"   {arg}: {count} tweets")

    # ==========================================
    # BITCOIN ANALYSIS
    # ==========================================

    print("\n\n" + "üü†"*35)
    print("  BITCOIN ANALYSIS")
    print("üü†"*35)

    df_bitcoin = df[df['argument'] == 'Bitcoin'].copy()

    if len(df_bitcoin) == 0:
        print("‚ö†Ô∏è  Nessun tweet trovato per Bitcoin!")
    else:
        results_bitcoin = analyze_single_argument(
            df=df_bitcoin,
            argument_name='Bitcoin',
            min_weight_backbone=2,
            top_n_viz=5000
        )

    # ==========================================
    # NVIDIA ANALYSIS
    # ==========================================

    print("\n\n" + "üü¢"*35)
    print("  NVIDIA ANALYSIS")
    print("üü¢"*35)

    df_nvidia = df[df['argument'] == 'Nvidia'].copy()

    if len(df_nvidia) == 0:
        print("‚ö†Ô∏è  Nessun tweet trovato per Nvidia!")
    else:
        results_nvidia = analyze_single_argument(
            df=df_nvidia,
            argument_name='Nvidia',
            min_weight_backbone=2,
            top_n_viz=5000
        )

    # ==========================================
    # FINAL SUMMARY
    # ==========================================

    print("\n\n" + "="*70)
    print("üéâ ANALISI COMPLETA TERMINATA!")
    print("="*70)

    print(f"\nüìÅ FILE GENERATI:")
    print(f"\n   üü† Bitcoin:")
    print(f"      - bitcoin_directed_network.png")
    print(f"      - bitcoin_reciprocal_network.png")
    print(f"      - bitcoin_statistics.png")

    print(f"\n   üü¢ Nvidia:")
    print(f"      - nvidia_directed_network.png")
    print(f"      - nvidia_reciprocal_network.png")
    print(f"      - nvidia_statistics.png")

    print("\n" + "="*70)
    print("‚ú® Tutte le analisi completate con successo! ‚ú®")
    print("="*70)

In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from collections import Counter, defaultdict

# Modern color palette (original) used for the plots
COLORS = {
    'primary': '#FF6B6B',    # Red/coral
    'secondary': '#4ECDC4',  # Turquoise
    'accent': '#95E1D3',     # Aqua green
    'dark': '#34495e',       # Dark grey
    'light': '#ECF0F1'       # Light grey
}

# Global plot styling: seaborn "whitegrid" theme and a sans-serif font
sns.set_style("whitegrid")
plt.rcParams['font.family'] = 'sans-serif'

# ==========================================
# COMPARATIVE ANALYSIS BY ARGUMENT
# ==========================================

def assign_user_argument(df):
    """
    Map every user to the argument that prevails in their tweets.

    For each 'text_author' the most frequent 'argument' value is taken
    (the first mode when several values tie); if no mode exists, the
    author's first 'argument' value is used instead.

    Returns a dict user -> argument.
    """
    def prevailing(values):
        modes = values.mode()
        if len(modes) > 0:
            return modes[0]
        return values.iloc[0]

    per_user = df.groupby('text_author')['argument'].agg(prevailing)
    return per_user.to_dict()

def build_cross_argument_network(df):
    """
    Build the mention network annotated with each user's argument.

    Every row of ``df`` adds one edge ``text_author -> mentioned`` per entry
    of its ``mentions_clean`` value (self-mentions are skipped). Repeated
    pairs increase the edge ``weight``; ``source_arg`` and ``target_arg``
    are set when the edge is first created and not updated afterwards.
    Mentioned users that never appear as authors get 'Unknown' as
    ``target_arg``.

    Prints the node/edge counts and the share of intra- vs inter-argument
    edges; the shares are reported as 0.0% when the graph has no edges.

    Returns: G (network), user_arg_map (dict user -> argument)
    """
    G = nx.DiGraph()
    user_arg_map = assign_user_argument(df)

    for _, row in df.iterrows():
        user = row['text_author']
        argument = row['argument']

        for mentioned in row['mentions_clean']:
            if user != mentioned:
                # Argument of the mentioned user (if they authored tweets in the dataset)
                mentioned_arg = user_arg_map.get(mentioned, 'Unknown')

                if G.has_edge(user, mentioned):
                    G[user][mentioned]['weight'] += 1
                else:
                    G.add_edge(user, mentioned, weight=1,
                             source_arg=argument,
                             target_arg=mentioned_arg)

    # Attach the argument to the nodes as an attribute
    nx.set_node_attributes(G, user_arg_map, 'argument')

    total_edges = G.number_of_edges()

    print(f"\nüé® CROSS-ARGUMENT NETWORK")
    print(f"   Nodi: {G.number_of_nodes()}, Archi: {total_edges}")

    # Intra- vs inter-argument statistics (counted per edge, not per mention)
    intra_arg = sum(1 for u, v, d in G.edges(data=True)
                    if d.get('source_arg') == d.get('target_arg'))
    inter_arg = total_edges - intra_arg

    # Guard against an empty graph, which would otherwise divide by zero
    intra_pct = intra_arg / total_edges * 100 if total_edges else 0.0
    inter_pct = inter_arg / total_edges * 100 if total_edges else 0.0

    print(f"   Menzioni intra-argument: {intra_arg} ({intra_pct:.1f}%)")
    print(f"   Menzioni inter-argument: {inter_arg} ({inter_pct:.1f}%)")

    return G, user_arg_map

def analyze_argument_communities(G, user_arg_map):
    """
    Summarise the sub-network induced by the users of each argument.

    For every argument other than 'Unknown' the subgraph of ``G`` restricted
    to that argument's users is measured (node count, edge count, density)
    and its five highest weighted-PageRank users are listed. Results are
    printed and returned as ``arg -> {'n_users', 'n_interactions',
    'density', 'top_users'}``.
    """
    arguments = set(user_arg_map.values()) - {'Unknown'}

    print("\n" + "="*60)
    print("üìä ARGUMENT COMMUNITIES ANALYSIS")
    print("="*60)

    stats = {}

    for arg in arguments:
        members = [
            user
            for user, user_arg in user_arg_map.items()
            if user_arg == arg and user in G.nodes()
        ]
        community = G.subgraph(members).copy()

        # Basic metrics; density is reported as 0 for fewer than two nodes
        n_nodes = community.number_of_nodes()
        n_edges = community.number_of_edges()
        density = 0
        if n_nodes > 1:
            density = nx.density(community)

        # Top influencers: PageRank computed inside the community only
        top_users = []
        if n_edges > 0:
            scores = nx.pagerank(community, weight='weight')
            ranking = sorted(scores.items(), key=lambda item: item[1], reverse=True)
            top_users = ranking[:5]

        stats[arg] = {
            'n_users': n_nodes,
            'n_interactions': n_edges,
            'density': density,
            'top_users': top_users,
        }

        print(f"\nüè∑Ô∏è  ARGUMENT: {arg}")
        print(f"   Utenti: {n_nodes}")
        print(f"   Interazioni: {n_edges}")
        print(f"   Densit√†: {density:.4f}")
        if top_users:
            print(f"   Top influencers:")
            for i, (user, score) in enumerate(top_users, 1):
                print(f"      {i}. @{user} (PR: {score:.5f})")

    return stats

def find_bridge_users(G, user_arg_map):
    """
    Find users that connect different arguments.

    For every node of ``G`` with a known argument, count the successors and
    predecessors whose argument is known and differs from the node's own.
    Each neighbour counts once (edge weights are not used). Users with at
    least one such neighbour are returned as
    ``user -> {'total', 'out', 'in', 'argument'}``; the ten highest totals
    are printed.
    """
    print("\n" + "="*60)
    print("üåâ BRIDGE USERS (Inter-Argument Connectors)")
    print("="*60)

    def _count_foreign(neighbours, own_arg):
        # Neighbours with a known argument different from `own_arg`.
        count = 0
        for other in neighbours:
            other_arg = user_arg_map.get(other, 'Unknown')
            if other_arg == 'Unknown' or other_arg == own_arg:
                continue
            count += 1
        return count

    bridge_scores = {}

    for user in G.nodes():
        user_arg = user_arg_map.get(user, 'Unknown')
        if user_arg == 'Unknown':
            continue

        # Mentions towards / received from other arguments
        cross_mentions = _count_foreign(G.successors(user), user_arg)
        cross_received = _count_foreign(G.predecessors(user), user_arg)
        total_cross = cross_mentions + cross_received

        if total_cross <= 0:
            continue

        bridge_scores[user] = {
            'total': total_cross,
            'out': cross_mentions,
            'in': cross_received,
            'argument': user_arg,
        }

    # Top bridges by total number of cross-argument neighbours
    ranking = sorted(bridge_scores.items(),
                     key=lambda entry: entry[1]['total'], reverse=True)
    top_bridges = ranking[:10]

    print("\nüîù TOP 10 BRIDGE USERS:")
    for i, (user, data) in enumerate(top_bridges, 1):
        print(f"   {i}. @{user} ({data['argument']}): "
              f"{data['total']} interazioni cross-argument "
              f"({data['out']} out, {data['in']} in)")

    return bridge_scores

# ==========================================
# VISUALIZZAZIONI COMPARATIVE
# ==========================================

def visualize_argument_network(G, user_arg_map, top_n=60):
    """
    Draw the top-PageRank users coloured by argument, highlighting the
    cross-argument mentions.

    Parameters
    ----------
    G : weighted directed mention graph (edges carry a 'weight' attribute)
    user_arg_map : dict user -> argument; users missing from it are drawn
        as 'Unknown'
    top_n : number of highest-PageRank users to draw

    The figure is saved to 'network_by_argument.png' and shown. Works with
    any number of arguments: Bitcoin/Nvidia keep their fixed theme colours,
    other arguments receive the theme colours first and seaborn 'tab10'
    colours afterwards.
    """
    # Keep only the top users by (weighted) PageRank
    pagerank = nx.pagerank(G, weight='weight')
    ranked = sorted(pagerank.items(), key=lambda x: x[1], reverse=True)[:top_n]
    top_users = [u for u, _ in ranked]

    G_sub = G.subgraph(top_users).copy()

    # Sorted so that colour assignment and legend order are reproducible;
    # iterating the raw set gives an arbitrary order.
    arguments = sorted(set(user_arg_map.values()) - {'Unknown'}, key=str)

    # Colour setup. The original two-colour list raised IndexError/KeyError
    # as soon as a third argument appeared, so extra colours are generated.
    if 'Bitcoin' in arguments and 'Nvidia' in arguments:
        color_map = {
            'Bitcoin': COLORS['primary'],
            'Nvidia': COLORS['secondary'],
        }
        spare_colors = []
    else:
        color_map = {}
        spare_colors = [COLORS['primary'], COLORS['secondary']]
    uncolored = [arg for arg in arguments if arg not in color_map]
    n_missing = len(uncolored) - len(spare_colors)
    if n_missing > 0:
        spare_colors = spare_colors + list(
            sns.color_palette('tab10', n_missing).as_hex())
    color_map.update(zip(uncolored, spare_colors))
    color_map['Unknown'] = COLORS['light']

    node_colors = [color_map.get(user_arg_map.get(n, 'Unknown'), COLORS['light'])
                   for n in G_sub.nodes()]

    # Split edges into intra- and inter-argument
    intra_edges = []
    inter_edges = []

    for u, v, d in G_sub.edges(data=True):
        if user_arg_map.get(u, 'Unknown') == user_arg_map.get(v, 'Unknown'):
            intra_edges.append((u, v, d['weight']))
        else:
            inter_edges.append((u, v, d['weight']))

    # Plot
    fig, ax = plt.subplots(figsize=(18, 14))
    fig.patch.set_facecolor('white')

    pos = nx.spring_layout(G_sub, k=2, iterations=50, seed=42)

    # Intra-argument edges (light grey); 'arc3,rad=0.1' bends them slightly
    if intra_edges:
        nx.draw_networkx_edges(G_sub, pos,
                              edgelist=[(u, v) for u, v, w in intra_edges],
                              width=[w*0.3 for u, v, w in intra_edges],
                              edge_color=COLORS['light'], alpha=0.4,
                              arrows=True, arrowsize=8,
                              arrowstyle='-|>', connectionstyle='arc3,rad=0.1')

    # Inter-argument edges (highlighted, dashed)
    if inter_edges:
        nx.draw_networkx_edges(G_sub, pos,
                              edgelist=[(u, v) for u, v, w in inter_edges],
                              width=[w*0.5 for u, v, w in inter_edges],
                              edge_color=COLORS['primary'], alpha=0.6,
                              arrows=True, arrowsize=10, style='dashed',
                              arrowstyle='-|>', connectionstyle='arc3,rad=0.1')

    # Nodes, sized by PageRank
    node_sizes = [pagerank.get(n, 0) * 50000 for n in G_sub.nodes()]
    nx.draw_networkx_nodes(G_sub, pos, node_size=node_sizes,
                          node_color=node_colors, alpha=0.9,
                          edgecolors=COLORS['dark'], linewidths=1.5)

    nx.draw_networkx_labels(G_sub, pos, font_size=7, font_weight='bold')

    # Legend: one marker per argument plus the two edge styles
    legend_elements = []
    for arg in arguments:
        legend_elements.append(plt.Line2D([0], [0], marker='o', color='w',
                                         markerfacecolor=color_map[arg], markersize=12,
                                         label=arg, markeredgecolor=COLORS['dark']))
    legend_elements.append(plt.Line2D([0], [0], color=COLORS['light'], linewidth=2,
                                     label='Intra-argument'))
    legend_elements.append(plt.Line2D([0], [0], color=COLORS['primary'], linewidth=2,
                                     linestyle='--', label='Inter-argument'))

    ax.legend(handles=legend_elements, loc='upper left', fontsize=10)

    plt.title(f'NETWORK COMPARISON BY ARGUMENT\n'
              f'Top {top_n} users - Node size = PageRank\n'
              f'Solid edges = same argument, Dashed = cross-argument',
              fontsize=16, fontweight='bold')
    plt.axis('off')
    plt.tight_layout()
    plt.savefig('network_by_argument.png', dpi=300, bbox_inches='tight', facecolor='white')
    plt.show()
    print("\nüíæ Salvato: network_by_argument.png")

def visualize_argument_comparison(G, user_arg_map, stats):
    """
    Four-panel dashboard comparing the arguments.

    Panels: (1) users and interactions per argument, (2) network density,
    (3) weighted mention matrix between arguments, (4) PageRank distribution
    per argument on a log-scaled y axis.

    Parameters
    ----------
    G : weighted directed mention graph (edges carry a 'weight' attribute)
    user_arg_map : dict user -> argument
    stats : dict argument -> {'n_users', 'n_interactions', 'density', ...}
        as returned by ``analyze_argument_communities``

    The figure is saved to 'argument_comparison_dashboard.png' and shown.
    Works with any number of arguments.
    """
    fig = plt.figure(figsize=(18, 12))
    fig.patch.set_facecolor('white')
    gs = fig.add_gridspec(2, 2, hspace=0.3, wspace=0.3)

    # Sorted so that colours, bar order and matrix axes are reproducible;
    # iterating the raw set gives an arbitrary order.
    arguments = sorted(set(user_arg_map.values()) - {'Unknown'}, key=str)

    # Colour setup. The original two-colour list raised IndexError/KeyError
    # as soon as a third argument appeared, so extra colours are generated.
    if 'Bitcoin' in arguments and 'Nvidia' in arguments:
        color_dict = {
            'Bitcoin': COLORS['primary'],
            'Nvidia': COLORS['secondary'],
        }
        spare_colors = []
    else:
        color_dict = {}
        spare_colors = [COLORS['primary'], COLORS['secondary']]
    uncolored = [arg for arg in arguments if arg not in color_dict]
    n_missing = len(uncolored) - len(spare_colors)
    if n_missing > 0:
        spare_colors = spare_colors + list(
            sns.color_palette('tab10', n_missing).as_hex())
    color_dict.update(zip(uncolored, spare_colors))

    # 1. Basic metrics comparison
    ax1 = fig.add_subplot(gs[0, 0])
    metrics = ['n_users', 'n_interactions']
    x = np.arange(len(metrics))
    # 0.35 for up to two arguments (as before); narrower bars beyond that so
    # one metric's group does not spill into the next one.
    width = min(0.35, 0.8 / max(len(arguments), 1))

    for i, arg in enumerate(arguments):
        values = [stats[arg][metric] for metric in metrics]
        ax1.bar(x + i*width, values, width, label=arg,
               color=color_dict[arg], alpha=0.8, edgecolor=COLORS['dark'])

    ax1.set_ylabel('Count', fontweight='bold')
    ax1.set_title('Comparison: Users & Interactions', fontweight='bold', fontsize=12)
    # Tick at the centre of each group of bars (x + width/2 for two arguments)
    ax1.set_xticks(x + width * max(len(arguments) - 1, 0) / 2)
    ax1.set_xticklabels(['Users', 'Interactions'])
    ax1.legend()
    ax1.grid(axis='y', alpha=0.3)

    # 2. Network density comparison
    ax2 = fig.add_subplot(gs[0, 1])
    densities = [stats[arg]['density'] for arg in arguments]
    bars = ax2.bar(arguments, densities,
                   color=[color_dict[arg] for arg in arguments],
                   alpha=0.8, edgecolor=COLORS['dark'])
    ax2.set_ylabel('Network Density', fontweight='bold')
    ax2.set_title('Network Density by Argument', fontweight='bold', fontsize=12)
    ax2.grid(axis='y', alpha=0.3)

    # 3. Interaction matrix (inter vs intra-argument), weighted by mentions
    ax3 = fig.add_subplot(gs[1, 0])

    interaction_matrix = np.zeros((len(arguments), len(arguments)))
    position = {arg: idx for idx, arg in enumerate(arguments)}

    for u, v, d in G.edges(data=True):
        u_arg = user_arg_map.get(u, 'Unknown')
        v_arg = user_arg_map.get(v, 'Unknown')

        # Edges touching an 'Unknown' user are left out of the matrix
        if u_arg in position and v_arg in position:
            interaction_matrix[position[u_arg], position[v_arg]] += d['weight']

    sns.heatmap(interaction_matrix, annot=True, fmt='.0f',
                xticklabels=arguments, yticklabels=arguments,
                cmap='RdYlGn', ax=ax3, cbar_kws={'label': 'Total Mentions'},
                linewidths=0.5, linecolor=COLORS['light'])
    ax3.set_title('Cross-Argument Interaction Matrix', fontweight='bold', fontsize=12)
    ax3.set_xlabel('Mentioned User Argument', fontweight='bold')
    ax3.set_ylabel('Mentioning User Argument', fontweight='bold')

    # 4. PageRank distribution per argument
    ax4 = fig.add_subplot(gs[1, 1])

    pagerank = nx.pagerank(G, weight='weight')

    pr_by_arg = {arg: [] for arg in arguments}
    for user, pr_score in pagerank.items():
        user_arg = user_arg_map.get(user, 'Unknown')
        if user_arg in pr_by_arg:
            pr_by_arg[user_arg].append(pr_score)

    for arg in arguments:
        if pr_by_arg[arg]:
            ax4.hist(pr_by_arg[arg], bins=30, alpha=0.6,
                    label=arg, color=color_dict[arg], edgecolor=COLORS['dark'])

    ax4.set_xlabel('PageRank Score', fontweight='bold')
    ax4.set_ylabel('Frequency', fontweight='bold')
    ax4.set_title('PageRank Distribution by Argument', fontweight='bold', fontsize=12)
    ax4.legend()
    ax4.set_yscale('log')
    ax4.grid(alpha=0.3)

    plt.suptitle('Argument Comparison Dashboard', fontsize=16, fontweight='bold', y=0.995)
    plt.savefig('argument_comparison_dashboard.png', dpi=300, bbox_inches='tight', facecolor='white')
    plt.show()
    print("\nüíæ Salvato: argument_comparison_dashboard.png")

def visualize_side_by_side_networks(G, user_arg_map, top_n=40):
    """
    Draw two per-argument sub-networks next to each other.

    Parameters
    ----------
    G : weighted directed mention graph (edges carry a 'weight' attribute)
    user_arg_map : dict user -> argument
    top_n : number of highest-PageRank users drawn for each argument

    Only the first two arguments (in sorted order) are drawn; a warning is
    printed when the number of arguments is not exactly two. The figure is
    saved to 'side_by_side_networks.png' and shown.
    """
    # Sorted so that the two arguments shown (and their colours) are the same
    # on every run; iterating the raw set gives an arbitrary order.
    arguments = sorted(set(user_arg_map.values()) - {'Unknown'}, key=str)

    if len(arguments) != 2:
        print(f"‚ö†Ô∏è  Questa visualizzazione funziona meglio con 2 argument (trovati: {len(arguments)})")

    fig, axes = plt.subplots(1, 2, figsize=(20, 10))
    fig.patch.set_facecolor('white')

    # Colour setup. The original two-colour list raised IndexError as soon as
    # a third argument appeared, so extra colours are generated.
    if 'Bitcoin' in arguments and 'Nvidia' in arguments:
        color_map = {
            'Bitcoin': COLORS['primary'],
            'Nvidia': COLORS['secondary'],
        }
        spare_colors = []
    else:
        color_map = {}
        spare_colors = [COLORS['primary'], COLORS['secondary']]
    uncolored = [arg for arg in arguments if arg not in color_map]
    n_missing = len(uncolored) - len(spare_colors)
    if n_missing > 0:
        spare_colors = spare_colors + list(
            sns.color_palette('tab10', n_missing).as_hex())
    color_map.update(zip(uncolored, spare_colors))

    shown_arguments = arguments[:2]

    for idx, arg in enumerate(shown_arguments):
        # Subgraph restricted to this argument's users
        users_in_arg = [u for u, a in user_arg_map.items() if a == arg and u in G.nodes()]
        G_sub = G.subgraph(users_in_arg).copy()

        # Top users by PageRank inside the argument's own subgraph
        if G_sub.number_of_edges() > 0:
            pagerank = nx.pagerank(G_sub, weight='weight')
            top_users = sorted(pagerank.items(), key=lambda x: x[1], reverse=True)[:top_n]
            top_users = [u for u, _ in top_users if u in G_sub.nodes()]
            G_viz = G_sub.subgraph(top_users).copy()
        else:
            G_viz = G_sub

        if G_viz.number_of_nodes() == 0:
            axes[idx].text(0.5, 0.5, f'No data for {arg}',
                          ha='center', va='center', fontsize=14)
            axes[idx].axis('off')
            continue

        # Layout
        pos = nx.spring_layout(G_viz, k=2, iterations=50, seed=42)

        # Edges; 'arc3,rad=0.1' bends them slightly
        weights = [G_viz[u][v]['weight'] for u, v in G_viz.edges()]
        nx.draw_networkx_edges(G_viz, pos, width=[w*0.4 for w in weights],
                              edge_color=COLORS['light'], alpha=0.5,
                              arrows=True, arrowsize=8, ax=axes[idx],
                              arrowstyle='-|>', connectionstyle='arc3,rad=0.1')

        # Nodes: PageRank-scaled when the drawn graph has edges, else a
        # fixed size (pagerank is only defined when G_sub had edges).
        if G_viz.number_of_edges() > 0:
            node_sizes = [pagerank.get(n, 0.001) * 30000 for n in G_viz.nodes()]
        else:
            node_sizes = [300]*G_viz.number_of_nodes()

        nx.draw_networkx_nodes(G_viz, pos, node_size=node_sizes,
                              node_color=color_map[arg],
                              alpha=0.8, edgecolors=COLORS['dark'], linewidths=1.5,
                              ax=axes[idx])

        nx.draw_networkx_labels(G_viz, pos, font_size=7,
                               font_weight='bold', ax=axes[idx])

        axes[idx].set_title(f'{arg}\n{G_viz.number_of_nodes()} users, '
                          f'{G_viz.number_of_edges()} interactions',
                          fontsize=14, fontweight='bold')
        axes[idx].axis('off')

    # Hide panels that received no argument instead of leaving empty frames
    for unused_ax in axes[len(shown_arguments):]:
        unused_ax.axis('off')

    plt.suptitle('SIDE-BY-SIDE NETWORK COMPARISON', fontsize=16, fontweight='bold', y=0.98)
    plt.tight_layout()
    plt.savefig('side_by_side_networks.png', dpi=300, bbox_inches='tight', facecolor='white')
    plt.show()
    print("\nüíæ Salvato: side_by_side_networks.png")

# ==========================================
# MAIN FUNCTION PER ANALISI ARGUMENT
# ==========================================

def analyze_by_argument(df, top_n_viz=60, top_n_side_by_side=40):
    """
    Full pipeline for the comparative analysis by argument.

    Builds the cross-argument mention network, analyses each argument's
    community, finds the bridge users and produces the three comparative
    figures.

    Parameters:
    -----------
    df : DataFrame with 'text_author', 'mentions_clean', 'argument'
    top_n_viz : number of nodes drawn in the combined network figure
    top_n_side_by_side : number of nodes drawn per argument in the
        side-by-side figure (previously fixed at 40, still the default)

    Returns:
    --------
    dict with keys 'network', 'user_arg_map', 'stats', 'bridges'
    """
    print("\n" + "="*60)
    print("üé® ARGUMENT-BASED NETWORK ANALYSIS")
    print("="*60)

    # Build cross-argument network
    G, user_arg_map = build_cross_argument_network(df)

    # Analyze communities
    stats = analyze_argument_communities(G, user_arg_map)

    # Find bridges
    bridges = find_bridge_users(G, user_arg_map)

    # Visualizations
    print("\n" + "="*60)
    print("üìä GENERATING COMPARATIVE VISUALIZATIONS")
    print("="*60)

    visualize_argument_network(G, user_arg_map, top_n=top_n_viz)
    visualize_argument_comparison(G, user_arg_map, stats)
    visualize_side_by_side_networks(G, user_arg_map, top_n=top_n_side_by_side)

    print("\n‚úÖ ANALISI ARGUMENT COMPLETATA!")

    return {
        'network': G,
        'user_arg_map': user_arg_map,
        'stats': stats,
        'bridges': bridges
    }

# ==========================================
# ESEMPIO DI UTILIZZO
# ==========================================

if __name__ == "__main__":
    # Script entry point: load the tweets, keep the rows that mention
    # someone, and run the comparative analysis by argument.

    print("\n" + "üåü"*35)
    print("  ARGUMENT COMPARISON - NETWORK ANALYSIS")
    print("üåü"*35)

    # Load data
    df = pd.read_excel("tweets_df.xlsx")

    # Prepare mentions: split the cell on whitespace, empty list when missing.
    # NOTE(review): assumes 'text_mentions' is whitespace-separated; the
    # Reddit scraper in this file writes comma-separated mentions - confirm
    # the format of tweets_df.xlsx.
    df['mentions_clean'] = df['text_mentions'].apply(
        lambda x: str(x).split() if pd.notna(x) else []
    )

    # Keep only tweets with at least one mention
    df = df[df['mentions_clean'].apply(len) > 0]

    print(f"\nüìä Dataset: {len(df)} tweets con mentions")

    # Check that the 'argument' column exists
    if 'argument' not in df.columns:
        print("\n‚ö†Ô∏è  Colonna 'argument' non trovata!")
        # exit() is the interactive helper injected by `site`; raise
        # SystemExit directly and report the failure with a non-zero status.
        raise SystemExit(1)

    # Show the distribution of tweets per argument
    arg_counts = df['argument'].value_counts()
    print(f"\nüìà Distribuzione argument:")
    for arg, count in arg_counts.items():
        print(f"   {arg}: {count} tweets")

    # Run the comparative analysis
    results = analyze_by_argument(df, top_n_viz=60)

    print("\n" + "="*70)
    print("üéâ ANALISI COMPLETA TERMINATA!")
    print("="*70)

    print(f"\nüìÅ FILE GENERATI:")
    print(f"   - network_by_argument.png")
    print(f"   - argument_comparison_dashboard.png")
    print(f"   - side_by_side_networks.png")

    print("\n" + "="*70)
    print("‚ú® Analisi completata con successo! ‚ú®")
    print("="*70)

# Threads

In [None]:
import praw
import pandas as pd
from datetime import datetime, timezone
import time
import re
import csv 

## Bitcoin

In [None]:
# ===== ENTER YOUR CREDENTIALS HERE =====
# SECURITY NOTE(review): these Reddit API credentials are hard-coded in the
# file. Prefer setting REDDIT_CLIENT_ID / REDDIT_CLIENT_SECRET /
# REDDIT_USER_AGENT in the environment; the literals below are kept only as
# fallbacks so the cell still runs, and should be rotated and removed.
import os

CLIENT_ID = os.environ.get("REDDIT_CLIENT_ID", "eDLBDOYp5Dg7AZFLdCoG1Q")
CLIENT_SECRET = os.environ.get("REDDIT_CLIENT_SECRET", "EhcSQzvvKqkdQ2OTTLaRn-fUyndc0w")
USER_AGENT = os.environ.get("REDDIT_USER_AGENT", "python:bitcoin_scraper:v1.0 (by /u/ActKey2978)")

In [None]:
import praw
import pandas as pd
import re
from datetime import datetime, timezone
import time
from langdetect import detect, LangDetectException

# Desired date window (UTC): 17 to 20 October inclusive.
# end_date is exclusive, hence midnight of 21 October.
start_date = datetime(2025, 10, 17, 0, 0, 0, tzinfo=timezone.utc)
end_date   = datetime(2025, 10, 21, 0, 0, 0, tzinfo=timezone.utc)

# ========== AUTHENTICATION ==========
print("üîê Autenticazione in corso...")
# No username/password is passed: only client id, secret and user agent.
reddit = praw.Reddit(
    client_id=CLIENT_ID,
    client_secret=CLIENT_SECRET,
    user_agent=USER_AGENT
)
# NOTE(review): this message is printed unconditionally; nothing here checks
# that the credentials were actually accepted.
print("‚úì Autenticazione riuscita! (solo lettura)\n")

# ========== PARAMETERS ==========
# '+'-joined subreddit names, passed as one string to reddit.subreddit()
subreddits = "CryptoCurrency+Bitcoin+btc+CryptoMarkets+investing+wallstreetbets"
# A thread is kept only if its title or body contains one of these (lowercase)
keywords_filter = ["bitcoin", "btc"]
output_filename = "reddit_comments_bitcoin_oct17_20_2025.xlsx"
comments_data = []          # one dict per collected comment
processed_threads = set()   # submission ids already seen (duplicate guard)
TARGET_COMMENTS = 15000  # Target number of comments

# Language-filter statistics
stats = {
    "threads_checked": 0,
    "threads_non_english": 0,
    "comments_checked": 0,
    "comments_non_english": 0
}

# ========== FUNZIONI DI UTILIT√Ä ==========
def is_english(text):
    """
    Tell whether `text` is detected as English.

    Returns True when langdetect reports 'en'. Texts with fewer than 10
    non-blank characters - before or after URLs and u/ or @ mentions are
    removed - are accepted without running detection, and so is any text on
    which detection raises LangDetectException.
    """
    min_chars = 10

    # Too short to detect reliably: accept as-is
    if len(text.strip()) < min_chars:
        return True

    try:
        # Strip URLs and mentions to improve detection
        cleaned = text
        for noise_pattern in (r'https?://\S+', r'u/\w+', r'@\w+'):
            cleaned = re.sub(noise_pattern, '', cleaned)

        if len(cleaned.strip()) < min_chars:
            return True

        return detect(cleaned) == 'en'
    except LangDetectException:
        # Detection failed: accept the text
        return True

def extract_mentions(text):
    """
    Extract user mentions (`u/username`, `/u/username` or `@username`).

    Returns the unique user names in order of first appearance, `u/` mentions
    first and `@` mentions after them.

    A mention marker must not be glued to a preceding word character, so
    fragments such as "menu/item" or the domain of "name@example.com" are no
    longer reported as users. Hyphens are accepted in `u/` names.
    """
    mentions = []
    # 'u' must start a token: "thank you/all" or ".../menu/item" do not count
    mentions.extend(re.findall(r'(?<!\w)u/([\w-]+)', text))
    # '@' preceded by a word character is an e-mail address, not a mention
    mentions.extend(re.findall(r'(?<!\w)@(\w+)', text))
    # dict.fromkeys de-duplicates while keeping a stable order
    return list(dict.fromkeys(mentions))

def extract_hashtags(text):
    """Return every `#word` token of the text, in order, duplicates kept."""
    return [found.group(0) for found in re.finditer(r"#\w+", text)]

def get_parent_author(comment):
    """
    Return the author name of the comment's parent (for network analysis).

    A parent id starting with "t1_" is looked up as a comment and one
    starting with "t3_" as a submission, both through the module-level
    `reddit` client. Returns "[deleted]" when the parent has no author and
    None when the id has another prefix or the lookup fails.
    """
    try:
        if comment.parent_id.startswith("t1_"):
            parent = reddit.comment(comment.parent_id.split("_")[1])
            return parent.author.name if parent.author else "[deleted]"
        elif comment.parent_id.startswith("t3_"):
            parent = reddit.submission(comment.parent_id.split("_")[1])
            return parent.author.name if parent.author else "[deleted]"
    except Exception:
        # Best effort: a failed lookup means "unknown parent". Unlike a bare
        # `except:`, this lets KeyboardInterrupt/SystemExit stop the scraper.
        return None
    return None

# ========== THREAD SEARCH ==========
print("üîé Cerco thread su Bitcoin tra 17 e 20 ottobre 2025...")
print(f"üìÖ Range: {start_date} - {end_date}")
print(f"üéØ Subreddit: {subreddits}")
print(f"üîç Keywords filter: {keywords_filter}")
print(f"üåç Filtro lingua: SOLO INGLESE")
print(f"üéØ Obiettivo: {TARGET_COMMENTS} commenti\n")


# The two strategies below differ only in how submissions are listed; the
# per-thread work is shared through these helpers.

def _created_at(item):
    """Return `item.created_utc` as an aware UTC datetime, or None on failure."""
    try:
        return datetime.fromtimestamp(item.created_utc, tz=timezone.utc)
    except Exception:
        # Best effort: callers skip items whose timestamp cannot be read.
        return None


def _target_reached():
    """True once TARGET_COMMENTS comments have been collected."""
    return len(comments_data) >= TARGET_COMMENTS


def _thread_passes_filters(submission):
    """Apply the keyword, duplicate, comment-count and language filters."""
    # Keep only threads that mention Bitcoin/BTC
    title_and_text = (submission.title + " " + (submission.selftext or "")).lower()
    if not any(keyword in title_and_text for keyword in keywords_filter):
        return False

    # Avoid duplicates
    if submission.id in processed_threads:
        return False
    processed_threads.add(submission.id)

    # Skip threads without comments
    if submission.num_comments == 0:
        return False

    # Thread language filter
    stats["threads_checked"] += 1
    thread_text_to_check = submission.title + " " + (submission.selftext or "")
    if not is_english(thread_text_to_check):
        stats["threads_non_english"] += 1
        print(f"‚è≠Ô∏è  Thread NON inglese saltato: {submission.title[:50]}...")
        return False

    return True


def _collect_thread_comments(submission):
    """Append the thread's in-range English comments to `comments_data`.

    Returns the number of comments appended. Stops early once the global
    target is reached.
    """
    submission.comments.replace_more(limit=15)

    comments_count = 0
    for comment in submission.comments.list():
        if not comment.body or comment.body in ["[deleted]", "[removed]"]:
            continue

        # Skip comments whose date is unreadable or outside the window
        comment_created = _created_at(comment)
        if comment_created is None:
            continue
        if not (start_date <= comment_created < end_date):
            continue

        # Comment language filter
        stats["comments_checked"] += 1
        if not is_english(comment.body):
            stats["comments_non_english"] += 1
            continue

        author_name = comment.author.name if comment.author else "[deleted]"

        # Hashtags and mentions
        hashtags = extract_hashtags(comment.body)
        mentions = extract_mentions(comment.body)

        # Parent author (for network analysis)
        parent_author = get_parent_author(comment)

        # Number of direct replies
        num_replies = len(comment.replies) if comment.replies else 0

        comment_info = {
            # Thread info
            "thread_title": submission.title,
            "thread_author": submission.author.name if submission.author else "[deleted]",
            "thread_score": submission.score,
            "thread_num_comments": submission.num_comments,

            # Comment info
            "text_id": comment.id,
            "comment_parent_id": comment.parent_id,
            "text_author": author_name,
            "text": comment.body,
            "likes": comment.score,
            "text_date": comment_created,
            "text_num_replies": num_replies,
            "retweets": None,

            # Data for network analysis
            "comment_parent_author": parent_author,
            "text_mentions": ", ".join(mentions),
            "text_hashtags": ", ".join(hashtags),

            # Argument
            "argument": "Bitcoin",
            "site": "Reddit",
        }

        comments_data.append(comment_info)
        comments_count += 1

        # Target check
        if _target_reached():
            print(f"  üéØ Obiettivo raggiunto!")
            break

    return comments_count


def _process_submission(submission, created_time):
    """Filter one submission and, if it qualifies, download its comments."""
    # Keep only posts created inside the date window
    if not (start_date <= created_time < end_date):
        return

    if not _thread_passes_filters(submission):
        return

    print(f"üß© Thread: {submission.title[:60]}... ({submission.subreddit.display_name}) - {submission.num_comments} commenti")

    # Download the comments; an error in one thread must not stop the run
    try:
        comments_count = _collect_thread_comments(submission)
        print(f"  üí¨ Commenti inglesi raccolti: {comments_count}")
    except Exception as e:
        print(f"  ‚ö†Ô∏è Errore nel thread: {e}")

    # Pause between processed threads
    time.sleep(1.2)


# Strategy 1: keyword search
print("üîÑ Strategia 1: Ricerca per keyword...")
for submission in reddit.subreddit(subreddits).search(
    "bitcoin OR btc",
    sort="new",
    time_filter="month",
    limit=None
):
    # Target check
    if _target_reached():
        print(f"\n‚úÖ Obiettivo raggiunto! {len(comments_data)} commenti raccolti")
        break

    created_time = _created_at(submission)
    if created_time is None:
        continue

    _process_submission(submission, created_time)

# Strategy 2: if the target has not been reached, scan the recent posts
if len(comments_data) < TARGET_COMMENTS:
    print(f"\nüîÑ Strategia 2: Scansione post recenti... (raccolti: {len(comments_data)}/{TARGET_COMMENTS})")

    for submission in reddit.subreddit(subreddits).new(limit=None):
        # Target check
        if _target_reached():
            print(f"\n‚úÖ Obiettivo raggiunto! {len(comments_data)} commenti raccolti")
            break

        created_time = _created_at(submission)
        if created_time is None:
            continue

        # The listing has gone back past the start of the window: stop
        if created_time < start_date:
            break

        _process_submission(submission, created_time)

# ========== SAVE ==========
# Turn the collected comments into a DataFrame, write it to Excel (falling
# back to a timestamped file or CSV) and print summary statistics.
print(f"\nüì¶ Totale commenti raccolti: {len(comments_data)}")

if comments_data:
    df = pd.DataFrame(comments_data)

    # Drop duplicate comments (same comment id)
    initial_count = len(df)
    df = df.drop_duplicates(subset=['text_id'])
    if len(df) < initial_count:
        print(f"üßπ Rimossi {initial_count - len(df)} commenti duplicati")

    # Sort by date
    df = df.sort_values('text_date')

    # Strip the timezone before writing to Excel
    df['text_date'] = pd.to_datetime(df['text_date']).dt.tz_localize(None)

    # Try to save to Excel
    try:
        df.to_excel(output_filename, index=False, engine='openpyxl')
        print(f"‚úÖ Salvato in Excel: {output_filename}")
    except PermissionError:
        # Target not writable (the message below assumes the file is open
        # elsewhere): write to a timestamped name instead
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        backup_filename = output_filename.replace('.xlsx', f'_{timestamp}.xlsx')
        df.to_excel(backup_filename, index=False, engine='openpyxl')
        print(f"‚ö†Ô∏è  File aperto! Salvato come: {backup_filename}")
    except Exception as e:
        # Any other Excel failure: fall back to CSV
        print(f"‚ö†Ô∏è  Errore Excel: {e}")
        csv_filename = output_filename.replace('.xlsx', '.csv')
        df.to_csv(csv_filename, index=False)
        print(f"‚úÖ Salvato in CSV: {csv_filename}")

    # Language-filter statistics
    print(f"\nüåç Statistiche Filtro Lingua:")
    print(f"   Thread controllati: {stats['threads_checked']}")
    print(f"   Thread NON inglesi saltati: {stats['threads_non_english']}")
    print(f"   Commenti controllati: {stats['comments_checked']}")
    print(f"   Commenti NON inglesi saltati: {stats['comments_non_english']}")

    # Final statistics
    # NOTE(review): processed_threads also contains threads that were later
    # skipped (no comments / non-English), so this count can exceed the
    # number of threads that contributed comments.
    print(f"\nüìä Statistiche Finali:")
    print(f"   Thread processati: {len(processed_threads)}")
    print(f"   Autori unici: {df['text_author'].nunique()}")
    print(f"   Relazioni parent-child: {df['comment_parent_author'].notna().sum()}")

    # Distribution per day (column added after saving, so not in the file)
    df['comment_date'] = pd.to_datetime(df['text_date']).dt.date
    print(f"\nüìÖ Distribuzione per giorno:")
    print(df['comment_date'].value_counts().sort_index())

    # Distribution per hour (only the first 10 hours with data are printed)
    print(f"\nüïê Distribuzione per ora:")
    df['hour'] = pd.to_datetime(df['text_date']).dt.hour
    print(df['hour'].value_counts().sort_index().head(10))

    # Network-analysis statistics
    print(f"\nüï∏Ô∏è Metriche Network Analysis:")
    print(f"   Nodi (utenti): {df['text_author'].nunique()}")
    print(f"   Edges potenziali (risposte): {df['comment_parent_author'].notna().sum()}")

    # Top 5 most commented threads (by collected comments)
    print(f"\nüî• Top 5 Thread pi√π commentati:")
    top_threads = df.groupby('thread_title').size().sort_values(ascending=False).head()
    for title, count in top_threads.items():
        print(f"   {count:3d} commenti - {title[:60]}...")

else:
    print("‚ö†Ô∏è Nessun commento trovato nel periodo indicato.")

## Nvidia

In [None]:
# ===== ENTER YOUR CREDENTIALS HERE =====
# SECURITY NOTE(review): these Reddit API credentials are hard-coded in the
# file. Prefer setting REDDIT_CLIENT_ID / REDDIT_CLIENT_SECRET /
# REDDIT_USER_AGENT in the environment; the literals below are kept only as
# fallbacks so the cell still runs, and should be rotated and removed.
import os

CLIENT_ID = os.environ.get("REDDIT_CLIENT_ID", "eDLBDOYp5Dg7AZFLdCoG1Q")
CLIENT_SECRET = os.environ.get("REDDIT_CLIENT_SECRET", "EhcSQzvvKqkdQ2OTTLaRn-fUyndc0w")
USER_AGENT = os.environ.get("REDDIT_USER_AGENT", "python:bitcoin_scraper:v1.0 (by /u/ActKey2978)")

In [None]:
import praw
import pandas as pd
import re
from datetime import datetime, timezone
import time
from langdetect import detect, LangDetectException


# Lower bound of the collection window: 17 October 2025, 00:00 UTC.
# There is deliberately no upper bound.
start_date = datetime(year=2025, month=10, day=17, tzinfo=timezone.utc)


# ========== AUTHENTICATION ==========
print("üîê Autenticazione in corso...")
# No username/password is passed, so the client is created for read-only use.
# NOTE(review): the success message below is printed without any explicit
# credential check.
reddit = praw.Reddit(client_id=CLIENT_ID, client_secret=CLIENT_SECRET, user_agent=USER_AGENT)
print("‚úì Autenticazione riuscita! (solo lettura)\n")


# ========== PARAMETERS ==========
# Subreddits are queried together as one "+"-joined multi-subreddit name.
subreddits = "+".join((
    "stocks",
    "investing",
    "wallstreetbets",
    "StockMarket",
    "technology",
    "hardware",
    "nvidia",
    "pcmasterrace",
))
# A thread is kept only if its title or body contains one of these substrings.
keywords_filter = ["nvidia", "nvda"]
output_filename = "reddit_comments_nvidia_from_oct17_ALL.xlsx"
TARGET_COMMENTS = 15000  # stop collecting once this many comments are stored

# Accumulators filled by the collection loops.
comments_data = list()
processed_threads = set()


# Language-filter counters, updated in place while scraping.
stats = {
    "threads_checked": 0,
    "threads_non_english": 0,
    "comments_checked": 0,
    "comments_non_english": 0,
    "comments_whitelisted": 0
}


# ========== WHITELIST OF NVIDIA / TECH / FINANCE TERMS ==========
# Text containing any of these terms (as a whole word or phrase) is accepted
# by is_english() without running language detection.
NVIDIA_TECH_TERMS = {
    # NVIDIA specific
    'nvidia', 'nvda', 'jensen', 'huang', 'geforce', 'rtx', 'gtx', 'quadro',
    'tesla', 'cuda', 'cudnn', 'tensor', 'tensorrt', 'dlss', 'ray tracing',
    'ampere', 'ada', 'hopper', 'blackwell', 'grace', 'lovelace',
    '4090', '4080', '4070', '4060', '3090', '3080', '3070', '3060',
    'a100', 'h100', 'b100', 'l40', 'a40', 'dgx', 'hgx',

    # GPU / gaming
    'gpu', 'graphics card', 'vram', 'memory', 'bandwidth', 'cores',
    'fps', 'framerate', 'gaming', 'gamer', 'overclock', 'oc', 'tdp',
    'bottleneck', 'benchmark', 'rasterization', 'shaders',
    'vr', 'virtual reality', '4k', '1440p', '1080p', '8k',

    # AI / ML / datacenter
    'ai', 'artificial intelligence', 'ml', 'machine learning', 'deep learning',
    'llm', 'large language model', 'transformer', 'neural network', 'inference',
    'training', 'model', 'pytorch', 'tensorflow', 'onnx', 'triton',
    'datacenter', 'data center', 'hpc', 'supercomputer', 'cluster',
    'cloud', 'aws', 'azure', 'gcp', 'hyperscaler',

    # Competitors
    'amd', 'intel', 'radeon', 'arc', 'alchemist', 'battlemage',
    'mi300', 'instinct', 'epyc', 'ryzen', 'threadripper',

    # Tech slang
    'mobo', 'motherboard', 'psu', 'power supply', 'cpu', 'processor',
    'ram', 'ssd', 'nvme', 'pcie', 'rgb', 'cooler', 'thermal paste',
    'bios', 'uefi', 'driver', 'firmware', 'update', 'patch',

    # Stock / trading
    'stock', 'stocks', 'share', 'shares', 'ticker', 'nasdaq', 'sp500',
    'earnings', 'revenue', 'profit', 'margin', 'guidance', 'beat', 'miss',
    'bull', 'bear', 'bullish', 'bearish', 'calls', 'puts', 'options',
    'long', 'short', 'squeeze', 'gamma', 'theta', 'strike', 'expiry',
    'ath', 'atl', 'pe ratio', 'market cap', 'mcap', 'valuation',

    # Finance slang
    'hodl', 'hodling', 'stonks', 'tendies', 'yolo', 'fomo', 'fud',
    'moon', 'lambo', 'rocket', 'ape', 'diamond hands', 'paper hands',
    'buy the dip', 'btfd', 'dca', 'rsi', 'macd', 'support', 'resistance',

    # Tech companies
    'tsmc', 'samsung', 'micron', 'sk hynix', 'broadcom', 'qualcomm',
    'arm', 'apple', 'microsoft', 'google', 'meta', 'amazon', 'openai',

    # Common acronyms
    'imo', 'imho', 'tbh', 'ngl', 'af', 'rn', 'fr', 'btw', 'fyi',
    'dyor', 'nfa', 'not financial advice', 'afaik', 'iirc',

    # Gaming / PC
    'pc', 'rig', 'build', 'prebuilt', 'custom', 'watercooling', 'aio',
    'case', 'fans', 'airflow', 'temps', 'stress test',
    'msrp', 'scalper', 'scalping', 'shortage', 'availability',

    # Common numbers
    '100k', '200k', '500k', '1m', '10m', '100m', '1b', '10b', '100b',
    'trillion', 'billion', 'million', 'thousand'
}

# Whole-word / whole-phrase matcher for the whitelist, compiled once.
# Plain substring tests made short terms such as 'ai', 'rn' or 'fr' match
# inside ordinary words of any language ("mais", "francais"), which let
# almost every text skip language detection. The lookarounds require that a
# term is not glued to other word characters. Longest terms come first and
# ties are sorted alphabetically so the pattern is deterministic.
_WHITELIST_RE = re.compile(
    r"(?<!\w)(?:"
    + "|".join(
        re.escape(term)
        for term in sorted(NVIDIA_TECH_TERMS, key=lambda t: (-len(t), t))
    )
    + r")(?!\w)"
)

# Tokens removed before language detection: URLs, user mentions, tickers,
# digits / currency symbols and hashtags.
_NOISE_RE = re.compile(r'https?://\S+|u/\w+|@\w+|\$[A-Z]+|[0-9$%‚Ç¨¬£¬•‚Çø]|#\w+')

# Languages accepted as "English enough".
# NOTE(review): the non-English codes are presumably tolerated because short
# English text is often misdetected as one of them - confirm.
_ACCEPTED_LANGS = ('en', 'nl', 'da', 'no', 'sv', 'de', 'cy')


# ========== UTILITY FUNCTIONS ==========
def is_english(text):
    """
    Return True when *text* is probably English (permissive check).

    Decision order:
      1. Text shorter than 30 characters (after stripping) is accepted.
      2. Text containing a whitelisted NVIDIA/tech/finance term as a whole
         word or phrase (case-insensitive) is accepted, and
         ``stats["comments_whitelisted"]`` is incremented. The counter is
         shared by thread and comment checks.
      3. Otherwise URLs, mentions, tickers, digits, currency symbols and
         hashtags are stripped; if fewer than 30 characters remain the text
         is accepted, else ``langdetect.detect`` decides.

    A ``LangDetectException`` is treated as "accept".
    """
    if len(text.strip()) < 30:
        return True

    if _WHITELIST_RE.search(text.lower()):
        stats["comments_whitelisted"] += 1
        return True

    try:
        clean_text = _NOISE_RE.sub('', text)
        if len(clean_text.strip()) < 30:
            return True
        return detect(clean_text) in _ACCEPTED_LANGS
    except LangDetectException:
        return True


def extract_mentions(text):
    """Return the usernames mentioned in *text* as ``u/name`` or ``@name``.

    Names are returned without their prefix, de-duplicated, in order of first
    appearance (so the result is deterministic across runs).

    A prefix only counts when it is not glued to a preceding word character,
    so "menu/item" and e-mail addresses ("me@example.com") are not reported.
    Hyphens are allowed inside names.
    """
    found = re.findall(r'(?<!\w)(?:u/|@)([\w-]+)', text)
    # dict.fromkeys keeps first-seen order while dropping duplicates.
    return list(dict.fromkeys(found))


def extract_hashtags(text):
    """Return every ``#word`` token in *text*, '#' included.

    Tokens are listed in order of appearance and duplicates are kept.
    """
    return [match.group(0) for match in re.finditer(r"#\w+", text)]


def get_parent_author(comment):
    """Return the author name of *comment*'s parent (used for network edges).

    ``comment.parent_id`` is a prefixed id: ``t1_...`` points to another
    comment and ``t3_...`` to the submission. The parent is looked up through
    the module-level ``reddit`` client.

    Returns:
        The parent's author name, ``"[deleted]"`` when the parent has no
        author, or ``None`` when the prefix is unknown or the lookup fails.

    NOTE(review): every call builds a fresh parent object, so reading its
    author presumably costs one API request per comment - confirm.
    """
    try:
        parent_id = comment.parent_id
        if parent_id.startswith("t1_"):
            parent = reddit.comment(parent_id.split("_")[1])
        elif parent_id.startswith("t3_"):
            parent = reddit.submission(parent_id.split("_")[1])
        else:
            return None
        return parent.author.name if parent.author else "[deleted]"
    except Exception:
        # Best effort: a failed lookup just leaves the relation unknown.
        # Only Exception is caught, so Ctrl-C / SystemExit still propagate.
        return None


# ========== THREAD SEARCH ==========
# Run banner: summarises the date window, subreddits, keyword filter,
# language filter and comment target before collection starts.
print("\n".join((
    "üîé Cerco TUTTI i thread e commenti su Nvidia dal 17 ottobre 2025 in poi...",
    f"üìÖ Range THREAD: dal {start_date.date()} in poi",
    f"üìÖ Range COMMENTI: dal {start_date.date()} in poi",
    f"üéØ Subreddit: {subreddits}",
    f"üîç Keywords filter: {keywords_filter}",
    "üåç Filtro lingua: INGLESE (permissivo + whitelist NVIDIA/tech)",
    f"üéØ Obiettivo: {TARGET_COMMENTS} commenti\n",
)))


# ========== THREAD / COMMENT COLLECTION ==========
# Both strategies apply exactly the same filtering and extraction to every
# submission, so the shared logic lives in the helpers below.

def _created_at(item):
    """Return the UTC creation time of a submission/comment, or None if unreadable."""
    try:
        return datetime.fromtimestamp(item.created_utc, tz=timezone.utc)
    except Exception:
        return None


def _build_comment_record(submission, comment, comment_created):
    """Build the output row (one dict) for a single accepted comment."""
    author_name = comment.author.name if comment.author else "[deleted]"
    hashtags = extract_hashtags(comment.body)
    mentions = extract_mentions(comment.body)
    parent_author = get_parent_author(comment)
    num_replies = len(comment.replies) if comment.replies else 0

    return {
        "thread_title": submission.title,
        "thread_author": submission.author.name if submission.author else "[deleted]",
        "thread_score": submission.score,
        "thread_num_comments": submission.num_comments,
        "text_id": comment.id,
        "comment_parent_id": comment.parent_id,
        "text_author": author_name,
        "text": comment.body,
        "likes": comment.score,
        "text_date": comment_created,
        "text_num_replies": num_replies,
        "retweets": None,
        "comment_parent_author": parent_author,
        "text_mentions": ", ".join(mentions),
        "text_hashtags": ", ".join(hashtags),
        "argument": "Nvidia",
        "site": "Reddit",
    }


def _collect_thread_comments(submission):
    """Append the qualifying comments of *submission* to comments_data.

    A comment qualifies when it has a real body, was created on or after
    start_date and passes is_english(). Collection stops early once
    TARGET_COMMENTS is reached. Returns the number of comments appended.
    """
    # Expand at most 15 "load more comments" placeholders per thread.
    submission.comments.replace_more(limit=15)

    collected = 0
    for comment in submission.comments.list():
        if not comment.body or comment.body in ["[deleted]", "[removed]"]:
            continue

        comment_created = _created_at(comment)
        if comment_created is None or comment_created < start_date:
            continue

        stats["comments_checked"] += 1
        if not is_english(comment.body):
            stats["comments_non_english"] += 1
            continue

        comments_data.append(_build_comment_record(submission, comment, comment_created))
        collected += 1

        if len(comments_data) >= TARGET_COMMENTS:
            print("  üéØ Obiettivo raggiunto!")
            break

    return collected


def _harvest_submissions(submissions):
    """Filter *submissions* and collect their comments until the target is met.

    A submission is processed when it was created on or after start_date,
    mentions one of keywords_filter in its title or body, has not been seen
    before, has at least one comment and passes the language check.
    """
    for submission in submissions:
        if len(comments_data) >= TARGET_COMMENTS:
            print(f"\n‚úÖ Obiettivo raggiunto! {len(comments_data)} commenti raccolti")
            break

        created_time = _created_at(submission)
        if created_time is None or created_time < start_date:
            continue

        # Keep only threads that mention Nvidia/NVDA.
        thread_text = submission.title + " " + (submission.selftext or "")
        lowered = thread_text.lower()
        if not any(keyword in lowered for keyword in keywords_filter):
            continue

        # Avoid processing the same thread twice (across both strategies).
        if submission.id in processed_threads:
            continue
        processed_threads.add(submission.id)

        if submission.num_comments == 0:
            continue

        # Thread-level language filter.
        stats["threads_checked"] += 1
        if not is_english(thread_text):
            stats["threads_non_english"] += 1
            print(f"‚è≠Ô∏è  Thread NON inglese saltato: {submission.title[:50]}...")
            continue

        print(f"üß© Thread: {submission.title[:60]}... ({submission.subreddit.display_name}) - {submission.num_comments} commenti")

        try:
            comments_count = _collect_thread_comments(submission)
            print(f"  üí¨ Commenti raccolti: {comments_count}")
        except Exception as e:
            # One failing thread must not abort the whole run; comments
            # appended before the failure are kept.
            print(f"  ‚ö†Ô∏è Errore nel thread: {e}")

        # Pause between threads to stay gentle on the API.
        time.sleep(1.2)


# Strategy 1: keyword search across the selected subreddits.
print("üîÑ Strategia 1: Ricerca per keyword...")
_harvest_submissions(
    reddit.subreddit(subreddits).search(
        "nvidia OR nvda",
        sort="new",
        time_filter="month",
        limit=None
    )
)


# Strategy 2: scan the newest posts if the target has not been reached yet.
if len(comments_data) < TARGET_COMMENTS:
    print(f"\nüîÑ Strategia 2: Scansione post recenti... (raccolti: {len(comments_data)}/{TARGET_COMMENTS})")
    _harvest_submissions(reddit.subreddit(subreddits).new(limit=None))


# ========== SAVE & SUMMARY ==========
print(f"\nüì¶ Totale commenti raccolti: {len(comments_data)}")

if not comments_data:
    print("‚ö†Ô∏è Nessun commento trovato nel periodo indicato.")
else:
    df = pd.DataFrame(comments_data)

    # Drop comments collected more than once (same comment id).
    initial_count = len(df)
    df = df.drop_duplicates(subset=['text_id'])
    if initial_count > len(df):
        print(f"üßπ Rimossi {initial_count - len(df)} commenti duplicati")

    # Chronological order; timezone info is removed before the Excel export.
    df = df.sort_values('text_date')
    df['text_date'] = pd.to_datetime(df['text_date']).dt.tz_localize(None)

    # Export: Excel first, timestamped copy if the file is locked,
    # CSV as a last resort for any other Excel failure.
    try:
        df.to_excel(output_filename, index=False, engine='openpyxl')
        print(f"‚úÖ Salvato in Excel: {output_filename}")
    except PermissionError:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        backup_filename = output_filename.replace('.xlsx', f'_{timestamp}.xlsx')
        df.to_excel(backup_filename, index=False, engine='openpyxl')
        print(f"‚ö†Ô∏è  File aperto! Salvato come: {backup_filename}")
    except Exception as e:
        print(f"‚ö†Ô∏è  Errore Excel: {e}")
        csv_filename = output_filename.replace('.xlsx', '.csv')
        df.to_csv(csv_filename, index=False)
        print(f"‚úÖ Salvato in CSV: {csv_filename}")

    # Language-filter counters.
    print("\nüåç Statistiche Filtro Lingua:")
    for label, key in (
        ("Thread controllati", "threads_checked"),
        ("Thread filtrati", "threads_non_english"),
        ("Commenti controllati", "comments_checked"),
        ("Commenti filtrati", "comments_non_english"),
        ("‚úÖ Commenti salvati via whitelist", "comments_whitelisted"),
    ):
        print(f"   {label}: {stats[key]}")

    # Overall counts (reused by the network section below).
    unique_authors = df['text_author'].nunique()
    reply_links = df['comment_parent_author'].notna().sum()

    print("\nüìä Statistiche Finali:")
    print(f"   Thread processati: {len(processed_threads)}")
    print(f"   Autori unici: {unique_authors}")
    print(f"   Relazioni parent-child: {reply_links}")

    # Per-day and per-hour distributions; both helper columns stay on df.
    parsed_dates = pd.to_datetime(df['text_date'])
    df['comment_date'] = parsed_dates.dt.date
    print("\nüìÖ Distribuzione per giorno:")
    print(df['comment_date'].value_counts().sort_index())

    print("\nüïê Distribuzione per ora:")
    df['hour'] = parsed_dates.dt.hour
    # NOTE(review): after sort_index() this shows the first 10 hours present,
    # not the 10 busiest hours - confirm that is intended.
    print(df['hour'].value_counts().sort_index().head(10))

    print("\nüï∏Ô∏è Metriche Network Analysis:")
    print(f"   Nodi (utenti): {unique_authors}")
    print(f"   Edges potenziali (risposte): {reply_links}")

    print("\nüî• Top 5 Thread pi√π commentati:")
    top_threads = df.groupby('thread_title').size().sort_values(ascending=False).head()
    for title, count in top_threads.items():
        print(f"   {count:3d} commenti - {title[:60]}...")


In [None]:
# Export the full Nvidia dataset as currently held in `df`.
# NOTE(review): unlike the other exports this one writes the DataFrame index
# as an extra column (no index=False) - confirm that is intended.
df.to_excel("reddit_comments_nvidia_oct17_22_2025.xlsx")
df['text_date'] = pd.to_datetime(df['text_date'])

# Keep the half-open window [2025-10-17, 2025-10-21), i.e. 17-20 October 2025.
on_or_after_start = df['text_date'] >= '2025-10-17'
before_end = df['text_date'] < '2025-10-21'
df_17_20 = df[on_or_after_start & before_end]

print(f"Commenti filtrati: {len(df_17_20)}")
df_17_20.to_excel("reddit_comments_nvidia_oct17_20_2025.xlsx")

## Threads df

In [None]:
# Load the per-topic Reddit exports: df1 = Bitcoin, df2 = Nvidia.
df1, df2 = (
    pd.read_excel(path)
    for path in (
        'reddit_comments_bitcoin_oct17_20_2025.xlsx',
        'reddit_comments_nvidia_oct17_20_2025.xlsx',
    )
)


In [None]:
#sample_reddit_Bitcoin = df1.sample(n=2100, random_state=42)
#sample_reddit_Nvidia = df2.sample(n=2100, random_state=42)

In [None]:
# Stack the Bitcoin and Nvidia samples into one table and export it.
# NOTE(review): sample_reddit_Bitcoin / sample_reddit_Nvidia are only assigned
# by the sampling lines that are commented out in the previous cell; on a
# fresh kernel this cell raises NameError unless they are defined elsewhere.
threads_df = pd.concat(
    (sample_reddit_Bitcoin, sample_reddit_Nvidia),
    ignore_index=True,
)
threads_df.to_excel(excel_writer='threads_df.xlsx', index=False)

## Network

In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import numpy as np
import warnings
# Suppress every Python warning raised after this point.
# NOTE(review): this is a blanket filter, so deprecation and numerical
# warnings from the libraries used below are hidden as well.
warnings.filterwarnings('ignore')

# Shared colour palette (hex codes) for the network plots and charts.
COLORS = dict(
    primary='#FF6B6B',     # red / coral
    secondary='#4ECDC4',   # turquoise
    accent='#95E1D3',      # aqua green
    dark='#34495e',        # dark gray
    light='#ECF0F1',       # light gray
    combined='#9B59B6',    # purple, for the combined analysis
)

# Global plot styling: seaborn white grid and a sans-serif font family.
sns.set_style(style="whitegrid")
plt.rcParams.update({'font.family': 'sans-serif'})

# ==========================================
# 1. NETWORK CONSTRUCTION (REDDIT VERSION)
# ==========================================

def build_directed_weighted_network(df, argument_name):
    """
    Build the main DIRECTED, WEIGHTED reply network.

    Each edge goes from ``text_author`` to ``comment_parent_author`` (who
    replies to whom); its ``weight`` is the number of such replies. Rows with
    a missing parent author and self-replies are skipped.

    Parameters:
    -----------
    df : DataFrame with 'text_author' and 'comment_parent_author' columns
    argument_name : label used in the printed summary

    Returns the resulting ``nx.DiGraph``.
    """
    # Count replies per (author, parent) pair; Counter keeps first-seen order.
    reply_counts = Counter(
        (author, parent)
        for author, parent in zip(df['text_author'], df['comment_parent_author'])
        if pd.notna(parent) and author != parent
    )

    G = nx.DiGraph()
    for (author, parent), n_replies in reply_counts.items():
        G.add_edge(author, parent, weight=n_replies)

    print(f"\nüîµ DIRECTED WEIGHTED NETWORK - {argument_name}")
    print(f"   Nodes: {G.number_of_nodes()}, Edges: {G.number_of_edges()}")

    return G

def build_reciprocal_network(G_directed, argument_name):
    """
    Build the UNDIRECTED, WEIGHTED network of mutual replies.

    Two users are linked only if each replied to the other at least once; the
    edge weight is the sum of the two directed weights.
    """
    G_reciprocal = nx.Graph()

    for source, target, attrs in G_directed.edges(data=True):
        if not G_directed.has_edge(target, source):
            continue  # one-way interaction only
        combined = attrs['weight'] + G_directed[target][source]['weight']
        # Each mutual pair is visited from both directions; add it once.
        if G_reciprocal.has_edge(source, target):
            continue
        G_reciprocal.add_edge(source, target, weight=combined)

    print(f"\nüü¢ RECIPROCAL UNDIRECTED NETWORK - {argument_name}")
    print(f"   Nodes: {G_reciprocal.number_of_nodes()}, Edges: {G_reciprocal.number_of_edges()}")

    return G_reciprocal

def build_backbone_network(G_directed, argument_name, min_weight=3):
    """
    Build the DIRECTED, UNWEIGHTED backbone network.

    Only edges of ``G_directed`` whose weight is at least ``min_weight`` are
    kept; the weight itself is not copied to the backbone.
    """
    strong_edges = [
        (source, target)
        for source, target, attrs in G_directed.edges(data=True)
        if attrs['weight'] >= min_weight
    ]

    G_backbone = nx.DiGraph()
    for source, target in strong_edges:
        G_backbone.add_edge(source, target)

    print(f"\nüî¥ BACKBONE NETWORK - {argument_name} (min weight={min_weight})")
    print(f"   Nodes: {G_backbone.number_of_nodes()}, Edges: {G_backbone.number_of_edges()}")

    return G_backbone


# ==========================================
# 2. NETWORK METRICS & STATISTICS
# ==========================================

def analyze_directed_network(G, argument_name):
    """
    Print centrality rankings for the directed reply network.

    Reports the top 10 users by weighted in-degree (replies received), by
    out-degree (distinct users replied to) and by weighted PageRank, plus the
    top 5 HITS authorities and hubs.

    Parameters:
    -----------
    G : directed graph whose edges carry a 'weight' attribute
    argument_name : label used in the printed headers

    Returns a dict with keys 'in_degree_weighted', 'pagerank', 'out_degree'.
    """
    def top(scores, limit):
        # Highest score first; ties keep the graph's node order.
        return sorted(scores.items(), key=lambda x: x[1], reverse=True)[:limit]

    print("\n" + "="*60)
    print(f"üìà DIRECTED NETWORK ANALYSIS - {argument_name}")
    print("="*60)

    in_degree_weighted = dict(G.in_degree(weight='weight'))
    print("\nüéØ TOP 10 MOST REPLIED TO (receive most replies):")
    for i, (user, replies) in enumerate(top(in_degree_weighted, 10), 1):
        print(f"   {i}. u/{user}: {replies} replies received")

    out_degree = dict(G.out_degree())
    print("\nüí¨ TOP 10 MOST ACTIVE (reply to most different users):")
    for i, (user, replies) in enumerate(top(out_degree, 10), 1):
        print(f"   {i}. u/{user}: replies to {replies} users")

    pagerank = nx.pagerank(G, weight='weight')
    print("\n‚≠ê TOP 10 PAGERANK (overall influence):")
    for i, (user, score) in enumerate(top(pagerank, 10), 1):
        print(f"   {i}. u/{user}: {score:.5f}")

    try:
        # nx.hits returns (hubs, authorities) - in that order. The previous
        # code read them the other way round and printed swapped rankings.
        hub_scores, authority_scores = nx.hits(G, max_iter=100)
        authorities = top(authority_scores, 5)
        hubs = top(hub_scores, 5)

        print("\nüèÜ TOP 5 AUTHORITIES (most cited):")
        for i, (user, score) in enumerate(authorities, 1):
            print(f"   {i}. u/{user}: {score:.5f}")

        print("\nüîó TOP 5 HUBS (cite most):")
        for i, (user, score) in enumerate(hubs, 1):
            print(f"   {i}. u/{user}: {score:.5f}")
    except Exception:
        # HITS is optional: any failure skips this section. Only Exception is
        # caught so that Ctrl-C still interrupts the analysis.
        print("\n‚ö†Ô∏è HITS algorithm did not converge")

    return {
        'in_degree_weighted': in_degree_weighted,
        'pagerank': pagerank,
        'out_degree': out_degree
    }

def analyze_reciprocal_network(G, argument_name):
    """
    Print bridge, clustering and community statistics for the mutual-reply network.

    Returns an empty dict when the graph has no edges. Otherwise returns a
    dict with 'betweenness' and 'clustering', plus 'communities' (node ->
    community id) when the ``community`` (python-louvain) package imports.
    """
    print("\n" + "="*60)
    print(f"üìà RECIPROCAL NETWORK ANALYSIS - {argument_name}")
    print("="*60)

    if not G.number_of_edges():
        print("‚ö†Ô∏è No reciprocal interactions found!")
        return {}

    # NOTE(review): networkx treats edge weights as distances for weighted
    # betweenness, so heavier (more active) ties count as longer paths here.
    betweenness = nx.betweenness_centrality(G, weight='weight')
    ranked_bridges = sorted(betweenness.items(), key=lambda x: x[1], reverse=True)

    print("\nüåâ TOP 10 BRIDGES (key connectors between groups):")
    for rank, (user, score) in enumerate(ranked_bridges[:10], 1):
        print(f"   {rank}. u/{user}: {score:.5f}")

    clustering = nx.clustering(G, weight='weight')
    avg_clustering = sum(clustering.values()) / len(clustering)

    print(f"\nüîó CLUSTERING COEFFICIENT: {avg_clustering:.4f}")

    results = {
        'betweenness': betweenness,
        'clustering': clustering
    }

    try:
        import community as community_louvain
        communities = community_louvain.best_partition(G, weight='weight')
        n_communities = len(set(communities.values()))

        print(f"\nüë• COMMUNITY DETECTION: {n_communities} communities")

        # Five largest communities, each with up to five example members.
        for comm_id, size in Counter(communities.values()).most_common(5):
            members = [user for user, comm in communities.items() if comm == comm_id][:5]
            print(f"   Community {comm_id}: {size} members (e.g.: {', '.join(members)})")

        results['communities'] = communities
    except ImportError:
        print("\n‚ö†Ô∏è python-louvain not installed (pip install python-louvain)")

    return results

def analyze_backbone_network(G, argument_name):
    """
    Print component, k-core and community statistics for the backbone network.

    Returns an empty dict when the graph has no edges. Otherwise returns a
    dict with 'scc' (list of strongly connected components) and
    'core_numbers', plus 'communities' when the ``community``
    (python-louvain) package imports.
    """
    print("\n" + "="*60)
    print(f"üìà BACKBONE NETWORK ANALYSIS - {argument_name}")
    print("="*60)

    if not G.number_of_edges():
        print("‚ö†Ô∏è Empty backbone - reduce min_weight")
        return {}

    scc = list(nx.strongly_connected_components(G))
    print(f"\nüîÑ STRONGLY CONNECTED COMPONENTS: {len(scc)}")
    print(f"   Largest component: {len(max(scc, key=len))} nodes")

    # Core numbers are computed on an undirected copy with self-loops removed.
    G_undirected = G.to_undirected()
    G_undirected.remove_edges_from(nx.selfloop_edges(G_undirected))

    core_numbers = nx.core_number(G_undirected)
    max_core = max(core_numbers.values())
    k_core = [node for node, core in core_numbers.items() if core == max_core]

    print("\nüíé K-CORE DECOMPOSITION:")
    print(f"   Max core number: {max_core}")
    print(f"   {max_core}-core: {len(k_core)} nodes")
    if len(k_core) <= 10:
        print(f"   Members: {', '.join(k_core)}")

    results = {
        'scc': scc,
        'core_numbers': core_numbers
    }

    try:
        import community as community_louvain

        # Community detection runs on a plain undirected graph without self-loops.
        G_und_simple = nx.Graph()
        for source, target in G.edges():
            if source != target:
                G_und_simple.add_edge(source, target)

        communities = community_louvain.best_partition(G_und_simple)
        n_communities = len(set(communities.values()))

        print(f"\nüö∂ COMMUNITY DETECTION: {n_communities} communities")

        results['communities'] = communities
    except ImportError:
        # Optional dependency missing: return the metrics computed so far.
        pass

    return results

# ==========================================
# 3. VISUALIZATIONS
# ==========================================

def visualize_directed_network(G, metrics, argument_name, top_n=200):
    """
    Draw the directed reply network restricted to the ``top_n`` PageRank users.

    Node size scales with PageRank; edge width and colour scale with weight.
    The figure is saved as '<argument_name lowercased>_directed_network.png'
    and shown.
    """
    pagerank = metrics['pagerank']
    # Sorting the keys by score gives the same order as sorting the items.
    ranked_users = sorted(pagerank, key=pagerank.get, reverse=True)
    G_sub = G.subgraph(ranked_users[:top_n]).copy()

    plt.figure(figsize=(16, 12))

    pos = nx.spring_layout(G_sub, k=2, iterations=50, seed=42)

    node_sizes = [pagerank.get(node, 0) * 50000 for node in G_sub.nodes()]
    weights = [G_sub[u][v]['weight'] for u, v in G_sub.edges()]

    # Map weights onto the upper 70% of the colormap so light edges stay visible.
    max_weight = max(weights) if weights else 1
    edge_colors = [plt.cm.Blues(0.3 + 0.7 * (w / max_weight)) for w in weights]
    edge_widths = [w*0.6 for w in weights]

    nx.draw_networkx_edges(G_sub, pos, edge_color=edge_colors,
                          width=edge_widths, alpha=0.7,
                          arrows=True, arrowsize=10,
                          arrowstyle='-|>', connectionstyle='arc3,rad=0.1')

    nx.draw_networkx_nodes(G_sub, pos, node_size=node_sizes,
                          node_color=COLORS['secondary'], alpha=0.8,
                          edgecolors=COLORS['dark'], linewidths=2)

    nx.draw_networkx_labels(G_sub, pos, font_size=8, font_weight='bold')

    plt.title(f'{argument_name} - DIRECTED NETWORK (Reddit Replies)\nTop {top_n} users by PageRank (Node size = influence)',
              fontsize=16, fontweight='bold')
    plt.axis('off')
    plt.tight_layout()

    output_path = f'{argument_name.lower()}_directed_network.png'
    plt.savefig(output_path, dpi=300, bbox_inches='tight', facecolor='white')
    plt.show()
    print(f"\nüíæ Saved: {output_path}")

def visualize_reciprocal_network(G, metrics, argument_name, top_n=300):
    """
    Draw the whole mutual-reply network.

    Nodes are coloured by community when ``metrics`` has a 'communities'
    mapping, otherwise with the accent colour. ``top_n`` is accepted but not
    used: the full graph is always drawn. The figure is saved as
    '<argument_name lowercased>_reciprocal_network.png' and shown.
    """
    if not G.number_of_edges():
        print("‚ö†Ô∏è No reciprocal interactions to visualize")
        return

    # Work on a copy of the entire network (no filtering).
    G_sub = G.copy()

    plt.figure(figsize=(16, 12))

    pos = nx.spring_layout(G_sub, k=2, iterations=50, seed=42)

    if 'communities' in metrics:
        membership = metrics['communities']
        node_colors = [membership.get(node, 0) for node in G_sub.nodes()]
        cmap = plt.cm.Set3
    else:
        node_colors = COLORS['accent']
        cmap = None

    weights = [G_sub[u][v]['weight'] for u, v in G_sub.edges()]

    # Same edge styling as the directed plot, with a green colormap.
    max_weight = max(weights) if weights else 1
    edge_colors = [plt.cm.Greens(0.3 + 0.7 * (w / max_weight)) for w in weights]
    edge_widths = [w*0.6 for w in weights]

    nx.draw_networkx_edges(G_sub, pos, edge_color=edge_colors,
                          width=edge_widths, alpha=0.7)

    nx.draw_networkx_nodes(G_sub, pos, node_size=300,
                          node_color=node_colors, cmap=cmap,
                          alpha=0.9, edgecolors=COLORS['dark'], linewidths=1.5)

    nx.draw_networkx_labels(G_sub, pos, font_size=7, font_weight='bold')

    plt.title(f'{argument_name} - RECIPROCAL NETWORK\nBidirectional conversations (Colors = Communities)',
              fontsize=16, fontweight='bold')
    plt.axis('off')
    plt.tight_layout()

    output_path = f'{argument_name.lower()}_reciprocal_network.png'
    plt.savefig(output_path, dpi=300, bbox_inches='tight', facecolor='white')
    plt.show()
    print(f"\nüíæ Saved: {output_path}")

def visualize_statistics(G_dir, metrics_dir, argument_name):
    """
    Draw a 2x2 panel of summary charts for the directed network.

    Panels: in/out-degree histogram, edge-weight histogram, top 15 users by
    PageRank (bar chart) and a reply heatmap for the top 20 users by
    PageRank. The figure is saved as
    '<argument_name lowercased>_statistics.png' and shown.

    Parameters:
    -----------
    G_dir : directed graph whose edges carry a 'weight' attribute
    metrics_dir : dict containing a 'pagerank' mapping (node -> score)
    argument_name : label used in the title and in the output file name
    """
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    fig.patch.set_facecolor('white')

    # 1. Degree distribution
    in_degrees = [d for n, d in G_dir.in_degree()]
    out_degrees = [d for n, d in G_dir.out_degree()]

    axes[0, 0].hist(in_degrees, bins=30, alpha=0.7, label='In-degree (replies received)',
                   color=COLORS['primary'], edgecolor=COLORS['dark'])
    axes[0, 0].hist(out_degrees, bins=30, alpha=0.7, label='Out-degree (replies given)',
                   color=COLORS['secondary'], edgecolor=COLORS['dark'])
    axes[0, 0].set_xlabel('Degree', fontweight='bold')
    axes[0, 0].set_ylabel('Frequency', fontweight='bold')
    axes[0, 0].set_title('Degree Distribution', fontweight='bold', fontsize=12)
    axes[0, 0].legend()
    axes[0, 0].set_yscale('log')
    axes[0, 0].grid(alpha=0.3)

    # 2. Weight distribution
    weights = [d['weight'] for u, v, d in G_dir.edges(data=True)]
    axes[0, 1].hist(weights, bins=30, color=COLORS['accent'], alpha=0.8,
                   edgecolor=COLORS['dark'])
    axes[0, 1].set_xlabel('Weight (# replies)', fontweight='bold')
    axes[0, 1].set_ylabel('Frequency', fontweight='bold')
    axes[0, 1].set_title('Edge Weight Distribution', fontweight='bold', fontsize=12)
    axes[0, 1].set_yscale('log')
    axes[0, 1].grid(alpha=0.3)

    # Rank all users once; each panel below takes its own slice.
    ranked = sorted(metrics_dir['pagerank'].items(), key=lambda x: x[1], reverse=True)

    # 3. Top users bar chart
    top_pr = ranked[:15]
    users = [u for u, _ in top_pr]
    scores = [s for _, s in top_pr]

    # Colour tiers: ranks 1-5, 6-10 and 11-15.
    colors_gradient = [COLORS['primary'] if i < 5 else COLORS['secondary'] if i < 10
                      else COLORS['accent'] for i in range(len(users))]

    axes[1, 0].barh(users, scores, color=colors_gradient, alpha=0.8, edgecolor=COLORS['dark'])
    axes[1, 0].set_xlabel('PageRank Score', fontweight='bold')
    axes[1, 0].set_title('Top 15 Users by PageRank', fontweight='bold', fontsize=12)
    axes[1, 0].invert_yaxis()
    axes[1, 0].grid(axis='x', alpha=0.3)

    # 4. Adjacency heatmap
    # Slice from the full ranking: slicing the 15-item bar-chart list capped
    # this "Top 20" heatmap at 15 users.
    top_20 = [u for u, _ in ranked[:20]]
    G_sub = G_dir.subgraph(top_20)
    adj_matrix = nx.to_numpy_array(G_sub, nodelist=top_20, weight='weight')

    sns.heatmap(adj_matrix, xticklabels=top_20, yticklabels=top_20,
                cmap='RdYlGn', ax=axes[1, 1], cbar_kws={'label': 'Replies'},
                linewidths=0.5, linecolor=COLORS['light'])
    axes[1, 1].set_title('Reply Heatmap (Top 20 users)', fontweight='bold', fontsize=12)
    axes[1, 1].set_xlabel('Replies to', fontweight='bold')
    axes[1, 1].set_ylabel('Author', fontweight='bold')

    plt.suptitle(f'{argument_name} - Network Statistics', fontsize=16, fontweight='bold', y=0.995)
    plt.tight_layout()
    plt.savefig(f'{argument_name.lower()}_statistics.png', dpi=300, bbox_inches='tight', facecolor='white')
    plt.show()
    print(f"\nüíæ Saved: {argument_name.lower()}_statistics.png")

# ==========================================
# 4. MAIN EXECUTION FOR SINGLE ARGUMENT
# ==========================================

def analyze_single_argument(df, argument_name, min_weight_backbone=3, top_n_viz=200):
    """
    Run the build -> analyze -> visualize pipeline for one argument.

    Parameters:
    -----------
    df : DataFrame of comments, already filtered to the argument of interest
    argument_name : label forwarded to every helper and used in the printed
                    banners (callers in this file pass 'Bitcoin', 'NVIDIA'
                    and 'Combined')
    min_weight_backbone : forwarded as ``min_weight`` to build_backbone_network
    top_n_viz : forwarded as ``top_n`` to the two network visualizations

    Returns:
    --------
    dict with two sub-dicts, 'networks' and 'metrics', each keyed by
    'directed', 'reciprocal' and 'backbone'.
    """
    separator = "=" * 70

    print("\n" + separator)
    print(f"üöÄ NETWORK ANALYSIS: {argument_name.upper()}")
    print(separator)
    print(f"Dataset: {len(df)} comments")

    # --- Construction: reciprocal and backbone are both derived from the
    # directed network.
    directed_net = build_directed_weighted_network(df, argument_name)
    reciprocal_net = build_reciprocal_network(directed_net, argument_name)
    backbone_net = build_backbone_network(
        directed_net, argument_name, min_weight=min_weight_backbone
    )

    # --- Metrics, one call per network flavour.
    directed_metrics = analyze_directed_network(directed_net, argument_name)
    reciprocal_metrics = analyze_reciprocal_network(reciprocal_net, argument_name)
    backbone_metrics = analyze_backbone_network(backbone_net, argument_name)

    # --- Figures (the backbone network is analyzed but not drawn here).
    print("\n" + separator)
    print(f"üé® GENERATING VISUALIZATIONS - {argument_name}")
    print(separator)

    visualize_directed_network(directed_net, directed_metrics, argument_name, top_n=top_n_viz)
    visualize_reciprocal_network(reciprocal_net, reciprocal_metrics, argument_name, top_n=top_n_viz)
    visualize_statistics(directed_net, directed_metrics, argument_name)

    print(f"\n‚úÖ ANALYSIS COMPLETED: {argument_name}")

    networks = {
        'directed': directed_net,
        'reciprocal': reciprocal_net,
        'backbone': backbone_net,
    }
    metrics = {
        'directed': directed_metrics,
        'reciprocal': reciprocal_metrics,
        'backbone': backbone_metrics,
    }
    return {'networks': networks, 'metrics': metrics}

# ==========================================
# 5. COMPARATIVE ANALYSIS
# ==========================================

def compare_networks(results_bitcoin, results_nvidia, results_combined):
    """Compare the Bitcoin and NVIDIA directed networks and plot a 2x2 dashboard.

    Prints a size table (nodes, edges, average degree, density) for the two
    directed networks, then draws four panels: node/edge counts, degree
    histograms, and the top-10 PageRank users of each network. The figure is
    saved as 'comparative_analysis.png' and shown.

    Parameters:
    -----------
    results_bitcoin : dict shaped like the return value of
                      analyze_single_argument; reads
                      ['networks']['directed'] and
                      ['metrics']['directed']['pagerank']
    results_nvidia : same structure, for the NVIDIA network
    results_combined : accepted for interface compatibility; not used by
                       this function

    Returns:
    --------
    None
    """
    print("\n" + "="*70)
    print("üìä COMPARATIVE NETWORK ANALYSIS")
    print("="*70)

    G_btc = results_bitcoin['networks']['directed']
    G_nvda = results_nvidia['networks']['directed']

    # Create comparison dataframe
    comparison_data = {
        'Metric': ['Nodes', 'Edges', 'Avg Degree', 'Density'],
        'Bitcoin': [],
        'NVIDIA': []
    }

    # Network size metrics (one column per network, rows follow 'Metric')
    for name, G in [('Bitcoin', G_btc), ('NVIDIA', G_nvda)]:
        n_nodes = G.number_of_nodes()
        n_edges = G.number_of_edges()
        comparison_data[name].append(n_nodes)
        comparison_data[name].append(n_edges)
        # Guard against an empty graph to avoid dividing by zero.
        comparison_data[name].append(f"{2*n_edges/n_nodes:.2f}" if n_nodes > 0 else "0")
        comparison_data[name].append(f"{nx.density(G):.4f}")

    df_comparison = pd.DataFrame(comparison_data)

    print("\nüìà NETWORK SIZE COMPARISON:")
    print(df_comparison.to_string(index=False))

    # Visualization
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    fig.patch.set_facecolor('white')

    # 1. Network size comparison (grouped bars)
    metrics = ['Nodes', 'Edges']
    btc_values = [G_btc.number_of_nodes(), G_btc.number_of_edges()]
    nvda_values = [G_nvda.number_of_nodes(), G_nvda.number_of_edges()]

    x = np.arange(len(metrics))
    width = 0.35

    axes[0, 0].bar(x - width/2, btc_values, width, label='Bitcoin', color=COLORS['primary'], alpha=0.8)
    axes[0, 0].bar(x + width/2, nvda_values, width, label='NVIDIA', color=COLORS['secondary'], alpha=0.8)
    axes[0, 0].set_xlabel('Metric', fontweight='bold')
    axes[0, 0].set_ylabel('Count', fontweight='bold')
    axes[0, 0].set_title('Network Size Comparison', fontweight='bold', fontsize=12)
    axes[0, 0].set_xticks(x)
    axes[0, 0].set_xticklabels(metrics)
    axes[0, 0].legend()
    axes[0, 0].grid(axis='y', alpha=0.3)

    # 2. Degree distribution comparison (log-scaled frequency axis)
    btc_degrees = [d for _, d in G_btc.degree()]
    nvda_degrees = [d for _, d in G_nvda.degree()]

    axes[0, 1].hist(btc_degrees, bins=30, alpha=0.6, label='Bitcoin', color=COLORS['primary'], edgecolor='black')
    axes[0, 1].hist(nvda_degrees, bins=30, alpha=0.6, label='NVIDIA', color=COLORS['secondary'], edgecolor='black')
    axes[0, 1].set_xlabel('Degree', fontweight='bold')
    axes[0, 1].set_ylabel('Frequency', fontweight='bold')
    axes[0, 1].set_title('Degree Distribution Comparison', fontweight='bold', fontsize=12)
    axes[0, 1].legend()
    axes[0, 1].set_yscale('log')
    axes[0, 1].grid(alpha=0.3)

    # 3. Top users PageRank comparison
    btc_pr = results_bitcoin['metrics']['directed']['pagerank']
    nvda_pr = results_nvidia['metrics']['directed']['pagerank']

    top_btc = sorted(btc_pr.items(), key=lambda x: x[1], reverse=True)[:10]
    top_nvda = sorted(nvda_pr.items(), key=lambda x: x[1], reverse=True)[:10]

    # str() first: node labels are not guaranteed to be strings (a purely
    # numeric username would not support slicing).
    users_btc = [str(u)[:15] for u, _ in top_btc]
    scores_btc = [s for _, s in top_btc]
    users_nvda = [str(u)[:15] for u, _ in top_nvda]
    scores_nvda = [s for _, s in top_nvda]

    y_btc = np.arange(len(users_btc))
    y_nvda = np.arange(len(users_nvda))

    axes[1, 0].barh(y_btc, scores_btc, color=COLORS['primary'], alpha=0.8, edgecolor='black')
    axes[1, 0].set_yticks(y_btc)
    axes[1, 0].set_yticklabels(users_btc, fontsize=8)
    axes[1, 0].set_xlabel('PageRank Score', fontweight='bold')
    axes[1, 0].set_title('Top 10 Users - Bitcoin', fontweight='bold', fontsize=12)
    axes[1, 0].invert_yaxis()  # highest score on top
    axes[1, 0].grid(axis='x', alpha=0.3)

    axes[1, 1].barh(y_nvda, scores_nvda, color=COLORS['secondary'], alpha=0.8, edgecolor='black')
    axes[1, 1].set_yticks(y_nvda)
    axes[1, 1].set_yticklabels(users_nvda, fontsize=8)
    axes[1, 1].set_xlabel('PageRank Score', fontweight='bold')
    axes[1, 1].set_title('Top 10 Users - NVIDIA', fontweight='bold', fontsize=12)
    axes[1, 1].invert_yaxis()  # highest score on top
    axes[1, 1].grid(axis='x', alpha=0.3)

    plt.suptitle('Comparative Network Analysis: Bitcoin vs NVIDIA', fontsize=16, fontweight='bold', y=0.995)
    plt.tight_layout()
    plt.savefig('comparative_analysis.png', dpi=300, bbox_inches='tight', facecolor='white')
    plt.show()
    print("\nüíæ Saved: comparative_analysis.png")

# ==========================================
# 6. COMPLETE EXECUTION
# ==========================================

# Script entry point: loads the comment table, keeps only reply interactions,
# runs analyze_single_argument for Bitcoin, NVIDIA and the combined data, then
# compares the Bitcoin and NVIDIA results and prints a summary.
if __name__ == "__main__":
    
    print("\n" + "üåü"*35)
    print("  REDDIT NETWORK ANALYSIS BY ARGUMENT")
    print("üåü"*35)
    
    # Load DataFrame (modify filename if needed)
    df = pd.read_excel("threads_df.xlsx")  # Excel file
    # df = pd.read_csv("reddit_data.csv")  # If CSV
    
    print(f"\nüìä Total dataset: {len(df)} comments")
    
    # Check argument column: without it the per-topic split below cannot run
    if 'argument' not in df.columns:
        print("\n‚ö†Ô∏è  Column 'argument' not found in DataFrame!")
        print("    Make sure the file contains the 'argument' column with values 'nvidia' and 'bitcoin'")
        exit()
    
    # Remove root comments (without parent) and self-replies
    df_filtered = df[pd.notna(df['comment_parent_author'])].copy()
    df_filtered = df_filtered[df_filtered['text_author'] != df_filtered['comment_parent_author']]
    
    print(f"üìä Comments with valid interactions: {len(df_filtered)}")
    
    # Show distribution of the remaining comments per argument value
    arg_counts = df_filtered['argument'].value_counts()
    print(f"\nüìà Argument distribution:")
    for arg, count in arg_counts.items():
        print(f"   {arg}: {count} comments")
    
    # ==========================================
    # BITCOIN ANALYSIS
    # ==========================================
    
    print("\n\n" + "üü†"*35)
    print("  BITCOIN ANALYSIS")
    print("üü†"*35)
    
    # Case-insensitive match on the 'argument' column
    df_bitcoin = df_filtered[df_filtered['argument'].str.lower() == 'bitcoin'].copy()
    
    # results_bitcoin stays None when there is nothing to analyze; the
    # comparative step below checks for that.
    if len(df_bitcoin) == 0:
        print("‚ö†Ô∏è  No comments found for Bitcoin!")
        results_bitcoin = None
    else:
        results_bitcoin = analyze_single_argument(
            df=df_bitcoin,
            argument_name='Bitcoin',
            min_weight_backbone=2,
            top_n_viz=200
        )
    
    # ==========================================
    # NVIDIA ANALYSIS
    # ==========================================
    
    print("\n\n" + "üü¢"*35)
    print("  NVIDIA ANALYSIS")
    print("üü¢"*35)
    
    # Case-insensitive match on the 'argument' column
    df_nvidia = df_filtered[df_filtered['argument'].str.lower() == 'nvidia'].copy()
    
    if len(df_nvidia) == 0:
        print("‚ö†Ô∏è  No comments found for NVIDIA!")
        results_nvidia = None
    else:
        results_nvidia = analyze_single_argument(
            df=df_nvidia,
            argument_name='NVIDIA',
            min_weight_backbone=2,
            top_n_viz=200
        )
    
    # ==========================================
    # COMBINED ANALYSIS (BITCOIN + NVIDIA)
    # ==========================================
    
    print("\n\n" + "üü£"*35)
    print("  COMBINED ANALYSIS (BITCOIN + NVIDIA)")
    print("üü£"*35)
    
    # Use all filtered data for combined analysis
    # NOTE(review): df_filtered is not restricted to 'bitcoin'/'nvidia' rows, so
    # any other 'argument' value present in the file is included here too.
    if len(df_filtered) == 0:
        print("‚ö†Ô∏è  No comments found for combined analysis!")
        results_combined = None
    else:
        results_combined = analyze_single_argument(
            df=df_filtered,
            argument_name='Combined',
            min_weight_backbone=2,
            top_n_viz=200
        )
    
    # ==========================================
    # COMPARATIVE ANALYSIS
    # ==========================================
    
    # Only compare when all three analyses produced results
    if results_bitcoin and results_nvidia and results_combined:
        print("\n\n" + "üìä"*35)
        print("  COMPARATIVE ANALYSIS")
        print("üìä"*35)
        compare_networks(results_bitcoin, results_nvidia, results_combined)
    
    # ==========================================
    # FINAL SUMMARY
    # ==========================================
    
    # NOTE(review): the summary below is printed unconditionally, so it lists
    # files and reports success even when an analysis above was skipped.
    print("\n\n" + "="*70)
    print("üéâ COMPLETE ANALYSIS FINISHED!")
    print("="*70)
    
    print(f"\nüìÅ GENERATED FILES:")
    print(f"\n   üü† Bitcoin:")
    print(f"      - bitcoin_directed_network.png")
    print(f"      - bitcoin_reciprocal_network.png")
    print(f"      - bitcoin_statistics.png")
    
    print(f"\n   üü¢ NVIDIA:")
    print(f"      - nvidia_directed_network.png")
    print(f"      - nvidia_reciprocal_network.png")
    print(f"      - nvidia_statistics.png")
    
    print(f"\n   üü£ Combined:")
    print(f"      - combined_directed_network.png")
    print(f"      - combined_reciprocal_network.png")
    print(f"      - combined_statistics.png")
    
    print(f"\n   üìä Comparative:")
    print(f"      - comparative_analysis.png")
    
    print("\n" + "="*70)
    print("‚ú® All analyses completed successfully! ‚ú®")
    print("="*70)

In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from collections import Counter
from scipy.spatial import ConvexHull
import warnings
warnings.filterwarnings('ignore')

# Verify and import required libraries
# igraph and leidenalg are hard requirements for this cell: if either import
# fails, print install instructions and abort by re-raising.
try:
    import igraph as ig
    import leidenalg as la
    LEIDEN_AVAILABLE = True
except ImportError as e:
    LEIDEN_AVAILABLE = False
    print("‚ö†Ô∏è  ERROR: Missing libraries!")
    print("   Install with: pip install python-igraph leidenalg")
    print(f"   Error details: {e}")
    print("\n   Code cannot proceed without these libraries.")
    # Chain explicitly so the traceback shows the original failure as the cause.
    raise ImportError("Install python-igraph and leidenalg to continue") from e

# ==========================================
# COMMUNITY DETECTION WITH LEIDEN
# ==========================================

def networkx_to_igraph(G_nx):
    """
    Build an igraph copy of a NetworkX graph.

    Vertices are numbered in the iteration order of ``G_nx.nodes()``; the
    original labels are stored in the ``name`` vertex attribute and edge
    weights (default 1 when missing) in the ``weight`` edge attribute.

    Returns:
    --------
    (g, node_map, nodes) : the igraph graph, a dict mapping each NetworkX
    node to its igraph vertex index, and the list of nodes in index order.
    """
    nodes = list(G_nx.nodes())
    node_map = dict(zip(nodes, range(len(nodes))))

    # Materialise the edge view once, then split it into index pairs and weights.
    edge_records = list(G_nx.edges(data=True))
    edges = [(node_map[src], node_map[dst]) for src, dst, _ in edge_records]
    weights = [attrs.get('weight', 1) for _, _, attrs in edge_records]

    g = ig.Graph(directed=G_nx.is_directed())
    g.add_vertices(len(nodes))
    g.add_edges(edges)
    g.es['weight'] = weights
    g.vs['name'] = nodes

    return g, node_map, nodes

def detect_communities_leiden(G, resolution=1.0):
    """
    Apply Leiden algorithm for community detection

    The graph is made undirected, self-loops are removed, and only the
    largest connected component is partitioned.

    Parameters:
    -----------
    G : NetworkX graph (directed or undirected)
    resolution : float, resolution parameter (default 1.0)
                 - lower values = larger communities
                 - higher values = smaller communities

    Returns:
    --------
    (communities, modularity, partition, G_clean)
        communities : dict mapping node -> community id
        modularity : value of ``partition.modularity``
        partition : the leidenalg partition object
        G_clean : undirected largest connected component that was partitioned

    Raises:
    -------
    ValueError : if the graph has no nodes
    """
    print(f"\nüîç LEIDEN COMMUNITY DETECTION (resolution={resolution})")
    print(f"   Network: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges")

    # Convert to undirected for community detection
    if G.is_directed():
        G_undirected = G.to_undirected()
        print("   ‚Üí Converted to undirected for community detection")
    else:
        G_undirected = G.copy()

    # Remove self-loops
    G_undirected.remove_edges_from(nx.selfloop_edges(G_undirected))

    # An empty graph has no connected components; fail with a clear message
    # instead of the opaque "max() arg is an empty sequence".
    if G_undirected.number_of_nodes() == 0:
        raise ValueError("Cannot detect communities: the network has no nodes")

    # Take only the largest connected component
    largest_cc = max(nx.connected_components(G_undirected), key=len)
    G_clean = G_undirected.subgraph(largest_cc).copy()

    print(f"   ‚Üí Largest connected component: {G_clean.number_of_nodes()} nodes")

    # Convert to iGraph (node_map is returned by the helper but not needed here)
    g_igraph, _node_map, nodes = networkx_to_igraph(G_clean)

    # Apply Leiden (fixed seed for reproducible partitions)
    print("   ‚Üí Running Leiden algorithm...")
    partition = la.find_partition(
        g_igraph,
        la.RBConfigurationVertexPartition,
        weights='weight',
        resolution_parameter=resolution,
        seed=42
    )

    # Extract communities: map every member's original label to its community id
    communities = {
        nodes[node_idx]: comm_id
        for comm_id, members in enumerate(partition)
        for node_idx in members
    }

    modularity = partition.modularity
    n_communities = len(partition)

    print(f"\n‚úÖ Community detection completed!")
    print(f"   Communities found: {n_communities}")
    print(f"   Modularity: {modularity:.4f}")

    return communities, modularity, partition, G_clean

# ==========================================
# IMPROVED VISUALIZATIONS
# ==========================================

def get_community_layout(G, communities, top_n=200):
    """
    Create layout where nodes of same community are grouped together
    Uses a hierarchical approach: first place communities, then nodes within

    Parameters:
    -----------
    G : NetworkX graph
    communities : dict mapping node -> community id
    top_n : keep only the ``top_n`` nodes with the highest weighted degree

    Returns:
    --------
    (G_sub, pos, filtered_communities, comm_nodes)
        G_sub : copy of the subgraph induced by the kept nodes
        pos : dict node -> (x, y); only nodes that have a community are placed
        filtered_communities : ``communities`` restricted to the kept nodes
        comm_nodes : dict community id -> list of its kept nodes

    If no kept node has a community, ``pos`` and ``comm_nodes`` are empty.
    """
    # Filter top nodes by (weighted) degree
    degrees = dict(G.degree(weight='weight'))
    top_nodes = sorted(degrees.items(), key=lambda x: x[1], reverse=True)[:top_n]
    top_nodes_set = {n for n, _ in top_nodes}

    # Filter communities to include only top nodes
    filtered_communities = {n: c for n, c in communities.items() if n in top_nodes_set}

    # Create subgraph
    G_sub = G.subgraph(top_nodes_set).copy()

    # Group nodes by community
    comm_nodes = {}
    for node, comm in filtered_communities.items():
        comm_nodes.setdefault(comm, []).append(node)

    n_communities = len(comm_nodes)
    if n_communities == 0:
        # Nothing to place; also avoids dividing by zero for the angle step.
        return G_sub, {}, filtered_communities, comm_nodes

    # Calculate community positions evenly spaced on a circle
    angle_step = 2 * np.pi / n_communities
    radius = 10  # Distance from center for communities

    comm_positions = {}
    for i, comm_id in enumerate(sorted(comm_nodes.keys())):
        angle = i * angle_step
        comm_positions[comm_id] = (radius * np.cos(angle), radius * np.sin(angle))

    # Calculate node positions within each community
    pos = {}
    for comm_id, nodes in comm_nodes.items():
        comm_center = comm_positions[comm_id]

        # Create subgraph for this community
        G_comm = G_sub.subgraph(nodes).copy()

        # Use spring layout for internal structure, scaled down
        if len(nodes) > 1:
            pos_comm = nx.spring_layout(G_comm, k=0.5, iterations=50, scale=2, seed=42)
        else:
            pos_comm = {nodes[0]: (0, 0)}

        # Offset by community center
        for node, (x, y) in pos_comm.items():
            pos[node] = (comm_center[0] + x, comm_center[1] + y)

    return G_sub, pos, filtered_communities, comm_nodes

def draw_community_hulls(ax, pos, comm_nodes, communities, colors):
    """Draw convex hulls around communities

    Parameters:
    -----------
    ax : object exposing matplotlib-style ``fill`` and ``plot`` methods
    pos : dict node -> (x, y)
    comm_nodes : dict community id -> list of nodes
    communities : unused; kept for interface compatibility
    colors : either a dict keyed by community id, or a sequence of colours
             indexed as ``comm_id % len(colors)``

    Communities with fewer than three positioned nodes, with no colour entry,
    or whose hull cannot be computed (e.g. collinear points) are skipped.
    """
    if len(colors) == 0:
        return

    # A dict must be looked up by community id directly: indexing it with
    # ``comm_id % len(colors)`` picks the wrong entry (or none at all) as soon
    # as the ids are not exactly 0..len-1.
    colors_by_id = isinstance(colors, dict)

    for comm_id, nodes in comm_nodes.items():
        if len(nodes) < 3:
            continue

        # Get positions for this community
        points = np.array([pos[n] for n in nodes if n in pos])

        if len(points) < 3:
            continue

        if colors_by_id:
            if comm_id not in colors:
                continue
            color = colors[comm_id]
        else:
            color = colors[comm_id % len(colors)]

        try:
            # Calculate convex hull
            hull = ConvexHull(points)
        except (RuntimeError, ValueError):
            # Degenerate geometry: best effort, just skip this hull
            continue

        # Get hull vertices and repeat the first one to close the polygon
        hull_points = points[hull.vertices]
        hull_points = np.vstack([hull_points, hull_points[0]])

        # Draw filled polygon plus outline
        ax.fill(hull_points[:, 0], hull_points[:, 1],
               color=color, alpha=0.2, zorder=0)
        ax.plot(hull_points[:, 0], hull_points[:, 1],
               color=color, linewidth=2, alpha=0.6, zorder=1)

def visualize_leiden_communities(G, communities, argument_map=None, top_n=200, use_topic_colors=False):
    """
    Visualize network with Leiden communities properly grouped

    Draws community hulls, edges, nodes (sized by weighted degree), labels for
    the 30 highest-degree nodes and a legend for the 10 largest communities,
    then saves the figure as 'leiden_communities_grouped.png' and shows it.

    Parameters:
    -----------
    G : NetworkX graph to draw
    communities : dict mapping node -> community id
    argument_map : optional dict mapping node -> topic label
    top_n : number of highest-degree nodes to include
    use_topic_colors : bool
        If True and argument_map exists, color by dominant topic (Bitcoin/NVIDIA)
        If False, use diverse colors for each community
    """
    print("\nüé® Generating Leiden community visualization...")

    # Get community-based layout
    G_sub, pos, filtered_communities, comm_nodes = get_community_layout(G, communities, top_n)

    print(f"   Visualizing {G_sub.number_of_nodes()} nodes in {len(comm_nodes)} communities")

    # Calculate dominant topic for each community
    comm_dominant_topic = {}
    if argument_map:
        for comm_id, nodes in comm_nodes.items():
            topics = [argument_map.get(n, 'Unknown') for n in nodes if n in argument_map]
            if topics:
                topic_counts = Counter(topics)
                dominant = topic_counts.most_common(1)[0]
                comm_dominant_topic[comm_id] = {
                    'topic': dominant[0],
                    'purity': dominant[1] / len(topics),
                    'count': dominant[1],
                    'total': len(topics)
                }

    # Prepare colors
    n_communities = len(comm_nodes)
    colors = {}

    # Check if we have mixed topics (both Bitcoin and NVIDIA).
    # Labels go through str() before lower() so a non-string label (e.g. NaN)
    # cannot crash the plot.
    has_mixed_topics = False
    if argument_map and comm_dominant_topic:
        topics_present = {str(info['topic']).lower() for info in comm_dominant_topic.values()}
        has_mixed_topics = (any('bitcoin' in t for t in topics_present)
                            and any('nvidia' in t for t in topics_present))

    # Use topic-based colors only if explicitly requested AND we have mixed topics
    if use_topic_colors and argument_map and comm_dominant_topic and has_mixed_topics:
        # Use different color schemes for Bitcoin vs NVIDIA communities
        bitcoin_colors = plt.cm.Oranges(np.linspace(0.4, 0.9, n_communities))
        nvidia_colors = plt.cm.Greens(np.linspace(0.4, 0.9, n_communities))
        mixed_colors = plt.cm.Purples(np.linspace(0.4, 0.9, n_communities))

        bitcoin_idx = 0
        nvidia_idx = 0
        mixed_idx = 0

        for comm_id in sorted(comm_nodes.keys()):
            if comm_id in comm_dominant_topic:
                topic = str(comm_dominant_topic[comm_id]['topic']).lower()
                purity = comm_dominant_topic[comm_id]['purity']

                # A community counts as topic-dominant above 60% purity
                if 'bitcoin' in topic and purity > 0.6:
                    colors[comm_id] = bitcoin_colors[bitcoin_idx % len(bitcoin_colors)]
                    bitcoin_idx += 1
                elif 'nvidia' in topic and purity > 0.6:
                    colors[comm_id] = nvidia_colors[nvidia_idx % len(nvidia_colors)]
                    nvidia_idx += 1
                else:
                    colors[comm_id] = mixed_colors[mixed_idx % len(mixed_colors)]
                    mixed_idx += 1
            else:
                # No topic information for this community
                colors[comm_id] = plt.cm.Greys(0.5)

        color_mode = 'topic'
    else:
        # Use diverse colors for each community
        if n_communities <= 20:
            color_array = plt.cm.tab20(np.linspace(0, 1, 20))
        else:
            color_array = plt.cm.gist_rainbow(np.linspace(0, 1, n_communities))

        for i, comm_id in enumerate(sorted(comm_nodes.keys())):
            colors[comm_id] = color_array[i % len(color_array)]

        color_mode = 'community'

    # Create figure
    fig, ax = plt.subplots(figsize=(20, 16))
    ax.set_facecolor('white')

    # Draw community hulls first
    draw_community_hulls(ax, pos, comm_nodes, filtered_communities, colors)

    # Draw edges; opacity and width scale with weight relative to the heaviest edge
    edge_list = list(G_sub.edges())
    if edge_list:
        weights = [G_sub[u][v].get('weight', 1) for u, v in edge_list]
        max_weight = max(weights) if weights else 1

        for (u, v), weight in zip(edge_list, weights):
            x = [pos[u][0], pos[v][0]]
            y = [pos[u][1], pos[v][1]]
            alpha = 0.1 + 0.3 * (weight / max_weight)
            width = 0.3 + 1.0 * (weight / max_weight)
            ax.plot(x, y, color='gray', alpha=alpha, linewidth=width, zorder=2)

    # Draw nodes (degrees come from the full graph G, not the subgraph)
    degrees = dict(G.degree(weight='weight'))

    for node in G_sub.nodes():
        if node not in pos or node not in filtered_communities:
            continue

        x, y = pos[node]
        comm_id = filtered_communities[node]
        color = colors[comm_id]

        # Node size based on degree
        size = 50 + degrees.get(node, 0) * 3

        ax.scatter(x, y, s=size, c=[color], alpha=0.9,
                  edgecolors='black', linewidths=1.5, zorder=3)

    # Draw labels for larger nodes only
    top_degree_nodes = sorted(degrees.items(), key=lambda x: x[1], reverse=True)[:30]
    for node, _ in top_degree_nodes:
        if node in pos:
            x, y = pos[node]
            ax.text(x, y, node, fontsize=7, fontweight='bold',
                   ha='center', va='center', zorder=4)

    # Create legend with topic information
    comm_sizes = Counter(filtered_communities.values())
    top_comms = sorted(comm_sizes.keys(), key=lambda c: comm_sizes[c], reverse=True)[:10]

    legend_elements = []
    for comm_id in top_comms:
        color = colors[comm_id]
        size = comm_sizes[comm_id]

        # Add topic info if in topic color mode
        if color_mode == 'topic' and comm_id in comm_dominant_topic:
            topic_info = comm_dominant_topic[comm_id]
            topic = topic_info['topic']
            purity = topic_info['purity']
            label = f'Community {comm_id}: {size} users - {topic} ({purity*100:.0f}% pure)'
        else:
            label = f'Community {comm_id} ({size} users)'

        legend_elements.append(
            plt.Line2D([0], [0], marker='o', color='w',
                      markerfacecolor=color, markersize=12,
                      label=label,
                      markeredgecolor='black', markeredgewidth=1.5)
        )

    # Add topic color legend only if in topic color mode
    if color_mode == 'topic':
        legend_elements.append(plt.Line2D([0], [0], linestyle='none', label=''))  # Spacer
        legend_elements.append(plt.Line2D([0], [0], marker='s', color='w',
                              markerfacecolor='orange', markersize=12,
                              label='Bitcoin-dominant (>60%)',
                              markeredgecolor='black', markeredgewidth=1.5))
        legend_elements.append(plt.Line2D([0], [0], marker='s', color='w',
                              markerfacecolor='green', markersize=12,
                              label='NVIDIA-dominant (>60%)',
                              markeredgecolor='black', markeredgewidth=1.5))
        legend_elements.append(plt.Line2D([0], [0], marker='s', color='w',
                              markerfacecolor='purple', markersize=12,
                              label='Mixed topics',
                              markeredgecolor='black', markeredgewidth=1.5))

    ax.legend(handles=legend_elements, loc='upper left', fontsize=9,
             title='Top 10 Communities', framealpha=0.95,
             title_fontsize=11, edgecolor='black')

    # Update title
    if color_mode == 'topic':
        bitcoin_comms = sum(1 for c in comm_dominant_topic.values()
                           if 'bitcoin' in str(c['topic']).lower() and c['purity'] > 0.6)
        nvidia_comms = sum(1 for c in comm_dominant_topic.values()
                          if 'nvidia' in str(c['topic']).lower() and c['purity'] > 0.6)
        # Everything that is not topic-dominant is reported as mixed
        mixed_comms = n_communities - bitcoin_comms - nvidia_comms

        title_text = (f'LEIDEN COMMUNITY DETECTION\n'
                     f'{G_sub.number_of_nodes()} users grouped into {n_communities} communities\n'
                     f'üü† {bitcoin_comms} Bitcoin-dominant | üü¢ {nvidia_comms} NVIDIA-dominant | üü£ {mixed_comms} Mixed\n'
                     f'Node size = weighted degree | Colored areas = community boundaries')
    else:
        title_text = (f'LEIDEN COMMUNITY DETECTION\n'
                     f'{G_sub.number_of_nodes()} users grouped into {n_communities} communities\n'
                     f'Node size = weighted degree | Colored areas = community boundaries')

    ax.set_title(title_text, fontsize=18, fontweight='bold', pad=20)

    ax.axis('equal')
    ax.axis('off')

    plt.tight_layout()
    plt.savefig('leiden_communities_grouped.png', dpi=300, bbox_inches='tight', facecolor='white')
    plt.show()
    print("üíæ Saved: leiden_communities_grouped.png")

def analyze_communities(G, communities, argument_map=None):
    """Print a size summary plus details of the 10 largest communities.

    For each of the top communities the report shows its internal edge count,
    density, up to three highest-degree members and, when ``argument_map`` is
    given, the most frequent topic among its mapped members.

    Returns the ``Counter`` mapping community id -> member count.
    """
    rule = "=" * 70
    print("\n" + rule)
    print("üìä COMMUNITY ANALYSIS")
    print(rule)

    comm_sizes = Counter(communities.values())
    size_values = list(comm_sizes.values())

    print(f"\nTotal communities: {len(comm_sizes)}")
    print(f"Largest community: {max(size_values)} members")
    print(f"Smallest community: {min(size_values)} members")
    print(f"Average members per community: {np.mean(size_values):.1f}")

    print(f"\nüèÜ TOP 10 COMMUNITIES BY SIZE:")
    for rank, (comm_id, size) in enumerate(comm_sizes.most_common(10), start=1):
        members = [user for user, assigned in communities.items() if assigned == comm_id]

        community_graph = G.subgraph(members).copy()
        n_edges = community_graph.number_of_edges()
        if len(members) > 1:
            density = nx.density(community_graph)
        else:
            density = 0

        if n_edges > 0:
            # Rank members by their degree inside the community
            ranked = sorted(dict(community_graph.degree()).items(),
                            key=lambda item: item[1], reverse=True)
            top_names = [f"u/{user}" for user, _ in ranked[:3]]
        elif members:
            top_names = [f"u/{members[0]}"]
        else:
            top_names = []

        print(f"\n   {rank}. Community {comm_id}: {size} members")
        print(f"      Internal interactions: {n_edges}, Density: {density:.4f}")
        print(f"      Top members: {', '.join(top_names)}")

        if not argument_map:
            continue

        # Only members present in argument_map contribute to the topic tally
        member_topics = [argument_map.get(user, 'Unknown') for user in members if user in argument_map]
        if member_topics:
            topic, votes = Counter(member_topics).most_common(1)[0]
            print(f"      Dominant topic: {topic} ({votes}/{len(member_topics)} users)")

    return comm_sizes

def visualize_community_stats(comm_sizes, communities, argument_map=None):
    """Plot a 2x2 dashboard of community statistics.

    Panels: size histogram, 15 largest communities, topic purity of the 20
    largest communities (or a placeholder when ``argument_map`` is falsy) and
    cumulative user coverage. The figure is saved as 'leiden_statistics.png'
    and shown.
    """
    print("\nüé® Generating community statistics...")

    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    fig.patch.set_facecolor('white')
    ax_hist = axes[0, 0]
    ax_top = axes[0, 1]
    ax_purity = axes[1, 0]
    ax_cover = axes[1, 1]

    # --- Panel 1: community size distribution
    sizes = list(comm_sizes.values())
    n_bins = min(30, len(set(sizes)))
    ax_hist.hist(sizes, bins=n_bins, color='#4ECDC4', alpha=0.8, edgecolor='black')
    ax_hist.set_xlabel('Community Size (# users)', fontweight='bold')
    ax_hist.set_ylabel('Frequency', fontweight='bold')
    ax_hist.set_title('Community Size Distribution', fontweight='bold', fontsize=12)
    # Switch to a log axis only when sizes span more than one order of magnitude
    if max(sizes) / min(sizes) > 10:
        ax_hist.set_yscale('log')
    ax_hist.grid(alpha=0.3)

    # --- Panel 2: 15 largest communities
    largest = comm_sizes.most_common(15)
    bar_labels = [f'C{comm_id}' for comm_id, _ in largest]
    bar_values = [count for _, count in largest]

    gradient = plt.cm.viridis(np.linspace(0.3, 0.9, len(bar_values)))
    ax_top.barh(bar_labels, bar_values, color=gradient, alpha=0.8, edgecolor='black')
    ax_top.set_xlabel('Number of Users', fontweight='bold')
    ax_top.set_title('Top 15 Communities by Size', fontweight='bold', fontsize=12)
    ax_top.invert_yaxis()
    ax_top.grid(axis='x', alpha=0.3)

    # --- Panel 3: topic purity (if available)
    if argument_map:
        # Collect members per community in one pass over the assignment
        members_by_comm = {}
        for user, comm_id in communities.items():
            members_by_comm.setdefault(comm_id, []).append(user)

        community_purity = {}
        for comm_id, members in members_by_comm.items():
            labels = [argument_map[user] for user in members if user in argument_map]
            if not labels:
                continue
            top_label, top_count = Counter(labels).most_common(1)[0]
            community_purity[comm_id] = {
                'purity': top_count / len(labels),
                'dominant_arg': top_label,
                'size': len(members)
            }

        top_comms = [comm_id for comm_id, _ in comm_sizes.most_common(20)]
        purities = []
        dominant_args = []
        for comm_id in top_comms:
            entry = community_purity.get(comm_id, {})
            purities.append(entry.get('purity', 0))
            dominant_args.append(entry.get('dominant_arg', 'N/A'))

        colors_by_arg = []
        for label in dominant_args:
            lowered = str(label).lower()
            if 'bitcoin' in lowered:
                colors_by_arg.append('#FF6B6B')
            elif 'nvidia' in lowered:
                colors_by_arg.append('#4ECDC4')
            else:
                colors_by_arg.append('#95E1D3')

        ax_purity.barh([f'C{comm_id}' for comm_id in top_comms], purities,
                       color=colors_by_arg, alpha=0.8, edgecolor='black')
        ax_purity.set_xlabel('Topic Purity', fontweight='bold')
        ax_purity.set_title('Community Purity (Top 20)', fontweight='bold', fontsize=12)
        ax_purity.invert_yaxis()
        ax_purity.grid(axis='x', alpha=0.3)
        ax_purity.set_xlim([0, 1])

        # Legend explaining the bar colours
        from matplotlib.patches import Patch
        legend_elements = [
            Patch(facecolor='#FF6B6B', label='Bitcoin-dominant'),
            Patch(facecolor='#4ECDC4', label='NVIDIA-dominant'),
            Patch(facecolor='#95E1D3', label='Mixed')
        ]
        ax_purity.legend(handles=legend_elements, loc='lower right')
    else:
        ax_purity.text(0.5, 0.5, 'Topic data not available',
                       ha='center', va='center', fontsize=12)
        ax_purity.axis('off')

    # --- Panel 4: cumulative coverage, largest communities first
    descending = sorted(sizes, reverse=True)
    total_users = sum(comm_sizes.values())
    cumsum_pct = np.cumsum(descending) / total_users * 100

    ax_cover.plot(range(1, len(cumsum_pct)+1), cumsum_pct,
                  color='#FF6B6B', linewidth=3, marker='o', markersize=4)
    ax_cover.axhline(y=80, color='gray', linestyle='--', alpha=0.7, label='80% threshold')
    ax_cover.set_xlabel('Number of Communities', fontweight='bold')
    ax_cover.set_ylabel('Cumulative % of Users', fontweight='bold')
    ax_cover.set_title('Cumulative User Coverage', fontweight='bold', fontsize=12)
    ax_cover.grid(alpha=0.3)
    ax_cover.legend()
    ax_cover.set_ylim([0, 105])

    plt.suptitle('LEIDEN COMMUNITY DETECTION - Statistics', fontsize=16, fontweight='bold', y=0.995)
    plt.tight_layout()
    plt.savefig('leiden_statistics.png', dpi=300, bbox_inches='tight', facecolor='white')
    plt.show()
    print("üíæ Saved: leiden_statistics.png")

# ==========================================
# MAIN PIPELINE
# ==========================================

def analyze_reddit_communities(df, argument_name='All', resolution=1.0, top_n_viz=200, use_topic_colors=False):
    """
    Run the whole community-detection workflow on a table of Reddit comments.

    A directed reply graph is built first (comment author -> parent author,
    edge weight = number of such replies).  The graph is then handed to
    detect_communities_leiden, summarised with analyze_communities and plotted
    with the two visualisation helpers.

    Parameters:
    -----------
    df : DataFrame
        Needs the columns 'text_author' and 'comment_parent_author'.  When an
        'argument' column is present, the first value found for each author is
        used as that author's topic.
    argument_name : str
        Label shown in the console banners only.
    resolution : float
        Forwarded unchanged to detect_communities_leiden.
    top_n_viz : int
        Forwarded to visualize_leiden_communities as top_n.
    use_topic_colors : bool
        If True, color communities by dominant topic (for combined analysis)
        If False, use diverse colors for each community (for single-topic analysis)

    Returns:
    --------
    dict with the keys 'network', 'communities', 'modularity', 'comm_sizes'
    and 'partition'.
    """
    rule = "="*70

    print("\n" + rule)
    print(f"üöÄ LEIDEN COMMUNITY DETECTION - {argument_name.upper()}")
    print(rule)
    print(f"Dataset: {len(df)} comments")

    # 1. Build network
    print("\nüìä Building network...")
    G = nx.DiGraph()

    for _, record in df.iterrows():
        author = record['text_author']
        target = record['comment_parent_author']

        # Rows without a parent author and self-replies add no edge
        if pd.isna(target) or author == target:
            continue

        if not G.has_edge(author, target):
            G.add_edge(author, target, weight=1)
        else:
            G[author][target]['weight'] += 1

    print(f"   Network: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges")

    # 2. Detect communities
    communities, modularity, partition, G_clean = detect_communities_leiden(G, resolution=resolution)

    # 3. Analyze communities (topic per author is optional)
    argument_map = None
    if 'argument' in df.columns:
        argument_map = df.groupby('text_author')['argument'].first().to_dict()

    comm_sizes = analyze_communities(G_clean, communities, argument_map)

    # 4. Visualize
    print("\n" + rule)
    print("üé® GENERATING VISUALIZATIONS")
    print(rule)

    visualize_leiden_communities(G_clean, communities, argument_map, top_n=top_n_viz, use_topic_colors=use_topic_colors)
    visualize_community_stats(comm_sizes, communities, argument_map)

    print(f"\n‚úÖ COMMUNITY DETECTION COMPLETED: {argument_name}")

    summary = {
        'network': G_clean,
        'communities': communities,
        'modularity': modularity,
        'comm_sizes': comm_sizes,
        'partition': partition
    }
    return summary

# ==========================================
# EXECUTION
# ==========================================

def _rename_leiden_outputs(prefix):
    """Rename the generic Leiden PNGs to '<prefix>_leiden_*.png'.

    An older file with the target name is overwritten; a source file that does
    not exist is skipped silently.
    """
    import os

    targets = {
        'leiden_communities_grouped.png': f'{prefix}_leiden_communities.png',
        'leiden_statistics.png': f'{prefix}_leiden_statistics.png',
    }
    for source, target in targets.items():
        if os.path.exists(source):
            # os.replace overwrites an existing target, so no separate remove is needed
            os.replace(source, target)


def _print_leiden_outputs(prefix):
    """Print the two output file names for *prefix*, flagging any that are missing."""
    import os

    for suffix in ('communities', 'statistics'):
        filename = f'{prefix}_leiden_{suffix}.png'
        note = '' if os.path.exists(filename) else ' (not generated)'
        print(f"      - {filename}{note}")


if __name__ == "__main__":
    # Kept at this level so `os` stays available to later cells, as before
    import os

    print("\n" + "üåü"*35)
    print("  REDDIT LEIDEN COMMUNITY DETECTION")
    print("üåü"*35)

    # Load data
    df = pd.read_excel("threads_df.xlsx")

    print(f"\nüìä Total dataset: {len(df)} comments")

    # Filter valid comments: a parent author must exist and differ from the author
    df_filtered = df[pd.notna(df['comment_parent_author'])].copy()
    df_filtered = df_filtered[df_filtered['text_author'] != df_filtered['comment_parent_author']]

    print(f"üìä Comments with valid interactions: {len(df_filtered)}")

    # The per-topic runs below are only possible when the 'argument' column exists
    has_argument = 'argument' in df_filtered.columns
    if has_argument:
        arg_counts = df_filtered['argument'].value_counts()
        print(f"\nüìà Argument distribution:")
        for arg, count in arg_counts.items():
            print(f"   {arg}: {count} comments")

    # ==========================================
    # 1. COMBINED ANALYSIS (ALL DATA)
    # ==========================================

    print("\n\n" + "üü£"*35)
    print("  COMBINED ANALYSIS (BITCOIN + NVIDIA)")
    print("üü£"*35)

    results_combined = analyze_reddit_communities(
        df=df_filtered,
        argument_name='Combined',
        resolution=1.0,
        top_n_viz=200,
        use_topic_colors=True  # Use topic-based colors for combined
    )

    # Every run writes the same generic file names, so they are renamed
    # right away to avoid being overwritten by the next run
    _rename_leiden_outputs('combined')

    # ==========================================
    # 2. BITCOIN ANALYSIS
    # ==========================================

    if has_argument:
        print("\n\n" + "üü†"*35)
        print("  BITCOIN ANALYSIS")
        print("üü†"*35)

        df_bitcoin = df_filtered[df_filtered['argument'].str.lower() == 'bitcoin'].copy()

        if len(df_bitcoin) > 0:
            results_bitcoin = analyze_reddit_communities(
                df=df_bitcoin,
                argument_name='Bitcoin',
                resolution=1.0,
                top_n_viz=200,
                use_topic_colors=False  # Use diverse colors for single topic
            )
            _rename_leiden_outputs('bitcoin')
        else:
            print("‚ö†Ô∏è  No comments found for Bitcoin!")

        # ==========================================
        # 3. NVIDIA ANALYSIS
        # ==========================================

        print("\n\n" + "üü¢"*35)
        print("  NVIDIA ANALYSIS")
        print("üü¢"*35)

        df_nvidia = df_filtered[df_filtered['argument'].str.lower() == 'nvidia'].copy()

        if len(df_nvidia) > 0:
            results_nvidia = analyze_reddit_communities(
                df=df_nvidia,
                argument_name='NVIDIA',
                resolution=1.0,
                top_n_viz=200,
                use_topic_colors=False  # Use diverse colors for single topic
            )
            _rename_leiden_outputs('nvidia')
        else:
            print("‚ö†Ô∏è  No comments found for NVIDIA!")

    # ==========================================
    # FINAL SUMMARY
    # ==========================================

    print("\n\n" + "="*70)
    print("üéâ COMPLETE ANALYSIS FINISHED!")
    print("="*70)

    print(f"\nüìÅ GENERATED FILES:")

    print(f"\n   üü£ Combined (Bitcoin + NVIDIA):")
    _print_leiden_outputs('combined')

    if has_argument:
        print(f"\n   üü† Bitcoin:")
        _print_leiden_outputs('bitcoin')

        print(f"\n   üü¢ NVIDIA:")
        _print_leiden_outputs('nvidia')

    print("\n" + "="*70)
    print("‚ú® All analyses completed successfully! ‚ú®")
    print("="*70)

# Threads and Tweets df

In [None]:
# Load the tweets table and the threads table from their Excel files.
tweets_df = pd.read_excel('tweets_df.xlsx')
threads_df = pd.read_excel('threads_df.xlsx')

In [None]:
# Stack both tables row-wise (tweets first) with a fresh 0..n-1 index,
# then persist the combined table without writing the index column.
tnt_df = pd.concat([tweets_df, threads_df], ignore_index=True)
tnt_df.to_excel('tnt_df.xlsx', index=False)

In [None]:
# Preview the first rows of the combined table.
tnt_df.head()

## Text cleaning, lemmatization, vectorization

In [None]:
# Reload the combined tweets + threads table from disk.
tnt_df = pd.read_excel("tnt_df.xlsx")

In [None]:
# ============================================================================
# REQUIRED LIBRARIES
# ============================================================================
import re
import pandas as pd
import math
import os
from collections import Counter
from tqdm import tqdm
import spacy
import nltk
from nltk.corpus import stopwords as nltk_stopwords
from sklearn.feature_extraction.text import CountVectorizer

# Download NLTK stopwords only when needed: the corpus is probed first and the
# download is triggered when that probe raises LookupError.
try:
    nltk_stopwords.words('english')
except LookupError:
    nltk.download('stopwords')

# ============================================================================
# 1. TEXT CLEANING FUNCTION
# ============================================================================

def clean_text(text, remove_hashtag_symbol=True, remove_cashtag_before_words=True,
               remove_emojis=True, keep_dollar_numbers=True, lowercase=True):
    """Normalize a raw social-media post into plain text.

    Steps, in order: strip URLs, retweet/"via" prefixes, a fixed list of HTML
    entities and @mentions; optionally strip emojis/symbols; handle the '$'
    and '#' markers; collapse whitespace; optionally lowercase.

    Parameters
    ----------
    text : object
        Anything that is not a ``str`` yields an empty string.
    remove_hashtag_symbol : bool
        Drop '#' but keep the word that follows it.
    remove_cashtag_before_words : bool
        Drop '$' when it is directly followed by an ASCII letter.
    remove_emojis : bool
        Remove every character that is not a word character, whitespace or one
        of ``, . ' ! ? -``.  Markers the other flags ask to keep ('$' before a
        digit, '$' before a letter, '#' before a word) are exempt from this
        pass, so those flags keep their meaning.
    keep_dollar_numbers : bool
        Keep '$' when it is directly followed by a digit (e.g. "$100").
        When False that '$' is removed.
    lowercase : bool
        Lowercase the final result.

    Returns
    -------
    str
        The cleaned text.
    """
    if not isinstance(text, str):
        return ""

    # Remove URLs
    text = re.sub(r"(f|ht)(tp)(s?)(://)(.*?)([\s]|$)", " ", text)

    # Remove RT patterns
    text = re.sub(r"(RT|rt|via)((?:\b\W*@\w+)+)", " ", text)

    # Remove HTML entities
    html_entities = ["&copy;", "&reg;", "&trade;", "&ldquo;", "&lsquo;", "&rsquo;",
                     "&bull;", "&middot;", "&ndash;", "&mdash;", "&nbsp;", "&lt;",
                     "&gt;", "&amp;", "&quot;"]
    for entity in html_entities:
        text = text.replace(entity, " ")

    # Remove mentions (@username)
    text = re.sub(r"@\S+", " ", text)

    # Remove emojis and other symbols
    if remove_emojis:
        # Markers that the other flags want to keep must survive this pass;
        # previously every '$' and '#' was deleted here unconditionally.
        protected = []
        if keep_dollar_numbers:
            protected.append(r"\$(?=\d)")
        if not remove_cashtag_before_words:
            protected.append(r"\$(?=[A-Za-z])")
        if not remove_hashtag_symbol:
            protected.append(r"#(?=\w)")

        symbol_pattern = r"[^\w\s,.\'!?-]"
        if protected:
            symbol_pattern = "(?!" + "|".join(protected) + ")" + symbol_pattern
        text = re.sub(symbol_pattern, "", text)

    # Handle $ symbol: remove it before letters ...
    if remove_cashtag_before_words:
        text = re.sub(r'\$(?=[A-Za-z])', '', text)

    # ... and before numbers only when the caller does not want to keep it
    if not keep_dollar_numbers:
        text = re.sub(r'\$(?=\d)', '', text)

    # Handle # symbol: remove '#' but keep the word after it
    if remove_hashtag_symbol:
        text = re.sub(r'#(\w+)', r'\1', text)

    # Normalize whitespace
    text = re.sub(r"\s+", " ", text).strip()

    # Lowercase if requested
    if lowercase:
        text = text.lower()

    return text


# ============================================================================
# 2. SPACY UTILITIES & COLLOCATIONS
# ============================================================================

def load_spacy_model(model_name="en_core_web_sm"):
    """Return the spaCy pipeline named *model_name*.

    An OSError from ``spacy.load`` is re-raised as FileNotFoundError whose
    message contains the command that installs the model.
    """
    try:
        return spacy.load(model_name)
    except OSError:
        hint = (f"Cannot load spaCy model: {model_name}. "
                f"Run: python -m spacy download {model_name}")
        raise FileNotFoundError(hint)


def annotate_texts(nlp, texts, show_progress=True):
    """Run *nlp* over every text and return one token list per text.

    Each token is a dict with the keys 'form' (surface text), 'lemma' and
    'upos' (coarse part-of-speech tag).  Missing values (None/NaN) are
    processed as an empty string.  With ``show_progress`` the input is wrapped
    in a tqdm progress bar.
    """
    source = texts
    if show_progress:
        source = tqdm(texts, desc="Annotating")

    annotated = []
    for raw in source:
        content = "" if pd.isna(raw) else str(raw)
        annotated.append([
            dict(form=tok.text, lemma=tok.lemma_, upos=tok.pos_)
            for tok in nlp(content)
        ])

    return annotated


def extract_collocations_POS(texts, nlp, pos_patterns=(('ADJ', 'NOUN'), ('NOUN', 'NOUN'),
                                                       ('NOUN', 'PROPN'), ('PROPN', 'PROPN')),
                             min_freq=2, save_file="colloc_POS.xlsx", verbose=True):
    """Count adjacent token pairs whose POS tags match one of *pos_patterns*.

    Parameters
    ----------
    texts : iterable
        Documents, annotated through ``annotate_texts``.
    nlp : callable
        spaCy pipeline passed on to ``annotate_texts``.
    pos_patterns : iterable of 2-item sequences
        Accepted (first tag, second tag) combinations.
    min_freq : int
        Pairs seen fewer times than this are dropped.
    save_file : str or None
        When truthy, the result is written to this Excel file.
    verbose : bool
        Show the progress bar and the save message.

    Returns
    -------
    DataFrame
        Columns 'collocation', 'freq', 'pos_pattern', sorted by descending
        frequency.  The frame is empty (but keeps its columns) when no pair
        qualifies.
    """
    annotated = annotate_texts(nlp, texts, show_progress=verbose)
    wanted = {tuple(p) for p in pos_patterns}  # set gives constant-time lookup
    counts = Counter()
    pattern_map = {}

    for doc in annotated:
        for t1, t2 in zip(doc, doc[1:]):
            pattern = (t1['upos'], t2['upos'])
            if pattern not in wanted:
                continue

            # '_' is treated as "no lemma": fall back to the surface form
            w1 = (t1['lemma'] if t1['lemma'] != '_' else t1['form']).lower()
            w2 = (t2['lemma'] if t2['lemma'] != '_' else t2['form']).lower()
            colloc = f"{w1} {w2}"
            counts[colloc] += 1
            # If a pair occurs under several tag patterns, the last one seen is reported
            pattern_map[colloc] = f"{pattern[0]} {pattern[1]}"

    rows = [{'collocation': c, 'freq': f, 'pos_pattern': pattern_map[c]}
            for c, f in counts.items() if f >= min_freq]

    # Explicit columns keep sort_values from raising KeyError on an empty result
    df = pd.DataFrame(rows, columns=['collocation', 'freq', 'pos_pattern'])
    df = df.sort_values('freq', ascending=False).reset_index(drop=True)

    if save_file:
        df.to_excel(save_file, index=False)
        if verbose:
            print(f"‚úì Saved {save_file} ({len(df)} collocations)")

    return df


def extract_collocations_PMI(texts, nlp, top_n=200, min_freq=2,
                             save_file="colloc_PMI.xlsx", verbose=True):
    """Rank adjacent lemma pairs by pointwise mutual information (PMI).

    Tokens tagged 'PUNCT' are ignored.  Probabilities are estimated from the
    unigram count N (and N-1 for bigrams), and PMI is
    ``log2(p(w1, w2) / (p(w1) * p(w2)))``.

    Parameters
    ----------
    texts : iterable
        Documents, annotated through ``annotate_texts``.
    nlp : callable
        spaCy pipeline passed on to ``annotate_texts``.
    top_n : int or None
        Keep only the best *top_n* rows; a falsy value keeps everything.
    min_freq : int
        Bigrams seen fewer times than this are dropped.
    save_file : str or None
        When truthy, the result is written to this Excel file.
    verbose : bool
        Show the progress bar and the save message.

    Returns
    -------
    DataFrame
        Columns 'collocation', 'freq', 'pmi', sorted by descending PMI then
        frequency.  The frame is empty (but keeps its columns) when no bigram
        qualifies.
    """
    annotated = annotate_texts(nlp, texts, show_progress=verbose)
    unigram = Counter()
    bigram = Counter()  # keyed by (lemma1, lemma2) so lemmas never need re-splitting
    total_unigrams = 0

    for doc in annotated:
        lemmas = []
        for tok in doc:
            if tok['upos'] == 'PUNCT':
                continue
            # '_' is treated as "no lemma": fall back to the surface form
            lemma = (tok['lemma'] if tok['lemma'] != '_' else tok['form']).lower()
            lemmas.append(lemma)
            unigram[lemma] += 1
            total_unigrams += 1

        bigram.update(zip(lemmas, lemmas[1:]))

    N = max(total_unigrams, 1)
    rows = []

    for (w1, w2), freq in bigram.items():
        if freq < min_freq:
            continue

        p_w1 = unigram[w1] / N
        p_w2 = unigram[w2] / N
        p_w1w2 = freq / max(1, N-1)

        if p_w1 > 0 and p_w2 > 0 and p_w1w2 > 0:
            pmi = math.log2(p_w1w2 / (p_w1 * p_w2))
        else:
            pmi = float('-inf')

        rows.append({'collocation': f"{w1} {w2}", 'freq': freq, 'pmi': pmi})

    # Explicit columns keep sort_values from raising KeyError on an empty result
    df = pd.DataFrame(rows, columns=['collocation', 'freq', 'pmi'])
    df = df.sort_values(['pmi', 'freq'], ascending=[False, False])

    if top_n:
        df = df.head(top_n)

    df = df.reset_index(drop=True)

    if save_file:
        df.to_excel(save_file, index=False)
        if verbose:
            print(f"‚úì Saved {save_file} ({len(df)} collocations)")

    return df


def apply_collocations(texts, colloc_file, verbose=True):
    """Join known multi-word collocations with underscores inside each text.

    The collocations come from the 'collocation' column of the Excel file
    *colloc_file*.  Phrases with more words are applied first, matching is
    case-insensitive and bounded by word boundaries.  If the file is missing,
    *texts* is returned untouched; otherwise a new list is returned in which
    missing values (None/NaN) are passed through as they are.
    """
    if not os.path.exists(colloc_file):
        if verbose:
            print(f"‚ö† File not found: {colloc_file}. Skipping.")
        return texts

    table = pd.read_excel(colloc_file)
    phrases = sorted(table['collocation'].dropna().unique(),
                     key=lambda s: len(s.split()), reverse=True)

    def build_rule(phrase):
        # Any run of whitespace may separate the words of the phrase
        words = phrase.split()
        body = r'\s+'.join(re.escape(word) for word in words)
        return re.compile(rf'\b{body}\b', flags=re.IGNORECASE), "_".join(words)

    rules = [build_rule(phrase) for phrase in phrases]

    source = texts
    if verbose:
        source = tqdm(texts, desc=f"Applying {os.path.basename(colloc_file)}")

    replaced = []
    for item in source:
        if pd.isna(item):
            replaced.append(item)
            continue

        current = str(item)
        for regex, joined in rules:
            current = regex.sub(joined, current)
        replaced.append(current)

    return replaced


def apply_collocation_pipeline(df, text_col='text_cleaned', nlp=None, verbose=True):
    """Extract POS-based and then PMI-based collocations and merge them into the text.

    ``df[text_col]`` is overwritten in place after each stage, so the PMI stage
    already sees the underscore-joined POS collocations.  Both stages go
    through the Excel files written by the extractors ("colloc_POS.xlsx" and
    "colloc_PMI.xlsx").

    Returns the tuple (df, POS collocation table, PMI collocation table).
    """
    nlp = load_spacy_model() if nlp is None else nlp
    rule = "="*60

    def announce(heading):
        if verbose:
            print("\n" + rule)
            print(heading)
            print(rule)

    # Step 1: POS collocations
    announce("STEP 1: POS-based collocations")
    pos_df = extract_collocations_POS(df[text_col], nlp, min_freq=1, verbose=verbose)
    df[text_col] = apply_collocations(df[text_col], "colloc_POS.xlsx", verbose=verbose)

    # Step 2: PMI collocations
    announce("STEP 2: PMI-based collocations")
    pmi_df = extract_collocations_PMI(df[text_col], nlp, top_n=200, min_freq=2, verbose=verbose)
    df[text_col] = apply_collocations(df[text_col], "colloc_PMI.xlsx", verbose=verbose)

    return df, pos_df, pmi_df


# ============================================================================
# 3. LEMMATIZATION WITH STOPWORDS
# ============================================================================

def lemmatize_texts(texts, nlp=None, stopwords_list=None, verbose=True):
    """Tokenize and lemmatize *texts*, flagging stopwords.

    Parameters
    ----------
    texts : sequence
        Documents; missing values (None/NaN) are processed as "".
    nlp : callable, optional
        spaCy pipeline; loaded with ``load_spacy_model`` when omitted.
    stopwords_list : iterable of str, optional
        Stopwords (compared case-insensitively); defaults to the NLTK English list.
    verbose : bool
        Show a progress bar and a short summary.

    Returns
    -------
    DataFrame
        One row per non-punctuation, non-space token with the columns
        'doc_id' ("doc_<position>"), 'token_id' (1-based position in the
        parsed document), 'token', 'lemma', 'upos' and the boolean 'STOP'.
        The columns are present even when no token was produced.
    """
    if nlp is None:
        nlp = load_spacy_model()

    if stopwords_list is None:
        stopwords_list = list(nltk_stopwords.words('english'))

    stopwords_lower = {w.lower() for w in stopwords_list}

    results = []
    iterator = tqdm(enumerate(texts), total=len(texts), desc="Lemmatizing") if verbose else enumerate(texts)

    for doc_idx, text in iterator:
        doc = nlp(str(text) if pd.notna(text) else "")
        doc_id = f"doc_{doc_idx}"

        for token_idx, token in enumerate(doc):
            if token.is_punct or token.is_space:
                continue

            # A token counts as a stopword if its surface form or its lemma is listed
            is_stopword = (token.text.lower() in stopwords_lower or
                           token.lemma_.lower() in stopwords_lower)

            results.append({
                'doc_id': doc_id,
                'token_id': token_idx + 1,
                'token': token.text,
                'lemma': token.lemma_,
                'upos': token.pos_,
                'STOP': is_stopword
            })

    # Fixed columns (and a boolean STOP) so that an empty result still supports
    # the df['STOP'] / ~df['STOP'] accesses below and in the callers
    df_lem = pd.DataFrame(results, columns=['doc_id', 'token_id', 'token', 'lemma', 'upos', 'STOP'])
    df_lem['STOP'] = df_lem['STOP'].astype(bool)

    if verbose:
        print(f"\n‚úì Lemmatization complete: {len(df_lem)} tokens from {len(texts)} documents")
        print(f"  Stopwords: {df_lem['STOP'].sum()}")
        print(f"  Content words: {(~df_lem['STOP']).sum()}")

    return df_lem


def create_text_withstop(df, lemmatized_df, verbose=True):
    """Create text_lemmatized column WITH stopwords (all tokens).

    The lemmas of *lemmatized_df* are joined with single spaces per 'doc_id';
    row position ``i`` of *df* receives the text of "doc_<i>", or "" when that
    document has no tokens.  *df* is modified in place and also returned.
    """
    if verbose:
        print("\nCreating text_lemmatized column (with stopwords)...")

    # Group once instead of filtering the whole token table for every document
    if len(lemmatized_df) and 'doc_id' in lemmatized_df.columns:
        joined = lemmatized_df.groupby('doc_id', sort=False)['lemma'].agg(' '.join).to_dict()
    else:
        joined = {}

    df['text_lemmatized'] = [joined.get(f"doc_{i}", '') for i in range(len(df))]

    if verbose:
        avg_tokens = df['text_lemmatized'].str.split().str.len().mean()
        print(f"‚úì Created text_lemmatized column")
        print(f"  Avg tokens: {avg_tokens:.1f}")

    return df


def create_text_nostop(df, lemmatized_df, verbose=True):
    """Create text_nostop column WITHOUT stopwords (content words only).

    Only rows of *lemmatized_df* whose 'STOP' flag is False are used.  Their
    lemmas are joined with single spaces per 'doc_id'; row position ``i`` of
    *df* receives the text of "doc_<i>", or "" when nothing is left for that
    document.  *df* is modified in place and also returned.
    """
    if verbose:
        print("\nCreating text_nostop column (without stopwords)...")

    # Filter and group once instead of scanning the token table per document
    if len(lemmatized_df) and 'doc_id' in lemmatized_df.columns:
        content = lemmatized_df[~lemmatized_df['STOP'].astype(bool)]
        joined = content.groupby('doc_id', sort=False)['lemma'].agg(' '.join).to_dict()
    else:
        joined = {}

    df['text_nostop'] = [joined.get(f"doc_{i}", '') for i in range(len(df))]

    if verbose:
        avg_tokens = df['text_nostop'].str.split().str.len().mean()
        print(f"‚úì Created text_nostop column")
        print(f"  Avg tokens: {avg_tokens:.1f}")

    return df


# ============================================================================
# 4. TERM-DOCUMENT MATRIX
# ============================================================================

def create_document_term_matrix(texts, save_file="document_term_matrix.csv", verbose=True):
    """Build a term-by-document count matrix from *texts*.

    A token is any run of word characters (single characters included); text
    is lowercased and every term is kept (``min_df=1``).

    Parameters
    ----------
    texts : sequence of str
        One entry per document.
    save_file : str or None
        When truthy, the matrix is written to this CSV file.
    verbose : bool
        Print the matrix size and its sparsity.

    Returns
    -------
    (DataFrame, CountVectorizer)
        The dense matrix with terms as rows and 'doc_<i>' columns, and the
        fitted vectorizer.
    """
    vectorizer = CountVectorizer(
        lowercase=True,
        token_pattern=r'(?u)\b\w+\b',
        min_df=1
    )

    dtm_sparse = vectorizer.fit_transform(texts)
    n_docs, n_terms = dtm_sparse.shape

    # Transpose: rows=terms, columns=documents
    dtm_df = pd.DataFrame(
        dtm_sparse.toarray().T,
        index=vectorizer.get_feature_names_out(),
        columns=[f'doc_{i}' for i in range(n_docs)]
    )

    if verbose:
        # Sparsity is the share of ZERO cells; nnz / cells alone is the density
        n_cells = n_docs * n_terms
        density = dtm_sparse.nnz / n_cells if n_cells else 0.0
        sparsity = (1 - density) * 100
        print(f"\n‚úì DTM created: {dtm_df.shape[0]} terms √ó {dtm_df.shape[1]} documents")
        print(f"  Sparsity: {sparsity:.2f}%")

    if save_file:
        dtm_df.to_csv(save_file)
        if verbose:
            print(f"  Saved to: {save_file}")

    return dtm_df, vectorizer


# ============================================================================
# MAIN EXECUTION FUNCTION
# ============================================================================

def process_tweets(tnt_df, verbose=True):
    """Run the full text-processing pipeline on a copy of *tnt_df*.

    Stages: clean the 'text' column, extract and apply collocations,
    lemmatize, rebuild the text with and without stopwords, and build the
    document-term matrix from the stopword-free text.  Along the way
    'lemmatized_tokens.csv' is written here; the helpers write their own files.

    Returns the tuple (processed frame, token table, document-term matrix,
    POS collocations, PMI collocations).
    """
    rule = "="*60

    def section(heading):
        print("\n" + rule)
        print(heading)
        print(rule)

    def tidy(raw):
        return clean_text(raw, remove_hashtag_symbol=True, remove_cashtag_before_words=True,
                          remove_emojis=True, keep_dollar_numbers=True, lowercase=True)

    print(rule)
    print("NLP TEXT PROCESSING PIPELINE")
    print(rule)

    df = tnt_df.copy()

    # STEP 1: Text Cleaning
    section("STEP 1: Text Cleaning")
    df['text_cleaned'] = df['text'].apply(tidy)

    if verbose:
        print(f"\n‚úì Cleaned {len(df)} documents")
        print(f"  Avg length before: {df['text'].str.len().mean():.0f} chars")
        print(f"  Avg length after: {df['text_cleaned'].str.len().mean():.0f} chars")

    # STEP 2: Collocations
    section("STEP 2: Collocations")
    nlp = load_spacy_model()
    df, pos_df, pmi_df = apply_collocation_pipeline(df, text_col='text_cleaned', nlp=nlp, verbose=verbose)

    # STEP 3: Lemmatization
    section("STEP 3: Lemmatization")
    lemmatized_df = lemmatize_texts(df['text_cleaned'].tolist(), nlp=nlp, verbose=verbose)
    lemmatized_df.to_csv('lemmatized_tokens.csv', index=False)
    if verbose:
        print("‚úì Saved: lemmatized_tokens.csv")

    # STEP 4: Create text columns (with and without stopwords)
    section("STEP 4: Creating Text Columns")
    df = create_text_withstop(df, lemmatized_df, verbose=verbose)
    df = create_text_nostop(df, lemmatized_df, verbose=verbose)

    # STEP 5: Document-Term Matrix (using text_nostop)
    section("STEP 5: Document-Term Matrix")
    dtm_df, vectorizer = create_document_term_matrix(df['text_nostop'].tolist(), verbose=verbose)

    # Summary
    if verbose:
        section("PIPELINE COMPLETE!")

        content_mask = ~lemmatized_df['STOP']
        report = (
            f"\nüìä Summary:",
            f"  Documents processed: {len(df)}",
            f"  Total tokens: {len(lemmatized_df)}",
            f"  Content words: {content_mask.sum()}",
            f"  POS collocations: {len(pos_df)}",
            f"  PMI collocations: {len(pmi_df)}",
            f"  Vocabulary size: {dtm_df.shape[0]}",
            f"\nüíæ Files created:",
            f"  ‚úì colloc_POS.xlsx",
            f"  ‚úì colloc_PMI.xlsx",
            f"  ‚úì lemmatized_tokens.csv",
            f"  ‚úì document_term_matrix.csv",
            f"\nüìà Dataframe columns:",
            f"  ‚Ä¢ text_cleaned: cleaned text",
            f"  ‚Ä¢ text_lemmatized: lemmatized with stopwords",
            f"  ‚Ä¢ text_nostop: lemmatized without stopwords",
            f"\nüìà Top 10 content words:",
        )
        for line in report:
            print(line)

        top_words = lemmatized_df[content_mask]['lemma'].value_counts().head(10)
        for word, freq in top_words.items():
            print(f"  ‚Ä¢ {word}: {freq}")

    return df, lemmatized_df, dtm_df, pos_df, pmi_df


# ============================================================================
# RUN PIPELINE
# ============================================================================

if __name__ == "__main__":
    # Assuming tnt_df is already loaded from your scraping script
    # process_tweets returns, in this order: the processed frame, the token
    # table, the document-term matrix, the POS collocations and the PMI collocations.
    df_processed, lemmatized_tokens, dtm, pos_collocations, pmi_collocations = process_tweets(tnt_df)

In [None]:
# Persist the processed table (index column not written).
df_processed.to_excel('df_processed.xlsx', index=False)

In [None]:
# Reload the processed table from disk and preview its first rows.
df_processed = pd.read_excel('df_processed.xlsx')
df_processed.head()

## Sentiment Analysis

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
from nrclex import NRCLex

# ==========================================
# 1. SENTIMENT ANALYSIS
# ==========================================

def analyze_vader(df, text_col='text_lemmatized'):
    """VADER: optimized for social media.

    Scores every value of ``df[text_col]`` (converted with ``str``) and returns
    a new DataFrame with the columns 'vader_compound', 'vader_pos',
    'vader_neu', 'vader_neg' and 'vader_label'.  The label is 'positive' for a
    compound score >= 0.05, 'negative' for <= -0.05 and 'neutral' otherwise.
    """
    print("\nüîµ VADER Sentiment Analysis...")
    analyzer = SentimentIntensityAnalyzer()

    def label_for(compound):
        if compound >= 0.05:
            return 'positive'
        if compound <= -0.05:
            return 'negative'
        return 'neutral'

    rows = []
    for text in tqdm(df[text_col], desc="VADER"):
        scores = analyzer.polarity_scores(str(text))
        compound = scores['compound']
        rows.append({
            'vader_compound': compound,
            'vader_pos': scores['pos'],
            'vader_neu': scores['neu'],
            'vader_neg': scores['neg'],
            'vader_label': label_for(compound)
        })

    return pd.DataFrame(rows)

def analyze_textblob(df, text_col='text_lemmatized'):
    """TextBlob: polarity plus subjectivity.

    Scores every value of ``df[text_col]`` (converted with ``str``) and returns
    a new DataFrame with the columns 'textblob_polarity',
    'textblob_subjectivity' and 'textblob_label'.  The label is 'positive' for
    a polarity > 0.1, 'negative' for < -0.1 and 'neutral' otherwise.  A text
    that makes TextBlob raise is recorded as neutral with zero scores.
    """
    print("\nüîµ TextBlob Sentiment Analysis...")

    results = []
    for text in tqdm(df[text_col], desc="TextBlob"):
        try:
            sentiment = TextBlob(str(text)).sentiment
            polarity = sentiment.polarity
            subjectivity = sentiment.subjectivity
        except Exception:
            # Best effort per document; Exception (not a bare except) so that
            # KeyboardInterrupt/SystemExit still stop the loop
            polarity, subjectivity = 0, 0

        if polarity > 0.1:
            label = 'positive'
        elif polarity < -0.1:
            label = 'negative'
        else:
            label = 'neutral'

        results.append({
            'textblob_polarity': polarity,
            'textblob_subjectivity': subjectivity,
            'textblob_label': label
        })

    return pd.DataFrame(results)

def analyze_nrclex(df, text_col='text_lemmatized'):
    """NRCLex: the 8 basic emotions plus positive/negative.

    Scores every value of ``df[text_col]`` (converted with ``str``) and returns
    a new DataFrame with one 'nrc_<emotion>' frequency column per emotion and
    'nrc_dominant_emotion', the key with the highest frequency.  The dominant
    emotion is 'neutral' when no frequency is above zero or when NRCLex raises
    for that text (all frequencies are then recorded as 0).
    """
    print("\nüòä NRCLex Emotion Analysis...")

    emotions = ['fear', 'anger', 'anticipation', 'trust', 'surprise',
                'sadness', 'joy', 'disgust', 'positive', 'negative']

    results = []
    for text in tqdm(df[text_col], desc="NRCLex"):
        try:
            freq = NRCLex(str(text)).affect_frequencies
        except Exception:
            # Best effort per document; Exception (not a bare except) so that
            # KeyboardInterrupt/SystemExit still stop the loop
            freq = {}

        row = {f'nrc_{name}': freq.get(name, 0) for name in emotions}

        dominant = 'neutral'
        if freq:
            top_name, top_score = max(freq.items(), key=lambda x: x[1])
            # An all-zero table means no emotion word was found; without this
            # check max() would report whichever key happens to come first
            if top_score > 0:
                dominant = top_name
        row['nrc_dominant_emotion'] = dominant

        results.append(row)

    return pd.DataFrame(results)

# ==========================================
# 2. AGGREGATION
# ==========================================

def aggregate_sentiments(df):
    """Add the ensemble polarity and the ensemble label to *df*.

    'polarity_ensemble' is the mean of 'vader_compound' and
    'textblob_polarity'.  'sentiment_ensemble' is the label both tools agree
    on; when they disagree the VADER label is used, so the result is the same
    on every run.  *df* is modified in place and also returned.
    """
    print("\nüìä Aggregazione sentiment scores...")

    # Ensemble polarity (mean of VADER and TextBlob)
    df['polarity_ensemble'] = (df['vader_compound'] + df['textblob_polarity']) / 2

    # Majority vote
    def majority_vote(row):
        labels = [row['vader_label'], row['textblob_label']]
        # Iterating the list (not a set) makes ties resolve to the first entry;
        # set order depends on string hashing and differs between interpreter runs
        return max(labels, key=labels.count)

    df['sentiment_ensemble'] = df.apply(majority_vote, axis=1)

    return df

# ==========================================
# 3. VISUALIZATIONS
# ==========================================

def plot_sentiment_distribution(df, title="Sentiment Distribution"):
    """Plot the sentiment score distributions on a 2x2 grid.

    Three histograms (VADER compound, TextBlob polarity, ensemble polarity),
    each with a dashed red line at 0, plus a bar chart of the ensemble label
    counts.  The figure is saved as a PNG named after *title* (lowercased,
    spaces and '/' replaced by '_') and then shown.
    """
    fig, axes = plt.subplots(2, 2, figsize=(16, 10))
    fig.suptitle(title, fontsize=16, fontweight='bold')

    # (axis, column, bar colour, panel title) for the three score histograms
    histograms = [
        (axes[0, 0], 'vader_compound', 'steelblue', 'VADER Compound Score'),
        (axes[0, 1], 'textblob_polarity', 'coral', 'TextBlob Polarity'),
        (axes[1, 0], 'polarity_ensemble', 'mediumseagreen', 'Ensemble Polarity'),
    ]
    for ax, column, shade, heading in histograms:
        ax.hist(df[column], bins=50, color=shade, alpha=0.7, edgecolor='black')
        ax.axvline(0, color='red', linestyle='--', linewidth=2)
        ax.set_title(heading)
        ax.set_xlabel('Score')
        ax.set_ylabel('Frequency')

    # Sentiment labels
    label_ax = axes[1, 1]
    sentiment_counts = df['sentiment_ensemble'].value_counts()
    palette = {'positive': 'green', 'neutral': 'gray', 'negative': 'red'}
    bar_colors = [palette.get(label, 'gray') for label in sentiment_counts.index]
    label_ax.bar(sentiment_counts.index, sentiment_counts.values,
                 color=bar_colors, alpha=0.7)
    label_ax.set_title('Sentiment Labels Distribution')
    label_ax.set_ylabel('Count')

    plt.tight_layout()
    stem = title.lower().replace(" ", "_").replace("/", "_")
    plt.savefig(f'{stem}.png', dpi=300, bbox_inches='tight')
    plt.show()

def plot_emotions_distribution(df, title="Emotions Distribution"):
    """Plot the emotion distributions side by side.

    Left: mean frequency of the eight NRC emotion columns, sorted in
    descending order.  Right: counts of the ten most frequent values of
    'nrc_dominant_emotion'.  The figure is saved as a PNG named after *title*
    (lowercased, spaces and '/' replaced by '_') and then shown.
    """
    fig, (mean_ax, dominant_ax) = plt.subplots(1, 2, figsize=(16, 6))
    fig.suptitle(title, fontsize=16, fontweight='bold')

    # Mean NRC emotion frequencies
    emotion_names = ('fear', 'anger', 'anticipation', 'trust',
                     'surprise', 'sadness', 'joy', 'disgust')
    nrc_cols = ['nrc_' + name for name in emotion_names]
    nrc_means = df[nrc_cols].mean().sort_values(ascending=False)

    mean_ax.barh(nrc_means.index, nrc_means.values, color='skyblue', alpha=0.8)
    mean_ax.set_title('NRCLex Emotions (Average Frequency)')
    mean_ax.set_xlabel('Frequency')

    # Dominant emotions
    dominant_counts = df['nrc_dominant_emotion'].value_counts().head(10)
    dominant_ax.bar(dominant_counts.index, dominant_counts.values,
                    color='mediumseagreen', alpha=0.8)
    dominant_ax.set_title('NRCLex Dominant Emotion')
    dominant_ax.set_ylabel('Count')
    dominant_ax.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    stem = title.lower().replace(" ", "_").replace("/", "_")
    plt.savefig(f'{stem}.png', dpi=300, bbox_inches='tight')
    plt.show()

def plot_temporal_sentiment(df, date_col='text_date', title="Temporal Sentiment"):
    """Plot the daily mean polarity scores over time and save the figure.

    Values in ``date_col`` are parsed with ``pd.to_datetime`` (unparseable
    entries become NaT and their rows are dropped), rows are grouped by
    calendar date, and the daily means of ``vader_compound``,
    ``textblob_polarity`` and ``polarity_ensemble`` are drawn as three lines.

    Args:
        df: DataFrame with the three score columns and ``date_col``.
        date_col: Name of the column holding the dates.
        title: Figure title; also the basis of the output file name
            (lower-cased, spaces and slashes replaced by underscores).

    Returns:
        None. Prints a warning and returns early when ``date_col`` is
        missing or no row has a parseable date.
    """
    if date_col not in df.columns:
        print(f"‚ö†Ô∏è Colonna {date_col} non trovata")
        return

    # Work on a copy so the caller's frame is not modified
    dated = df.copy()
    dated[date_col] = pd.to_datetime(dated[date_col], errors='coerce')
    dated = dated.dropna(subset=[date_col])

    if not len(dated):
        print("‚ö†Ô∏è Nessuna data valida trovata")
        return

    # Collapse timestamps to calendar days and average each score per day
    dated['date_only'] = dated[date_col].dt.date
    score_columns = ['vader_compound', 'textblob_polarity', 'polarity_ensemble']
    daily = dated.groupby('date_only')[score_columns].mean().reset_index()

    fig, ax = plt.subplots(figsize=(16, 6))
    fig.suptitle(title, fontsize=16, fontweight='bold')

    # One line per score column, each with its own legend label and marker
    line_specs = (
        ('vader_compound', {'label': 'VADER', 'marker': 'o', 'alpha': 0.7}),
        ('textblob_polarity', {'label': 'TextBlob', 'marker': 's', 'alpha': 0.7}),
        ('polarity_ensemble', {'label': 'Ensemble', 'marker': '^', 'linewidth': 2}),
    )
    for column, line_kwargs in line_specs:
        ax.plot(daily['date_only'], daily[column], **line_kwargs)

    # Zero line separates positive from negative polarity
    ax.axhline(0, color='red', linestyle='--', linewidth=1)
    ax.set_ylabel('Polarity Score')
    ax.set_xlabel('Date')
    ax.legend()
    ax.grid(alpha=0.3)

    plt.xticks(rotation=45)
    plt.tight_layout()
    file_stem = title.lower().replace(" ", "_").replace("/", "_")
    plt.savefig(f'{file_stem}.png', dpi=300, bbox_inches='tight')
    plt.show()

def print_statistics(df, category_name="Dataset"):
    """Print descriptive sentiment statistics for ``df`` to stdout.

    Sections, in order: share of each ``sentiment_ensemble`` label, mean and
    standard deviation of the three polarity columns, percentage of rows
    where ``vader_label`` equals ``textblob_label``, mean and median of
    ``textblob_subjectivity``, and the five most frequent
    ``nrc_dominant_emotion`` values with their share of all rows.

    Args:
        df: DataFrame containing the columns named above.
        category_name: Label shown in the report header.
    """
    rule = "=" * 60
    print("\n" + rule)
    print(f"üìä STATISTICS - {category_name}")
    print(rule)

    print("\nüéØ SENTIMENT DISTRIBUTION:")
    label_shares = df['sentiment_ensemble'].value_counts(normalize=True).mul(100)
    for label, pct in label_shares.items():
        print(f"   {label.capitalize()}: {pct:.2f}%")

    print("\nüìà POLARITY SCORES:")
    polarity_sources = (
        ("VADER", 'vader_compound'),
        ("TextBlob", 'textblob_polarity'),
        ("Ensemble", 'polarity_ensemble'),
    )
    for model_name, column in polarity_sources:
        scores = df[column]
        print(f"   {model_name}: {scores.mean():.3f} (std: {scores.std():.3f})")

    # Share of rows on which the two lexicon models assign the same label
    agreement = df['vader_label'].eq(df['textblob_label']).mean() * 100
    print(f"\nü§ù MODEL AGREEMENT: {agreement:.2f}%")

    subjectivity = df['textblob_subjectivity']
    print("\nüí≠ SUBJECTIVITY:")
    print(f"   Mean: {subjectivity.mean():.3f}")
    print(f"   Median: {subjectivity.median():.3f}")

    print("\nüòä TOP 5 EMOTIONS (NRCLex):")
    total_rows = len(df)
    for emotion, count in df['nrc_dominant_emotion'].value_counts().head(5).items():
        pct = (count / total_rows) * 100
        print(f"   {emotion.capitalize()}: {count} ({pct:.2f}%)")

    print("\n" + rule)

# ==========================================
# 4. PIPELINE COMPLETA
# ==========================================

def analyze_complete_sentiment(df, category_name="Dataset"):
    """Run the full lexicon-based sentiment pipeline on ``df``.

    Calls ``analyze_vader``, ``analyze_textblob`` and ``analyze_nrclex`` on
    ``df``, joins their outputs column-wise with the input rows, passes the
    combined frame through ``aggregate_sentiments``, then prints statistics
    and draws the sentiment and emotion plots. The temporal plot is drawn
    only when the result has a ``text_date`` column.

    Args:
        df: Input DataFrame; it is passed unchanged to the three analyzers.
        category_name: Label used in console output and plot titles.

    Returns:
        The frame returned by ``aggregate_sentiments``.
    """
    print(f"\n{'='*60}")
    print(f"üöÄ SENTIMENT ANALYSIS: {category_name}")
    print(f"{'='*60}")
    print(f"Dataset: {len(df)} post/commenti")
    print("Metodi: VADER + TextBlob + NRCLex")

    # Run the three analyzers on the input rows
    vader_results = analyze_vader(df)
    textblob_results = analyze_textblob(df)
    nrc_results = analyze_nrclex(df)

    # Merge column-wise. pd.concat(axis=1) aligns on index labels, so every
    # part is re-indexed positionally: if an analyzer hands back a frame that
    # still carries the caller's (filtered) index, rows would otherwise be
    # mismatched and padded with NaN. This is a no-op for parts that already
    # have a default 0..n-1 index.
    parts = [df, vader_results, textblob_results, nrc_results]
    results_df = pd.concat(
        [part.reset_index(drop=True) for part in parts],
        axis=1,
    )

    # Aggregation of the individual model outputs
    results_df = aggregate_sentiments(results_df)

    # Console report and plots
    print_statistics(results_df, category_name)
    plot_sentiment_distribution(results_df, f"{category_name} - Sentiment")
    plot_emotions_distribution(results_df, f"{category_name} - Emotions")

    if 'text_date' in results_df.columns:
        plot_temporal_sentiment(results_df, date_col='text_date', title=f"{category_name} - Temporal")

    print(f"\n‚úÖ COMPLETATO: {category_name}")

    return results_df

# ==========================================
# 5. CONFRONTI
# ==========================================

def compare_two_groups(df1, df2, name1="Group 1", name2="Group 2", save_prefix="comparison"):
    """Compare the sentiment results of two groups in a 2x2 figure.

    Panels: box plots of ``polarity_ensemble``; grouped bars with the
    percentage of positive/neutral/negative ``sentiment_ensemble`` labels;
    grouped horizontal bars with ``nrc_dominant_emotion`` counts for the
    union of each group's five most frequent emotions; and a table of
    summary statistics. The figure is saved as ``{save_prefix}.png`` and
    then shown.

    Args:
        df1: Results for the first group; must contain ``polarity_ensemble``,
            ``sentiment_ensemble``, ``nrc_dominant_emotion`` and
            ``textblob_subjectivity``.
        df2: Results for the second group, same columns as ``df1``.
        name1: Display name of the first group.
        name2: Display name of the second group.
        save_prefix: Output file name without the ``.png`` extension.
    """
    def _summary_column(frame):
        # Formatted values for the statistics table, in 'Metric' row order
        polarity = frame['polarity_ensemble']
        sentiment = frame['sentiment_ensemble']
        return [
            f"{polarity.mean():.3f}",
            f"{polarity.std():.3f}",
            f"{(sentiment == 'positive').mean()*100:.1f}%",
            f"{(sentiment == 'negative').mean()*100:.1f}%",
            f"{frame['textblob_subjectivity'].mean():.3f}",
        ]

    print("\n" + "="*60)
    print(f"‚öñÔ∏è  {name1.upper()} vs {name2.upper()} COMPARISON")
    print("="*60)

    fig, axes = plt.subplots(2, 2, figsize=(16, 10))
    fig.suptitle(f'{name1} vs {name2} - Comparison', fontsize=16, fontweight='bold')

    # Polarity. The two series are independent samples, so they are placed
    # side by side positionally; building the frame from the raw series would
    # align them on index labels and fail when an index has duplicates.
    data_polarity = pd.DataFrame({
        name1: df1['polarity_ensemble'].reset_index(drop=True),
        name2: df2['polarity_ensemble'].reset_index(drop=True),
    })
    data_polarity.boxplot(ax=axes[0, 0])
    axes[0, 0].axhline(0, color='red', linestyle='--')
    axes[0, 0].set_title('Polarity Comparison')
    axes[0, 0].set_ylabel('Ensemble Score')

    # Sentiment labels (percentage of rows per label)
    sent1 = df1['sentiment_ensemble'].value_counts(normalize=True) * 100
    sent2 = df2['sentiment_ensemble'].value_counts(normalize=True) * 100

    labels = ['positive', 'neutral', 'negative']
    x = np.arange(len(labels))
    width = 0.35

    axes[0, 1].bar(x - width/2, [sent1.get(label, 0) for label in labels], width, label=name1, alpha=0.8)
    axes[0, 1].bar(x + width/2, [sent2.get(label, 0) for label in labels], width, label=name2, alpha=0.8)
    axes[0, 1].set_xticks(x)
    axes[0, 1].set_xticklabels(labels)
    axes[0, 1].set_ylabel('Percentage (%)')
    axes[0, 1].set_title('Sentiment Distribution')
    axes[0, 1].legend()

    # Emotions. Keep the FULL counts for the lookups: an emotion that is in
    # one group's top 5 but ranks lower in the other must show its real count
    # for that other group, not 0.
    emo1 = df1['nrc_dominant_emotion'].value_counts()
    emo2 = df2['nrc_dominant_emotion'].value_counts()

    # Union of both top-5 lists in first-seen order (deterministic, unlike a set)
    all_emotions = list(dict.fromkeys([*emo1.index[:5], *emo2.index[:5]]))
    x_emo = np.arange(len(all_emotions))

    axes[1, 0].barh(x_emo - width/2, [emo1.get(e, 0) for e in all_emotions], width, label=name1)
    axes[1, 0].barh(x_emo + width/2, [emo2.get(e, 0) for e in all_emotions], width, label=name2)
    axes[1, 0].set_yticks(x_emo)
    axes[1, 0].set_yticklabels(all_emotions)
    axes[1, 0].set_xlabel('Count')
    axes[1, 0].set_title('Top Emotions')
    axes[1, 0].legend()

    # Stats table
    stats_data = {
        'Metric': ['Polarity Mean', 'Polarity Std', '% Positive', '% Negative', 'Subjectivity'],
        name1: _summary_column(df1),
        name2: _summary_column(df2),
    }

    stats_df = pd.DataFrame(stats_data)
    axes[1, 1].axis('off')
    table = axes[1, 1].table(cellText=stats_df.values, colLabels=stats_df.columns,
                            cellLoc='center', loc='center')
    table.auto_set_font_size(False)
    table.set_fontsize(9)
    table.scale(1, 2)
    axes[1, 1].set_title('Statistics')

    plt.tight_layout()
    plt.savefig(f'{save_prefix}.png', dpi=300, bbox_inches='tight')
    plt.show()

    print(f"\n‚úÖ Confronto {name1} vs {name2} completato!")

# ==========================================
# 6. MULTI-PLATFORM EXECUTION
# ==========================================

print("\nüöÄ SENTIMENT ANALYSIS - MULTI-PLATFORM (Twitter + Reddit)")
print("="*60)

# Work on a copy of the preprocessed dataset
df = df_processed.copy()

# Data cleaning: force both text columns to strings and drop rows whose
# lemmatized text is empty or whitespace-only
print(f"\nüìä Dataset totale: {len(df)} righe")
df['text_lemmatized'] = df['text_lemmatized'].fillna("").astype(str)
df['text_nostop'] = df['text_nostop'].fillna("").astype(str)
# .copy() detaches the filtered frame so the column assignment below does not
# raise SettingWithCopyWarning
df = df[df['text_lemmatized'].str.strip() != ""].copy()
print(f"üìä Dopo pulizia: {len(df)} righe")

# Check for the 'site' column; without it every row is treated as Twitter
if 'site' not in df.columns:
    print("‚ö†Ô∏è ATTENZIONE: Colonna 'site' non trovata! Assumo tutti i dati siano da Twitter.")
    df['site'] = 'twitter'

# Platform statistics
print(f"\nüì± DISTRIBUZIONE PER PIATTAFORMA:")
print(df['site'].value_counts())

# Split by PLATFORM (masks are computed once and reused for the combined splits)
_site_lower = df['site'].str.lower()
_is_twitter = _site_lower.isin(['nitter', 'twitter'])
_is_reddit = _site_lower == 'reddit'

df_twitter = df[_is_twitter].copy()
df_reddit = df[_is_reddit].copy()

print(f"\nüìä Twitter/Nitter posts: {len(df_twitter)}")
print(f"üìä Reddit posts: {len(df_reddit)}")

# Split by TOPIC
_is_bitcoin = df['argument'] == 'Bitcoin'
_is_nvidia = df['argument'] == 'Nvidia'

df_bitcoin = df[_is_bitcoin].copy()
df_nvidia = df[_is_nvidia].copy()

print(f"\nüìä Bitcoin posts (totali): {len(df_bitcoin)}")
print(f"üìä Nvidia posts (totali): {len(df_nvidia)}")

# Split by TOPIC + PLATFORM
df_bitcoin_twitter = df[_is_bitcoin & _is_twitter].copy()
df_bitcoin_reddit = df[_is_bitcoin & _is_reddit].copy()
df_nvidia_twitter = df[_is_nvidia & _is_twitter].copy()
df_nvidia_reddit = df[_is_nvidia & _is_reddit].copy()

print(f"\nüìä Bitcoin Twitter: {len(df_bitcoin_twitter)}")
print(f"üìä Bitcoin Reddit: {len(df_bitcoin_reddit)}")
print(f"üìä Nvidia Twitter: {len(df_nvidia_twitter)}")
print(f"üìä Nvidia Reddit: {len(df_nvidia_reddit)}")

# ==========================================
# FULL ANALYSES
# ==========================================

print("\n" + "="*60, "INIZIO ANALISI", "="*60, sep="\n")

# 1. Analysis by TOPIC (all platforms together)
print("\n" + "="*60, "1Ô∏è‚É£ ANALISI PER ARGOMENTO (Twitter + Reddit)", "="*60, sep="\n")

results_bitcoin_all = analyze_complete_sentiment(df_bitcoin, category_name="Bitcoin (All Platforms)")
results_nvidia_all = analyze_complete_sentiment(df_nvidia, category_name="Nvidia (All Platforms)")

# 2. Analysis by PLATFORM (all topics together); empty subsets are skipped
print("\n" + "="*60, "2Ô∏è‚É£ ANALISI PER PIATTAFORMA (Bitcoin + Nvidia)", "="*60, sep="\n")

if len(df_twitter) == 0:
    print("‚ö†Ô∏è Nessun dato Twitter trovato")
else:
    results_twitter_all = analyze_complete_sentiment(df_twitter, category_name="Twitter (All Topics)")

if len(df_reddit) == 0:
    print("‚ö†Ô∏è Nessun dato Reddit trovato")
else:
    results_reddit_all = analyze_complete_sentiment(df_reddit, category_name="Reddit (All Topics)")

# 3. DETAILED analyses (topic x platform); empty subsets are skipped
print("\n" + "="*60, "3Ô∏è‚É£ ANALISI DETTAGLIATE (Argomento x Piattaforma)", "="*60, sep="\n")

if len(df_bitcoin_twitter) == 0:
    print("‚ö†Ô∏è Nessun dato Bitcoin/Twitter")
else:
    results_bitcoin_twitter = analyze_complete_sentiment(df_bitcoin_twitter, category_name="Bitcoin/Twitter")

if len(df_bitcoin_reddit) == 0:
    print("‚ö†Ô∏è Nessun dato Bitcoin/Reddit")
else:
    results_bitcoin_reddit = analyze_complete_sentiment(df_bitcoin_reddit, category_name="Bitcoin/Reddit")

if len(df_nvidia_twitter) == 0:
    print("‚ö†Ô∏è Nessun dato Nvidia/Twitter")
else:
    results_nvidia_twitter = analyze_complete_sentiment(df_nvidia_twitter, category_name="Nvidia/Twitter")

if len(df_nvidia_reddit) == 0:
    print("‚ö†Ô∏è Nessun dato Nvidia/Reddit")
else:
    results_nvidia_reddit = analyze_complete_sentiment(df_nvidia_reddit, category_name="Nvidia/Reddit")

# ==========================================
# COMPARISONS
# ==========================================

print("\n" + "="*60, "4Ô∏è‚É£ CONFRONTI", "="*60, sep="\n")

# Bitcoin vs Nvidia (all data)
compare_two_groups(
    df1=results_bitcoin_all,
    df2=results_nvidia_all,
    name1="Bitcoin",
    name2="Nvidia",
    save_prefix="comparison_bitcoin_vs_nvidia",
)

# Twitter vs Reddit (all data); needs rows on both sides
if len(df_twitter) and len(df_reddit):
    compare_two_groups(
        df1=results_twitter_all,
        df2=results_reddit_all,
        name1="Twitter",
        name2="Reddit",
        save_prefix="comparison_twitter_vs_reddit",
    )

# Bitcoin: Twitter vs Reddit
if len(df_bitcoin_twitter) and len(df_bitcoin_reddit):
    compare_two_groups(
        df1=results_bitcoin_twitter,
        df2=results_bitcoin_reddit,
        name1="Bitcoin/Twitter",
        name2="Bitcoin/Reddit",
        save_prefix="comparison_bitcoin_twitter_vs_reddit",
    )

# Nvidia: Twitter vs Reddit
if len(df_nvidia_twitter) and len(df_nvidia_reddit):
    compare_two_groups(
        df1=results_nvidia_twitter,
        df2=results_nvidia_reddit,
        name1="Nvidia/Twitter",
        name2="Nvidia/Reddit",
        save_prefix="comparison_nvidia_twitter_vs_reddit",
    )

# ==========================================
# SAVING RESULTS
# ==========================================

print("\nüíæ Salvataggio risultati in formato CSV (pi√π leggero)...")

# CSV files that were actually written; the final summary lists only these,
# so a failed write is never reported as saved.
saved_csv_files = []


def _save_results_csv(results, filename, label):
    """Write ``results`` to ``filename`` as CSV (no index), best effort.

    Prints a confirmation on success or the error on failure (``label``
    identifies the dataset in the error message), and records successfully
    written files in ``saved_csv_files``.
    """
    try:
        results.to_csv(filename, index=False)
        print(f"   ‚úÖ Salvato: {filename}")
    except Exception as e:
        # Deliberately broad: one failed write must not stop the other saves
        print(f"   ‚ùå Errore salvando {label}: {e}")
    else:
        saved_csv_files.append(filename)


def _list_if_saved(filename):
    """Print ``filename`` in the summary only if it was written successfully."""
    if filename in saved_csv_files:
        print(f"   ‚úÖ {filename}")


# Save every result set that was computed
_save_results_csv(results_bitcoin_all, "results_bitcoin_all.csv", "Bitcoin")
_save_results_csv(results_nvidia_all, "results_nvidia_all.csv", "Nvidia")

if len(df_twitter) > 0:
    _save_results_csv(results_twitter_all, "results_twitter_all.csv", "Twitter")

if len(df_reddit) > 0:
    _save_results_csv(results_reddit_all, "results_reddit_all.csv", "Reddit")

if len(df_bitcoin_twitter) > 0:
    _save_results_csv(results_bitcoin_twitter, "results_bitcoin_twitter.csv", "Bitcoin/Twitter")

if len(df_bitcoin_reddit) > 0:
    _save_results_csv(results_bitcoin_reddit, "results_bitcoin_reddit.csv", "Bitcoin/Reddit")

if len(df_nvidia_twitter) > 0:
    _save_results_csv(results_nvidia_twitter, "results_nvidia_twitter.csv", "Nvidia/Twitter")

if len(df_nvidia_reddit) > 0:
    _save_results_csv(results_nvidia_reddit, "results_nvidia_reddit.csv", "Nvidia/Reddit")

print("\nüéâ ANALISI COMPLETA TERMINATA!")
print("="*60)
print("\nüìÅ FILE SALVATI (CSV):")
print("\nüìä RISULTATI PRINCIPALI:")
_list_if_saved("results_bitcoin_all.csv")
_list_if_saved("results_nvidia_all.csv")

print("\nüåê RISULTATI PER PIATTAFORMA:")
_list_if_saved("results_twitter_all.csv")
_list_if_saved("results_reddit_all.csv")

print("\nüîç RISULTATI DETTAGLIATI:")
_list_if_saved("results_bitcoin_twitter.csv")
_list_if_saved("results_bitcoin_reddit.csv")
_list_if_saved("results_nvidia_twitter.csv")
_list_if_saved("results_nvidia_reddit.csv")

# The comparison charts exist only for pairs where both sides had data
print("\nüìà GRAFICI PNG:")
print("   ‚úÖ Tutti i grafici sentiment/emotions/temporal per ogni categoria")
print("   ‚úÖ comparison_bitcoin_vs_nvidia.png")
if len(df_twitter) > 0 and len(df_reddit) > 0:
    print("   ‚úÖ comparison_twitter_vs_reddit.png")
if len(df_bitcoin_twitter) > 0 and len(df_bitcoin_reddit) > 0:
    print("   ‚úÖ comparison_bitcoin_twitter_vs_reddit.png")
if len(df_nvidia_twitter) > 0 and len(df_nvidia_reddit) > 0:
    print("   ‚úÖ comparison_nvidia_twitter_vs_reddit.png")

print("\nüí° INFO:")
print("   üìÅ I file CSV occupano ~70% meno spazio degli Excel")
print("   üìñ Apribili con Excel, Google Sheets, o pandas")
print("   üîÑ Per convertire in Excel: pd.read_csv('file.csv').to_excel('file.xlsx')")

print("\n" + "="*60)

In [None]:
# Load diagnostics: intended to run RIGHT AFTER df_processed has been loaded
# into df (in the first script)
print("\nüîç DIAGNOSI CARICAMENTO:")
print(f"Totale righe caricate: {len(df)}")
print("\nDistribuzione ORIGINALE:")
print(df.groupby(['argument', 'site']).size())

# AFTER the fillna and the empty-row filter.
# df may itself be a filtered slice from an earlier cell; copy it first so the
# column assignments below do not raise SettingWithCopyWarning.
df = df.copy()
# Same cleaning as the main pipeline: astype(str) guarantees that .str.strip()
# works even when a column holds non-string values.
df['text_lemmatized'] = df['text_lemmatized'].fillna("").astype(str)
df['text_nostop'] = df['text_nostop'].fillna("").astype(str)
df = df[df['text_lemmatized'].str.strip() != ""].copy()

print("\nDistribuzione DOPO filtro righe vuote:")
print(df.groupby(['argument', 'site']).size())

# AFTER splitting by topic
df_bitcoin = df[df['argument'] == 'Bitcoin'].copy()
df_nvidia = df[df['argument'] == 'Nvidia'].copy()

print(f"\nüìä Bitcoin: {len(df_bitcoin)}")
print(df_bitcoin['site'].value_counts())
print(f"\nüìä Nvidia: {len(df_nvidia)}")
print(df_nvidia['site'].value_counts())