# Analyse d'un hashtag X.com

Ce notebook exploite les sorties `gallery-dl --dump-json` ou `--write-metadata` pour explorer un hashtag.

## Pré-requis

```bash
python3 -m venv .venv
source .venv/bin/activate
pip install -r requirements-analysis.txt
```

`requirements-analysis.txt` peut inclure : `pandas numpy matplotlib seaborn scikit-learn networkx scipy tqdm ipywidgets plotly umap-learn`.

Télécharge les données :
```bash
gallery-dl --cookies-from-browser firefox/x.com --dump-json "https://x.com/search?q=%23PlusJamaisPs&src=typed_query&f=live" > plusjamaisps.jsonl
```


In [10]:
%config InlineBackend.figure_format = 'retina'
%pip install numpy pandas seaborn matplotlib tqdm
import json
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter
from tqdm.auto import tqdm

sns.set_theme(style='whitegrid')

DATA_PATH = Path('..')
JSONL_FILE = DATA_PATH / 'plusjamaisps.jsonl'

Note: you may need to restart the kernel to use updated packages.


In [11]:
def load_jsonl(path: Path) -> pd.DataFrame:
    records = []
    with path.open('r', encoding='utf-8') as handle:
        for line in handle:
            line = line.strip()
            if not line or line.startswith('['):
                continue
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                continue
            if isinstance(data, dict) and data.get('category') == 'twitter':
                records.append(data)
    return pd.DataFrame(records)

df = load_jsonl(JSONL_FILE)
print(df.shape)
df.head()

(0, 0)


In [12]:
def prepare_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()

    def coalesce_series(options, fallback=None):
        for col in options:
            if col in df.columns:
                return df[col]
        return fallback() if callable(fallback) else fallback

    def empty_series(fill_value=None):
        if len(df) == 0:
            return pd.Series(dtype=object)
        if callable(fill_value):
            data = [fill_value() for _ in range(len(df))]
        else:
            data = [fill_value] * len(df)
        return pd.Series(data, index=df.index)

    df['tweet_id'] = coalesce_series(['tweet_id', 'tweetid', 'id'], fallback=lambda: empty_series())

    author_series = coalesce_series(['author', 'user', 'user_data'], fallback=lambda: empty_series())
    df['author'] = author_series.apply(lambda x: x.get('name') if isinstance(x, dict) else x)

    df['content'] = coalesce_series(['content', 'text'], fallback=lambda: empty_series(''))

    date_series = coalesce_series(['date', 'created_at'], fallback=lambda: empty_series())
    df['created_at'] = pd.to_datetime(date_series, utc=True, errors='coerce')

    hashtags_series = coalesce_series(['hashtags'], fallback=lambda: empty_series(lambda: []))
    df['hashtags'] = hashtags_series.apply(lambda x: x if isinstance(x, list) else [])

    mentions_series = coalesce_series(['mentions'], fallback=lambda: empty_series(lambda: []))
    df['mentions'] = mentions_series.apply(lambda x: x if isinstance(x, list) else [])

    urls_series = coalesce_series(['urls'], fallback=lambda: empty_series(lambda: []))
    df['url_count'] = urls_series.apply(lambda x: len(x) if isinstance(x, list) else 0)

    media_counts = coalesce_series(['count', 'media_count', 'attachments'], fallback=lambda: empty_series(0))
    df['has_media'] = media_counts.fillna(0).astype(int) > 0

    return df

df = prepare_dataframe(df)
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 0 entries
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype              
---  ------      --------------  -----              
 0   tweet_id    0 non-null      object             
 1   author      0 non-null      object             
 2   content     0 non-null      object             
 3   created_at  0 non-null      datetime64[ns, UTC]
 4   hashtags    0 non-null      object             
 5   mentions    0 non-null      object             
 6   url_count   0 non-null      object             
 7   has_media   0 non-null      bool               
dtypes: bool(1), datetime64[ns, UTC](1), object(6)
memory usage: 132.0+ bytes


In [13]:
def plot_time_series(df: pd.DataFrame, freq: str = '15min'):
    ts = df.set_index('created_at').resample(freq)['tweet_id'].count()
    fig, ax = plt.subplots(figsize=(12, 4))
    ts.plot(ax=ax)
    ax.set_title(f'Nombre de tweets par {freq}')
    ax.set_ylabel('Tweets')
    ax.set_xlabel('Temps')
    ax.xaxis.set_major_formatter(DateFormatter('%Y-%m-%d %H:%M'))
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

if not df.empty:
    plot_time_series(df)

In [14]:
def burstiness(inter_times: np.ndarray) -> float:
    mu = np.mean(inter_times)
    sigma = np.std(inter_times)
    return (sigma - mu) / (sigma + mu) if mu + sigma else 0.0

timestamps = df['created_at'].sort_values().astype('int64') // 10**9
inter_times = np.diff(timestamps)
if len(inter_times):
    print('Indice de burstiness global:', burstiness(inter_times))

In [15]:
top_accounts = (df.groupby('author')['tweet_id'].count().sort_values(ascending=False).head(20))
if not top_accounts.empty:
    plt.figure(figsize=(8, 6))
    sns.barplot(y=top_accounts.index, x=top_accounts.values, palette='viridis')
    plt.title('Top 20 comptes les plus actifs')
    plt.xlabel('Tweets')
    plt.ylabel('Compte')
    plt.tight_layout()
    plt.show()

In [16]:
%pip install scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
content = df['content'].fillna('')
if len(content):
    tfidf = TfidfVectorizer(min_df=2, ngram_range=(1, 2))
    X = tfidf.fit_transform(content)
    similarity_matrix = cosine_similarity(X)
    df['similarity_hits'] = (similarity_matrix > 0.9).sum(axis=1)
    display(df.sort_values('similarity_hits', ascending=False).head(10)[['tweet_id','author','similarity_hits','content']])

Note: you may need to restart the kernel to use updated packages.


In [17]:
from collections import Counter
hashtags = [tag.lower() for tags in df['hashtags'] for tag in (tags or [])]
if hashtags:
    display(pd.Series(Counter(hashtags)).sort_values(ascending=False).head(20))

In [18]:
%pip install networkx
import networkx as nx
G = nx.DiGraph()
for _, row in df.iterrows():
    author = row['author']
    for mention in row.get('mentions') or []:
        target = mention.get('name') if isinstance(mention, dict) else mention
        if author and target:
            G.add_edge(author, target)
print(nx.number_of_nodes(G), 'nodes', nx.number_of_edges(G), 'edges')

Note: you may need to restart the kernel to use updated packages.
0 nodes 0 edges


## Checklist inspirée de la littérature
- synchronisation temporelle
- homogénéité du contenu
- densité du graphe retweet/mention
- âge des comptes
- cadence automatisée (burstiness)
- amplification (retweets rapides)
- hashtags récurrents
- source d'API suspecte