# Analysis of Improved Generated Articles

Analysis of the dataset generated with the improved pipeline (controlled omissions, diverse model names).

In [None]:
import json
import os
import re
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict, Counter

# Root directory scanned for per-model sub-directories containing "articles.json".
OUTPUT_DIR = "output_improved"

# XML-style tag names searched for in the article text.
EXPECTED_TAGS = [
    "model",
    "params",
    "gpu_count",
    "hardware",
    "training",
    "country",
    "year",
]

# Keys read from each article's "information" dict; a value of
# "Not specified" (or a missing key) is counted as an omission.
INFO_FIELDS = [
    "model_name",
    "parameter_count",
    "gpu_count",
    "hardware",
    "training_duration",
    "country",
    "year",
]

## 1. Load Data

In [None]:
def load_all_articles(output_dir=None):
    """Load every ``<output_dir>/<name>/articles.json`` into a dict.

    Args:
        output_dir: Directory whose entries are scanned. Defaults to the
            module-level ``OUTPUT_DIR`` when omitted, so existing
            zero-argument calls keep working.

    Returns:
        dict mapping each entry name (in sorted order) to the parsed JSON
        content of its ``articles.json``. Entries without such a file
        (including plain files in ``output_dir``) are skipped.

    Raises:
        FileNotFoundError: if ``output_dir`` does not exist (from
            ``os.listdir``).
        json.JSONDecodeError: if an ``articles.json`` is not valid JSON.
    """
    if output_dir is None:
        output_dir = OUTPUT_DIR

    all_data = {}
    for model_name in sorted(os.listdir(output_dir)):
        json_path = os.path.join(output_dir, model_name, "articles.json")
        # isfile (rather than exists) also skips a directory that happens to
        # be named "articles.json", which open() could not read.
        if not os.path.isfile(json_path):
            continue
        with open(json_path, "r", encoding="utf-8") as f:
            all_data[model_name] = json.load(f)
    return all_data

# Load everything once; the remaining cells read from `data`.
data = load_all_articles()

# Print the article count of each model while accumulating the grand total.
total = 0
for model, articles in data.items():
    total += len(articles)
    print(f"{model}: {len(articles)} articles")

print(f"\nTotal: {total} articles")

## 2. Success Rate

In [None]:
# Reference article count per model used as the denominator of the rate
# (presumably the number requested from the generator — confirm).
NUM_EXPECTED = 100

# One row per model: articles present versus the reference count.
success_data = [
    {
        "Model": model_name,
        "Generated": len(articles),
        "Expected": NUM_EXPECTED,
        "Success Rate": f"{(len(articles)/NUM_EXPECTED)*100:.0f}%",
    }
    for model_name, articles in data.items()
]

# Final expression of the cell so the notebook renders the table.
pd.DataFrame(success_data)

## 3. Omission Analysis

Distribution of omitted fields (value "Not specified") per model.

In [None]:
# Per-model omission statistics: for every information field, how many
# articles leave it unspecified, plus the mean number of omitted fields per
# article.
omission_data = []

for model_name, articles in data.items():
    field_omitted = Counter()
    omission_counts = []

    for article in articles:
        info = article.get("information", {})
        # A field is omitted when its key is missing or its value is the
        # literal "Not specified".
        omitted = [f for f in INFO_FIELDS if info.get(f, "Not specified") == "Not specified"]
        field_omitted.update(omitted)
        omission_counts.append(len(omitted))

    total = len(articles)
    row = {"Model": model_name}
    for field in INFO_FIELDS:
        # A model with no articles would otherwise divide by zero.
        pct = field_omitted[field] / total * 100 if total else 0.0
        row[field] = f"{field_omitted[field]}/{total} ({pct:.0f}%)"
    row["Avg omissions/article"] = (
        f"{sum(omission_counts)/total:.1f}" if total else "n/a"
    )
    omission_data.append(row)

# Final expression of the cell so the notebook renders the table.
pd.DataFrame(omission_data)

In [None]:
# One panel per model: number of articles (y) by number of omitted fields (x).
# subplots() needs at least one column, so an empty `data` yields a single
# empty panel instead of raising.
n_panels = max(len(data), 1)
# squeeze=False always returns a 2-D array of axes, so the single-model case
# needs no special handling; axes[0] is the only row.
fig, axes = plt.subplots(1, n_panels, figsize=(5 * n_panels, 4), sharey=True, squeeze=False)

for ax, (model_name, articles) in zip(axes[0], data.items()):
    omission_counts = []
    for article in articles:
        info = article.get("information", {})
        n = sum(1 for f in INFO_FIELDS if info.get(f, "Not specified") == "Not specified")
        omission_counts.append(n)

    counts = Counter(omission_counts)
    x = sorted(counts)
    y = [counts[k] for k in x]
    ax.bar(x, y)
    ax.set_title(model_name)
    ax.set_xlabel("Nb champs omis")
    ax.set_ylabel("Nb articles")
    # An article can omit anywhere from 0 to all tracked fields; derive the
    # tick range from INFO_FIELDS so it stays correct if fields are added.
    ax.set_xticks(range(len(INFO_FIELDS) + 1))

plt.suptitle("Distribution du nombre d'omissions par article", fontsize=14)
plt.tight_layout()
plt.show()

## 4. XML Tags Compliance

In [None]:
def check_tags(text, tags=None):
    """Report which XML-style tags occur as a complete pair in ``text``.

    Args:
        text: String to search.
        tags: Iterable of tag names to look for. Defaults to the
            module-level ``EXPECTED_TAGS`` when omitted, so existing
            one-argument calls keep working.

    Returns:
        dict mapping each tag name to ``True`` when ``<tag>...</tag>`` is
        found (the content may be empty and may span several lines),
        otherwise ``False``.
    """
    if tags is None:
        tags = EXPECTED_TAGS

    found = {}
    for tag in tags:
        # Escape the name so it is always matched literally, even if it
        # contains regex metacharacters.
        name = re.escape(tag)
        found[tag] = re.search(f"<{name}>.*?</{name}>", text, re.DOTALL) is not None
    return found

# Per-model tag compliance: for each expected tag, the number of articles
# whose text contains a complete <tag>...</tag> pair.
tags_data = []

for model_name, articles in data.items():
    tag_counts = Counter()

    for article in articles:
        tags = check_tags(article.get("article", ""))
        # Count each tag at most once per article.
        tag_counts.update(tag for tag in EXPECTED_TAGS if tags[tag])

    total = len(articles)
    row = {"Model": model_name}
    for tag in EXPECTED_TAGS:
        # Counter returns 0 for tags never seen.
        row[f"<{tag}>"] = f"{tag_counts[tag]}/{total}"
    tags_data.append(row)

# Final expression of the cell so the notebook renders the table.
pd.DataFrame(tags_data)

## 5. Article Length

In [None]:
# Per-model length statistics: word counts (whitespace-separated tokens) and
# paragraph counts (non-blank chunks separated by a blank line).
length_data = []

for model_name, articles in data.items():
    texts = [a.get("article", "") for a in articles]
    words = [len(t.split()) for t in texts]
    paras = [len([p for p in t.split('\n\n') if p.strip()]) for t in texts]
    n_articles = len(texts)

    length_data.append({
        "Model": model_name,
        # default=0 keeps min()/max() from raising on a model with no articles.
        "Words (min)": min(words, default=0),
        "Words (max)": max(words, default=0),
        # Averages are undefined without articles; report "n/a" instead of
        # dividing by zero.
        "Words (avg)": f"{sum(words)/n_articles:.0f}" if n_articles else "n/a",
        "Paragraphs (avg)": f"{sum(paras)/n_articles:.1f}" if n_articles else "n/a",
    })

# Final expression of the cell so the notebook renders the table.
pd.DataFrame(length_data)

In [None]:
fig, ax = plt.subplots(figsize=(10, 5))

# Overlay one semi-transparent word-count histogram per model on shared axes.
for model_name, articles in data.items():
    word_counts = []
    for entry in articles:
        word_counts.append(len(entry.get("article", "").split()))
    ax.hist(word_counts, bins=20, alpha=0.5, label=model_name)

ax.set_title("Distribution de la longueur des articles")
ax.set_ylabel("Nombre d'articles")
ax.set_xlabel("Nombre de mots")
ax.legend()
plt.tight_layout()
plt.show()

## 6. Diversity Analysis

### 6.1 Unique Model Names

In [None]:
def extract_tag_value(text, tag):
    """Return the stripped content of the first ``<tag>...</tag>`` in ``text``.

    The content may span several lines. Returns ``None`` when no complete
    pair is present.
    """
    found = re.search(f"<{tag}>(.*?)</{tag}>", text, re.DOTALL)
    if found is None:
        return None
    inner = found.group(1)
    return inner.strip()

# For each generator, report how many distinct model names appear in the
# article metadata and which names occur more than once.
for model_name, articles in data.items():
    declared = (a.get("information", {}).get("model_name", "Not specified") for a in articles)
    names = [name for name in declared if name != "Not specified"]

    unique = set(names)
    print(f"\n{model_name}: {len(unique)} unique / {len(names)} specified")

    name_counts = Counter(names)
    duplicates = {n: c for n, c in name_counts.items() if c > 1}
    if duplicates:
        print(f"  Duplicates: {duplicates}")

### 6.2 Hardware Diversity

In [None]:
# For each generator, report the number of distinct hardware values and the
# five most frequent ones.
for model_name, articles in data.items():
    declared = (a.get("information", {}).get("hardware", "Not specified") for a in articles)
    hw_list = [hw for hw in declared if hw != "Not specified"]

    print(f"\n{model_name}: {len(set(hw_list))} unique / {len(hw_list)} specified")
    print(f"  Top 5: {Counter(hw_list).most_common(5)}")

### 6.3 Country Diversity

In [None]:
# For each generator, report the number of distinct countries and the five
# most frequent ones.
for model_name, articles in data.items():
    countries = []
    for entry in articles:
        country = entry.get("information", {}).get("country", "Not specified")
        if country == "Not specified":
            continue  # omitted field: not part of the diversity count
        countries.append(country)

    print(f"\n{model_name}: {len(set(countries))} unique / {len(countries)} specified")
    print(f"  Top 5: {Counter(countries).most_common(5)}")

### 6.4 Year Distribution

In [None]:
# One bar chart per model showing how often each year value occurs.
fig, axes = plt.subplots(1, len(data), figsize=(5*len(data), 4), sharey=True)
# With a single column subplots() returns a bare Axes; wrap it so zip() works.
axes = [axes] if len(data) == 1 else axes

for ax, (model_name, articles) in zip(axes, data.items()):
    raw_years = (a.get("information", {}).get("year", "Not specified") for a in articles)
    # Years are stringified, so a numeric 2024 and the string "2024" share a bar.
    year_counts = Counter(str(yr) for yr in raw_years if yr != "Not specified")

    labels = sorted(year_counts)
    heights = [year_counts[label] for label in labels]
    ax.bar(labels, heights)
    ax.set_title(model_name)
    ax.set_xlabel("Year")
    ax.set_ylabel("Count")

plt.suptitle("Distribution des années", fontsize=14)
plt.tight_layout()
plt.show()

### 6.5 Opening Bigrams & Trigrams per Model

*The code for this section is in the cell that follows the Section 7 summary table.*

## 7. Summary Table

In [None]:
# Per-model summary: article count, number of distinct values for selected
# metadata fields, and the mean number of omitted fields per article.
summary_data = []

for model_name, articles in data.items():
    total = len(articles)
    # Distinct specified values seen for each field of interest.
    seen = {"model_name": set(), "hardware": set(), "country": set(), "year": set()}
    omission_counts = []

    for article in articles:
        info = article.get("information", {})

        for field, values in seen.items():
            value = info.get(field, "Not specified")
            if value != "Not specified":
                # Years are stringified, so a numeric 2024 and the string
                # "2024" count as one value.
                values.add(str(value) if field == "year" else value)

        n = sum(1 for f in INFO_FIELDS if info.get(f, "Not specified") == "Not specified")
        omission_counts.append(n)

    summary_data.append({
        "Model": model_name,
        "Articles": total,
        "Unique Models": len(seen["model_name"]),
        "Unique Hardware": len(seen["hardware"]),
        "Unique Countries": len(seen["country"]),
        "Unique Years": len(seen["year"]),
        # The average is undefined without articles; report "n/a" instead of
        # dividing by zero.
        "Avg Omissions": f"{sum(omission_counts)/total:.1f}" if total else "n/a",
    })

# Final expression of the cell so the notebook renders the table.
pd.DataFrame(summary_data)

In [None]:
def clean_start(text):
    """Return the whitespace-separated words of ``text`` with markup removed.

    Every ``<...>`` span is deleted (its inner content is kept), then the
    remaining text is split on whitespace. The full word list is returned;
    callers slice off the leading words they need.
    """
    without_markup = re.sub(r'<[^>]+>', '', text)
    return without_markup.strip().split()

# For each generator, list the most common two- and three-word article
# openings (after markup removal) to reveal repetitive starts.
for model_name, articles in data.items():
    openings = [clean_start(a.get("article", "")) for a in articles]
    # Articles too short for an n-gram are left out of that n-gram's list.
    bigrams = [" ".join(tokens[:2]) for tokens in openings if len(tokens) >= 2]
    trigrams = [" ".join(tokens[:3]) for tokens in openings if len(tokens) >= 3]

    print(f"\n{'='*50}")
    print(f"{model_name}")
    print(f"{'='*50}")

    bigram_counts = Counter(bigrams)
    print(f"\n  Top 10 Bigrams ({len(set(bigrams))} unique / {len(bigrams)} total):")
    for bg, count in bigram_counts.most_common(10):
        print(f"    {count:3d}x  \"{bg}\"")

    trigram_counts = Counter(trigrams)
    print(f"\n  Top 10 Trigrams ({len(set(trigrams))} unique / {len(trigrams)} total):")
    for tg, count in trigram_counts.most_common(10):
        print(f"    {count:3d}x  \"{tg}\"")