### Basic scraping

In [16]:
import requests
from newspaper import Article
from typing import Tuple, Any

def download_article(url: str, timeout: float = 10.0) -> str:
    """Fetch the raw HTML of a web page.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single unresponsive host would block the whole
            scraping loop indefinitely.

    Returns:
        The response body as text, or an empty string when the request fails
        (connection error, timeout, or an HTTP error status). The error is
        printed rather than raised so that a batch of URLs keeps going.
    """
    # A browser-like User-Agent is sent with the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        # Turn 4xx/5xx answers into exceptions so they are reported below.
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error downloading article: {e}")
        return ""

    return response.text

def extract_title_and_text(html: str) -> Tuple[str, str]:
    """Parse raw HTML with newspaper's ``Article`` and return ``(title, text)``.

    Args:
        html: HTML source of an article page.

    Returns:
        A tuple holding the title and the text extracted by the parser.
    """
    # The Article is created with an empty URL; the HTML is handed over
    # directly through set_html() before parsing.
    parsed = Article('')
    parsed.set_html(html)
    parsed.parse()

    headline, body = parsed.title, parsed.text
    return headline, body

def main(url: str) -> Tuple[str, str]:
    """Download one article and extract its title and text.

    Args:
        url: Address of the article page.

    Returns:
        ``(title, text)`` for the page, or ``("", "")`` when the download
        produced no HTML.
    """
    page_source = download_article(url)

    # Nothing was downloaded: report an empty article instead of parsing.
    if not page_source:
        return "", ""

    return extract_title_and_text(page_source)

# Article pages to scrape. The scraped titles/texts are collected in this
# order, and later cells compare them by position with the rows of the
# reference CSV.
# NOTE(review): presumably this order must match the CSV row order — confirm.
urls = [
    "https://www.nytimes.com/2024/09/29/us/north-carolina-helene-relief-damage.html",
    "https://www.faz.net/aktuell/wirtschaft/kuenstliche-intelligenz/today-s-ai-can-t-be-trusted-19532136.html",
    "http://www.chinatoday.com.cn/ctenglish/2018/commentaries/202409/t20240925_800378506.html",
    "https://english.elpais.com/economy-and-business/2024-09-28/from-the-hermes-heir-to-nicolas-cage-millionaires-who-went-bankrupt.html",
    "https://insatiable.info/2023/06/30/quels-futur-pour-les-reseaux-sociaux/",
    "https://actu.fr/auvergne-rhone-alpes/lyon_69123/lyon-le-projet-de-reamenagement-des-quais-les-plus-mortels-pour-les-cyclistes-devoile_61667371.html",
    "https://www.lesnumeriques.com/intelligence-artificielle/huggingchat-l-ia-generative-au-coeur-de-votre-mac-avec-cette-nouvelle-application-n226761.html",
    "https://www.lebigdata.fr/nebius-mise-sur-paris-pour-son-nouveau-centre-de-donnees-ia-1-milliard-en-jeu",
    "https://actu.orange.fr/societe/fait-divers/allo-mon-fils-c-est-maman-grace-a-l-ia-des-escrocs-clonent-la-voix-de-sa-mere-pour-lui-extorquer-de-l-argent-magic-CNT000002fcZQN.html",
    "https://www.cbnews.fr/digital/image-ia-creation-du-collectif-enthousiastes-87456",
    "https://www.tradingsat.com/brent-crude-dr-sp-FR0011227370/actualites/brent-crude-dr-sp-et-si-l-intelligence-artificielle-reduisait-a-terme-les-prix-du-petrole-1122826.html",
    "https://www.journaldunet.com/intelligence-artificielle/1533915-comment-exploiter-notebooklm-l-ovni-ia-experimental-de-google/",
    "https://www.lemonde.fr/international/article/2024/10/01/israel-mene-des-operations-terrestres-dans-le-sud-du-liban_6340509_3210.html",
    "https://www.lemonde.fr/politique/article/2024/10/01/greve-du-1-octobre-les-syndicats-veulent-donner-le-tempo-mais-s-attendent-a-une-faible-mobilisation_6340346_823448.html",
    "https://www.lemonde.fr/societe/article/2024/10/01/derives-des-creches-privees-aurore-berge-confirme-avoir-depose-plainte-pour-diffamation-apres-la-publication-du-livre-enquete-les-ogres_6340515_3224.html",
    "https://www.lemonde.fr/pixels/article/2024/10/01/j-ai-plaide-coupable-d-avoir-fait-du-journalisme-affirme-julian-assange-au-conseil-de-l-europe_6340588_4408996.html",
    "https://www.lefigaro.fr/sports/rugby/rugby-16-membres-demissionnent-du-comite-directeur-de-la-ffr-20241001",
    "https://www.lefigaro.fr/international/dans-ses-memoires-boris-johnson-critique-particulierement-emmanuel-macron-20241001",
    "https://www.lefigaro.fr/flash-actu/normandie-mort-d-un-jeune-de-17-ans-poignarde-pres-d-une-boite-de-nuit-20240929",
    "https://www.francetvinfo.fr/monde/usa/ouragan-helene-aux-etats-unis-un-presentateur-meteo-sauve-une-femme-en-direct_6812057.html",
    "https://www.francetvinfo.fr/sports/foot/ligue-des-champions/arsenal-psg-premier-gros-choc-en-ligue-des-champions-et-premieres-crispations_6810745.html",
    "https://www.liberation.fr/culture/arts/un-tableau-deniche-dans-une-cave-par-un-brocanteur-italien-en-1962-serait-un-original-de-picasso-20241001_YCBPEC5MCNCNTLQMBKGV5EMXZM/",
    "https://www.nytimes.com/2024/10/01/science/shipwreck-ghost-ship-pacific-drones.html"
]

# Scrape every URL in list order. A failed download contributes ("", "") so
# the two result lists stay position-aligned with `urls`.
all_titles, all_texts = [], []

for url in urls:
    title, text = main(url)
    all_titles += [title]
    all_texts += [text]


Error downloading article: 403 Client Error: Forbidden for url: https://www.nytimes.com/2024/09/29/us/north-carolina-helene-relief-damage.html
Error downloading article: 403 Client Error: Forbidden for url: https://www.nytimes.com/2024/10/01/science/shipwreck-ghost-ship-pacific-drones.html


### Similarity metrics

In [17]:
import numpy as np
import editdistance
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

def jaccard_similarity(text1, text2):
    """Jaccard similarity between the whitespace-separated token sets of two texts.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        ``|A ∩ B| / |A ∪ B|`` where A and B are the sets of tokens obtained
        with ``str.split()``. When neither text contains a token the texts
        are treated as identical and 1.0 is returned (instead of dividing
        by zero).
    """
    tokens1 = set(text1.split())
    tokens2 = set(text2.split())

    union = tokens1 | tokens2
    if not union:
        # Both texts are empty or whitespace-only: nothing distinguishes them.
        return 1.0

    return len(tokens1 & tokens2) / len(union)


def cosine_sim(text1, text2):
    """Cosine similarity between the bag-of-words count vectors of two texts.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        The cosine of the angle between the two token-count vectors built by
        ``CountVectorizer`` with its default settings. When no token can be
        extracted from either text (for instance both are empty) the vectors
        cannot be built; in that case 1.0 is returned if the texts are equal
        and 0.0 otherwise.
    """
    try:
        # fit_transform returns the document-term count matrix for the pair.
        counts = CountVectorizer().fit_transform([text1, text2])
    except ValueError:
        # Raised by CountVectorizer when the vocabulary is empty, i.e. neither
        # text produced a usable token.
        return 1.0 if text1 == text2 else 0.0

    # cosine_similarity accepts the sparse matrix directly; entry [0][1] is
    # the similarity between the first and the second text.
    return cosine_similarity(counts)[0][1]

def edit_score(text1, text2):
    """Normalised edit similarity between two strings.

    Args:
        text1: First string.
        text2: Second string.

    Returns:
        ``1 - distance / max(len(text1), len(text2))`` where ``distance`` is
        the value returned by ``editdistance.eval``. Two empty strings are
        identical, so 1.0 is returned for them (instead of dividing by zero).
    """
    longest = max(len(text1), len(text2))
    if longest == 0:
        return 1.0

    return 1 - (editdistance.eval(text1, text2) / longest)

In [18]:
import editdistance
from ngram import NGram 

# Number of decimals used when formatting scores in the printed results.
precision = 2
# Minimum combined similarity for a row to be marked 'X' in the results table.
threshold = 0.95

def compute_metrics_for_articles(table, article_number, journal, article_type, manual, scraped, n_title, n_text):
    """Score one scraped field against its reference and record a table row.

    Args:
        table: List that receives one new row (appended in place).
        article_number: Identifier shown in the first column of the row.
        journal: Journal name shown in the row.
        article_type: ``'Title'`` selects ``n_title`` as n-gram size; any
            other value selects ``n_text``.
        manual: Reference string.
        scraped: Scraped string compared with ``manual``.
        n_title: N-gram size used when ``article_type`` is ``'Title'``.
        n_text: N-gram size used otherwise.

    Returns:
        The mean of the n-gram similarity and the edit similarity.

    Reads the module-level ``threshold`` (for the mark column) and
    ``precision`` (for formatting the edit-distance column).
    """
    # Pick the n-gram size that belongs to this kind of field.
    if article_type == 'Title':
        ngram_similarity = NGram.compare(manual, scraped, N=n_title)
    else:
        ngram_similarity = NGram.compare(manual, scraped, N=n_text)

    raw_distance = editdistance.eval(manual, scraped)
    distance_score = edit_score(manual, scraped)

    # Evaluated but currently not part of the combined score nor of the row.
    cos_sim = cosine_sim(manual, scraped)

    combined = (ngram_similarity + distance_score) / 2

    if combined >= threshold:
        mark = 'X'
    else:
        mark = '-'

    row = [
        article_number,
        journal,
        article_type,
        ngram_similarity,
        f'{raw_distance} ({distance_score:.{precision}f})',
        combined,
        mark,
    ]
    table.append(row)

    return combined


### Print params

In [19]:
from urllib.parse import urlparse


def get_journal_name(url):
    """Derive a short journal name from the host part of a URL.

    The label immediately left of the domain suffix is returned, e.g.
    ``lemonde`` for ``https://www.lemonde.fr/...`` and ``chinatoday`` for
    ``http://www.chinatoday.com.cn/...``.

    Args:
        url: Article URL.

    Returns:
        The selected host label in lower case, or an empty string when the
        URL has no host part.
    """
    # Suffixes made of two labels; for these the name sits one label further
    # left. Single-label suffixes need no entry: a join of two labels always
    # contains a dot and could never match them.
    two_label_suffixes = {'com.cn', 'co.uk', 'org.cn', 'net.cn'}

    # `hostname` (unlike `netloc`) drops any port / credentials and is
    # lower-cased, so "WWW.Site.CO.UK:8080" is handled like "www.site.co.uk".
    host = urlparse(url).hostname or ''
    domain_parts = host.split('.')

    if len(domain_parts) > 2 and '.'.join(domain_parts[-2:]) in two_label_suffixes:
        return domain_parts[-3]

    if len(domain_parts) > 2:
        return domain_parts[-2]

    return domain_parts[0]

In [20]:
from tabulate import tabulate

def print_similarity_results(table):
    """Print the similarity rows as a centred grid table.

    Args:
        table: List of rows, each holding one value per header below (the
            layout appended by ``compute_metrics_for_articles``).

    Reads the module-level ``threshold`` (shown in the last header) and
    ``precision`` (number of decimals for float cells).
    """
    headers = [
        'Article #',
        'Journal Name',
        'Type',
        'N-gram Similarity',
        'Edit Distance (Score)',
        #'Cosine Similarity',
        'Combined Syntaxic Similarity (CSS)',
        # The mark is set with ">=", so the label says so too.
        f'CSS >= {threshold}',
    ]

    print(
        tabulate(
            table,
            headers=headers,
            tablefmt='grid',
            floatfmt=f'.{precision}f',
            # One alignment per header, so adding/removing a column above
            # cannot get out of sync with this argument.
            colalign=('center',) * len(headers),
        )
    )

### Main

In [21]:
import numpy as np
import pandas as pd

# Reference table; the cells below read its 'url', 'title' and 'content' columns.
# NOTE(review): presumably the manually collected articles (its values are
# passed as `manual` to compute_metrics_for_articles) — confirm.
df = pd.read_csv('text-mining-articles-scraping - Feuille 1.csv')

In [22]:
from statistics import mean

# Reference values, one entry per row of `df`.
journals = [get_journal_name(link) for link in df['url'].tolist()]
titles = df['title'].tolist()
contents = df['content'].tolist()

# --- Step 1: sweep the n-gram size (1..9) separately for titles and texts ---
title_scores_by_n = dict((n, []) for n in range(1, 10))
text_scores_by_n = dict((n, []) for n in range(1, 10))

for i in range(len(df)):

    # Rows without a scraped counterpart are ignored.
    if i >= len(all_titles) or i >= len(all_texts):
        continue

    journal, title, content = journals[i], titles[i], contents[i]

    for n in range(1, 10):
        # A throw-away list is passed as table: sweep rows are not kept.
        # For a 'Title' call only n_title is used, for 'Content' only n_text.
        title_score = compute_metrics_for_articles(
            [], i+1, journal, 'Title', title, all_titles[i], n_title=n, n_text=10
        )
        title_scores_by_n[n].append(title_score)

        text_score = compute_metrics_for_articles(
            [], i+1, journal, 'Content', content, all_texts[i], n_title=1, n_text=n
        )
        text_scores_by_n[n].append(text_score)

# Keep the size with the highest mean combined score (first one on ties).
av_title_scores = {n: mean(scores) for n, scores in title_scores_by_n.items()}
n_title_opt = max(av_title_scores, key=lambda size: av_title_scores[size])
print(f'Best n for titles (n-grams): {n_title_opt}')

av_text_scores = {n: mean(scores) for n, scores in text_scores_by_n.items()}
n_text_opt = max(av_text_scores, key=lambda size: av_text_scores[size])
print(f'Best n for texts (n-grams): {n_text_opt}')

# --- Step 2: final scoring with the selected sizes, filling the table ---
table = []
final_title_scores = []
final_text_scores = []

for i in range(len(df)):

    if i >= len(all_titles) or i >= len(all_texts):
        continue

    journal, title, content = journals[i], titles[i], contents[i]

    title_css = compute_metrics_for_articles(
        table, i+1, journal, 'Title', title, all_titles[i], n_title=n_title_opt, n_text=n_text_opt
    )
    final_title_scores.append(title_css)

    text_css = compute_metrics_for_articles(
        table, i+1, journal, 'Content', content, all_texts[i], n_title=n_title_opt, n_text=n_text_opt
    )
    final_text_scores.append(text_css)

print(f"Average CSS over all titles: {mean(final_title_scores):.{precision}f}")
print(f"Average CSS over all texts: {mean(final_text_scores):.{precision}f}\n")
print_similarity_results(table)


Best n for titles (n-grams): 1
Best n for texts (n-grams): 1
Average CSS over all titles: 0.86
Average CSS over all texts: 0.77

+-------------+----------------+---------+---------------------+-------------------------+--------------------------------------+--------------+
|  Article #  |  Journal Name  |  Type   |  N-gram Similarity  |  Edit Distance (Score)  |  Combined Syntaxic Similarity (CSS)  |  CSS > 0.95  |
|      1      |    nytimes     |  Title  |        0.00         |        71 (0.00)        |                 0.00                 |      -       |
+-------------+----------------+---------+---------------------+-------------------------+--------------------------------------+--------------+
|      1      |    nytimes     | Content |        0.00         |       7580 (0.00)       |                 0.00                 |      -       |
+-------------+----------------+---------+---------------------+-------------------------+--------------------------------------+--------------+
|

### Better approach for the 6 articles

In [8]:
# Load scraped titles/contents from 'results.csv' and rebind the names used by
# the evaluation code.
# NOTE(review): presumably produced by an alternative scraper outside this
# notebook — confirm where the file comes from.
new_df = pd.read_csv('results.csv')
# Plain lists (not Series), matching how all_titles / all_texts are built by
# the scraping loop: indexing with [i] is then purely positional.
all_titles = new_df['title'].tolist()
all_texts = new_df['content'].tolist()

In [9]:
from statistics import mean

# Same evaluation procedure as the earlier evaluation cell, run against the
# current all_titles / all_texts.

# Reference values, one entry per row of `df`.
journals = [get_journal_name(link) for link in df['url'].tolist()]
titles = df['title'].tolist()
contents = df['content'].tolist()

# --- Step 1: sweep the n-gram size (1..9) separately for titles and texts ---
title_scores_by_n = dict((n, []) for n in range(1, 10))
text_scores_by_n = dict((n, []) for n in range(1, 10))

for i in range(len(df)):

    # Rows without a scraped counterpart are ignored.
    if i >= len(all_titles) or i >= len(all_texts):
        continue

    journal, title, content = journals[i], titles[i], contents[i]

    for n in range(1, 10):
        # A throw-away list is passed as table: sweep rows are not kept.
        # For a 'Title' call only n_title is used, for 'Content' only n_text.
        title_score = compute_metrics_for_articles(
            [], i+1, journal, 'Title', title, all_titles[i], n_title=n, n_text=10
        )
        title_scores_by_n[n].append(title_score)

        text_score = compute_metrics_for_articles(
            [], i+1, journal, 'Content', content, all_texts[i], n_title=1, n_text=n
        )
        text_scores_by_n[n].append(text_score)

# Keep the size with the highest mean combined score (first one on ties).
av_title_scores = {n: mean(scores) for n, scores in title_scores_by_n.items()}
n_title_opt = max(av_title_scores, key=lambda size: av_title_scores[size])
print(f'Best n for titles (n-grams): {n_title_opt}')

av_text_scores = {n: mean(scores) for n, scores in text_scores_by_n.items()}
n_text_opt = max(av_text_scores, key=lambda size: av_text_scores[size])
print(f'Best n for texts (n-grams): {n_text_opt}')

# --- Step 2: final scoring with the selected sizes, filling the table ---
table = []
final_title_scores = []
final_text_scores = []

for i in range(len(df)):

    if i >= len(all_titles) or i >= len(all_texts):
        continue

    journal, title, content = journals[i], titles[i], contents[i]

    title_css = compute_metrics_for_articles(
        table, i+1, journal, 'Title', title, all_titles[i], n_title=n_title_opt, n_text=n_text_opt
    )
    final_title_scores.append(title_css)

    text_css = compute_metrics_for_articles(
        table, i+1, journal, 'Content', content, all_texts[i], n_title=n_title_opt, n_text=n_text_opt
    )
    final_text_scores.append(text_css)

print(f"Average CSS over all titles: {mean(final_title_scores):.{precision}f}")
print(f"Average CSS over all texts: {mean(final_text_scores):.{precision}f}\n")
print_similarity_results(table)


Best n for titles (n-grams): 1
Best n for texts (n-grams): 1
Average CSS over all titles: 0.88
Average CSS over all texts: 0.78

+-------------+----------------+---------+---------------------+-------------------------+--------------------------------------+--------------+
|  Article #  |  Journal Name  |  Type   |  N-gram Similarity  |  Edit Distance (Score)  |  Combined Syntaxic Similarity (CSS)  |  CSS > 0.95  |
|      1      |    nytimes     |  Title  |        1.00         |        0 (1.00)         |                 1.00                 |      X       |
+-------------+----------------+---------+---------------------+-------------------------+--------------------------------------+--------------+
|      1      |    nytimes     | Content |        0.89         |       884 (0.88)        |                 0.88                 |      -       |
+-------------+----------------+---------+---------------------+-------------------------+--------------------------------------+--------------+
|