### How to use

#### Step 1: Run the Metrics Functions cells

In [1]:
import numpy as np
import editdistance
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

def jaccard_similarity(text1, text2):
    """Return the Jaccard similarity of the whitespace-separated tokens of two texts.

    The score is ``|A ∩ B| / |A ∪ B|`` where A and B are the sets of tokens
    obtained with ``str.split()``.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        A float in [0, 1]. Two texts without any token (empty or
        whitespace-only) are treated as identical and score 1.0.
    """
    tokens1 = set(text1.split())
    tokens2 = set(text2.split())

    union = tokens1 | tokens2

    # Both texts are empty/whitespace-only: avoid dividing by zero.
    if not union:
        return 1.0

    return len(tokens1 & tokens2) / len(union)


def cosine_sim(text1, text2):
    """Return the cosine similarity between the token-count vectors of two texts.

    Both texts are vectorized together with a fresh ``CountVectorizer`` so
    that they share one vocabulary, then compared with scikit-learn's
    ``cosine_similarity``.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        The similarity between ``text1`` and ``text2``.
    """
    term_counts = CountVectorizer().fit_transform([text1, text2]).toarray()

    # Pairwise matrix over the two documents; entry [0][1] compares text1 with text2.
    pairwise = cosine_similarity(term_counts)

    return pairwise[0][1]

def edit_score(text1, text2):
    """Return the edit distance between two strings normalized to a similarity.

    The score is ``1 - distance / max(len(text1), len(text2))``.

    Args:
        text1: First string.
        text2: Second string.

    Returns:
        1.0 when the strings are equal (including two empty strings),
        decreasing as the edit distance grows relative to the longer string.
    """
    longest = max(len(text1), len(text2))

    # Two empty strings are identical; returning early avoids a division by zero.
    if longest == 0:
        return 1.0

    return 1 - (editdistance.eval(text1, text2) / longest)

In [2]:
import editdistance
from ngram import NGram 

# Number of decimal places used when formatting similarity scores for display.
precision = 2
# Minimum combined similarity (inclusive, compared with >=) for a pair to be marked 'X'.
threshold = .95

def compute_metrics_for_articles(table, article_number, journal, article_type, manual, scraped, n_title=1, n_text=1):
    """Score one reference/scraped text pair and append a result row to ``table``.

    The Combined Syntaxic Similarity (CSS) is the mean of the n-gram
    similarity and the normalized edit-distance score. Cosine similarity is
    not part of the CSS and is therefore not computed here.

    Args:
        table: List that receives one row per call (mutated in place).
        article_number: Identifier shown in the first column.
        journal: Journal name shown in the second column.
        article_type: ``'Title'`` selects ``n_title`` as n-gram size; any
            other value selects ``n_text``.
        manual: Reference string.
        scraped: Scraped string compared against ``manual``.
        n_title: N-gram size used for titles.
        n_text: N-gram size used for every other article type.

    Returns:
        The CSS of the pair.
    """
    ngram_size = n_title if article_type == 'Title' else n_text
    ngram_similarity = NGram.compare(manual, scraped, N=ngram_size)

    # The edit distance is the expensive part on long texts, so it is computed
    # once and the normalized score is derived from it (same formula as
    # edit_score: 1 - distance / length of the longer string).
    edit_dist = editdistance.eval(manual, scraped)
    longest = max(len(manual), len(scraped))
    # Two empty strings are identical; this also avoids a division by zero.
    edit_sim = (1 - (edit_dist / longest)) if longest else 1.0

    avg_similarity = (ngram_similarity + edit_sim) / 2

    mark = 'X' if avg_similarity >= threshold else '-'

    table.append([
        article_number,
        journal,
        article_type,
        ngram_similarity,
        f'{edit_dist} ({edit_sim:.{precision}f})',
        avg_similarity,
        mark
    ])

    return avg_similarity


#### Step 2: Run the Print Params cells

In [3]:
from urllib.parse import urlparse


def get_journal_name(url):
    """Extract the journal (registrable domain label) from an article URL.

    Examples: ``https://www.nytimes.com/...`` -> ``'nytimes'``,
    ``https://www.bbc.co.uk/...`` -> ``'bbc'``.

    Args:
        url: Article URL. A missing scheme (``www.site.com/path``) is tolerated.

    Returns:
        The label immediately left of the public suffix, lower-cased, or
        ``''`` when no host can be found in ``url``.
    """
    # Public suffixes made of two labels. Single-label suffixes (fr, de, ...)
    # need no entry: they are handled by the generic "second to last" rule.
    two_label_suffixes = {'com.cn', 'co.uk', 'org.cn', 'net.cn'}

    parsed_url = urlparse(url)

    # Without a scheme urlparse files the host under .path; a leading '//'
    # makes it parse as the network location instead.
    if not parsed_url.scheme and not parsed_url.netloc:
        parsed_url = urlparse('//' + url)

    # .hostname (unlike .netloc) drops credentials and port and is lower-cased,
    # so 'WWW.BBC.CO.UK:443' still matches the suffix table.
    domain_parts = (parsed_url.hostname or '').split('.')

    if len(domain_parts) > 2:
        if '.'.join(domain_parts[-2:]) in two_label_suffixes:
            return domain_parts[-3]
        return domain_parts[-2]

    return domain_parts[0]

In [4]:
from tabulate import tabulate

def print_similarity_results(table):
    """Print the similarity rows as a centred grid table.

    Args:
        table: Rows with one value per header below, in the same order
            (as appended by ``compute_metrics_for_articles``).
    """
    headers = [
        'Article #',
        'Journal Name',
        'Type',
        'N-gram Similarity',
        'Edit Distance (Score)',
        'Combined Syntaxic Similarity (CSS)',
        # The mark is computed with an inclusive comparison, so label it as such.
        f'CSS >= {threshold}',
    ]

    print(
        tabulate(
            table,
            headers=headers,
            tablefmt='grid',
            floatfmt=f'.{precision}f',
            # One alignment per header so adding/removing a column cannot desync them.
            colalign=('center',) * len(headers)
        )
    )

#### Step 3: Run on Scraped Data and compare to Test Data

In [5]:
import numpy as np
import pandas as pd

# Reference (test) set; later cells read its 'url', 'title' and 'all' columns.
test_set_df = pd.read_csv('text-mining-articles-scraping - Feuille 1.csv')

# Scraper output; 'title' and 'content' are compared against the reference set.
scraped_df = pd.read_csv('results.csv')

# Missing cells are read as float NaN, which the string metrics cannot handle:
# turn them into empty strings, then flatten newlines to spaces.
all_titles = scraped_df['title'].fillna("").replace('\n', ' ', regex=True)

all_texts = scraped_df['content'].fillna("").replace('\n', ' ', regex=True)

In [6]:
from statistics import mean

journals = test_set_df['url'].apply(get_journal_name).tolist()

# Missing reference cells become '' so the string metrics never receive a float NaN.
titles = test_set_df['title'].fillna('').replace('\n', ' ', regex=True).tolist()
contents = test_set_df['all'].fillna('').replace('\n', ' ', regex=True).tolist()

table = []
final_title_scores = []
final_text_scores = []

# Reference and scraped rows are paired by position. zip() stops at the
# shortest input, so reference articles without a scraped counterpart are
# skipped; .tolist() keeps the pairing positional whatever the index labels are.
paired_rows = zip(journals, titles, contents, all_titles.tolist(), all_texts.tolist())

for i, (journal, title, content, scraped_title, scraped_text) in enumerate(paired_rows):

    final_title_scores.append(compute_metrics_for_articles(table, i+1, journal, 'Title', title, scraped_title))
    final_text_scores.append(compute_metrics_for_articles(table, i+1, journal, 'Content', content, scraped_text))

if final_title_scores:
    print(f"Average CSS over all titles: {mean(final_title_scores):.{precision}f}")
    print(f"Average CSS over all texts: {mean(final_text_scores):.{precision}f}\n")
    print_similarity_results(table)
else:
    # statistics.mean raises StatisticsError on an empty sequence.
    print("No article pairs to compare: the test set or the scraped results are empty.")


Average CSS over all titles: 0.92
Average CSS over all texts: 0.86

+-------------+----------------+---------+---------------------+-------------------------+--------------------------------------+--------------+
|  Article #  |  Journal Name  |  Type   |  N-gram Similarity  |  Edit Distance (Score)  |  Combined Syntaxic Similarity (CSS)  |  CSS > 0.95  |
|      1      |    nytimes     |  Title  |        1.00         |        0 (1.00)         |                 1.00                 |      X       |
+-------------+----------------+---------+---------------------+-------------------------+--------------------------------------+--------------+
|      1      |    nytimes     | Content |        0.89         |       864 (0.89)        |                 0.89                 |      -       |
+-------------+----------------+---------+---------------------+-------------------------+--------------------------------------+--------------+
|      2      |      faz       |  Title  |        1.00        