In [1]:
import pandas as pd
import numpy as np
import yake as yk

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.svm import SVC

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import string

from joblib import dump, load

## Recommending articles using Doc2Vec

### Approach:
1. **Train a Doc2Vec Model:**
   - Utilize Doc2Vec to transform entire articles into vector representations.
   - Train the model to capture contextual information and semantic meaning of each document.

2. **Compute Similarity:**
   - Classify the article provided by the user.
   - Measure similarity between the provided article and the other articles based on the cosine similarity of their Doc2Vec vectors.
   - Cosine similarity provides a metric for how closely the semantic meanings align.

3. **Recommend articles:**
   - Provide a ranking of the articles based on the cosine similarity value.
   - Recommend the top n articles.    
     

4. **Benefits of Doc2Vec:**
   - Doc2Vec models excel in capturing contextual nuances and semantic relationships within entire documents.

This approach enables the recommendation of article based on their semantic similarity.  
  
We use the BBC_News_Test.csv file to simulate non classified articles provided by the user.  
The BBC_News_Train.csv file is used as a database of classified articles to be recommended.


## Train a Doc2Vec Model

Doc2Vec is a Model that represents each Document as a Vector. It usually outperforms Word2Vec model.  
  
Corresponding research paper: https://cs.stanford.edu/~quocle/paragraph_vector.pdf

### Articles vectorization

First import preprocessed data (punctuation and stop words removed + lemmatization)

In [2]:
df = pd.read_csv("./data/BBC_News_Train_PREPROCESSED.csv")
documents = df['Text']
df.head(10)

Unnamed: 0.1,Unnamed: 0,ArticleId,Text,Category
0,0,1833,worldcom exboss launch defence lawyer defendin...,business
1,1,154,german business confidence slide german busine...,business
2,2,1101,bbc poll indicates economic gloom citizen majo...,business
3,3,1976,lifestyle governs mobile choice faster better ...,tech
4,4,917,enron boss 168m payout eighteen former enron d...,business
5,5,1582,howard truanted play snooker conservative lead...,politics
6,6,651,wale silent grand slam talk rhys williams say ...,sport
7,7,1797,french honour director parker british film dir...,entertainment
8,8,2034,car giant hit mercedes slump slump profitabili...,business
9,9,1866,fockers fuel festive film chart comedy meet fo...,entertainment


Train the Doc2Vec model (cf documentation: https://radimrehurek.com/gensim/models/doc2vec.html)

In [3]:
# Be careful: the words property should be a list of strings, not a string.
tagged_data = [TaggedDocument(words=doc.split(), tags=[str(i)]) for i, doc in enumerate(documents)]

# Training the Doc2Vec model
vector_size = 100
model = Doc2Vec(vector_size=vector_size, window=5, min_count=1, workers=4, epochs=10)
model.build_vocab(tagged_data)
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

# Retrieving vectors for existing documents
existing_document_index = 0
# the dv object contains the paragraph vectors learned from the training data
existing_document_vector = model.dv[existing_document_index]
existing_document_vector

array([ 0.00178032,  0.420868  ,  0.07124538,  0.26665634,  0.19016576,
       -0.14554183,  0.08105411,  0.37632608, -0.46756446, -0.07438911,
       -0.20769759, -0.35889044, -0.06579841, -0.10286285, -0.24111718,
       -0.08798007,  0.09795902, -0.00475875, -0.01577073, -0.15740243,
        0.26975903,  0.08292557,  0.50679517, -0.20901638,  0.1132718 ,
        0.03333049, -0.08538502, -0.24275617, -0.37508044, -0.15356559,
        0.31994173,  0.07724895, -0.15764071, -0.18187338,  0.05838601,
        0.11984175,  0.30638444, -0.16219519,  0.1627064 , -0.29744607,
        0.28053048, -0.25672776, -0.7180381 , -0.5070249 ,  0.25081497,
        0.00343167,  0.01667547,  0.12490556, -0.01538069,  0.28683633,
        0.18046391, -0.0666643 , -0.14327464,  0.02839914, -0.22129428,
        0.39195028,  0.18440206, -0.19804817, -0.27500996,  0.24864206,
       -0.13658412,  0.37566778, -0.3310513 ,  0.16831039,  0.16979222,
        0.49784404,  0.3315094 ,  0.30372477, -0.18056433,  0.40

To vectorize a new document, we should execute : 

`new_document = ['new', 'document', 'to', 'vectorize']`<br>
`vector = model.infer_vector(new_document)`

Here's an example.

In [4]:
new_doc = ["let's", "test", "the", "vectorizer"]
model.infer_vector(new_doc)

array([-0.00172785,  0.01598249,  0.00959428,  0.01714118,  0.00288896,
       -0.01390108,  0.004217  ,  0.03962549, -0.00752923, -0.0077175 ,
       -0.02584913, -0.03436141, -0.01722602,  0.00341914, -0.0166019 ,
       -0.04048571,  0.01847427, -0.03862223,  0.00808784, -0.02803555,
        0.0043587 ,  0.00503165,  0.01180524,  0.01358912, -0.0010808 ,
        0.00258788, -0.02572967, -0.01129488, -0.02149267, -0.00245236,
        0.01740257,  0.01276386, -0.00895005,  0.01643727,  0.00888155,
        0.00661044,  0.00327763, -0.02524641, -0.00877393, -0.00093612,
        0.0011985 , -0.02688934, -0.02322255, -0.02488077,  0.00339175,
       -0.02134925,  0.00015468, -0.00838014,  0.0167106 ,  0.0265915 ,
       -0.01575251, -0.01153053,  0.01526872,  0.01831424, -0.02752504,
        0.01693225,  0.00805872,  0.00790549, -0.02000716,  0.01726758,
        0.00537474,  0.02457773, -0.01652905,  0.01028937, -0.01093186,
        0.02507559,  0.00235086,  0.03375262, -0.00684314,  0.00

Now, we want to vectorize each document of the dataset.

In [11]:
# Get vectors
df["vectors"] = df.apply(lambda row: model.infer_vector(row["Text"].split()), axis=1)
df.head()

Unnamed: 0.1,Unnamed: 0,ArticleId,Text,Category,vectors
0,0,1833,worldcom exboss launch defence lawyer defendin...,business,"[0.061915487, 0.74867785, 0.22579156, 0.434003..."
1,1,154,german business confidence slide german busine...,business,"[-0.14426371, 0.1524045, -0.07887332, 0.255450..."
2,2,1101,bbc poll indicates economic gloom citizen majo...,business,"[0.06966864, -0.25320932, 0.37633792, 0.388539..."
3,3,1976,lifestyle governs mobile choice faster better ...,tech,"[-0.32473436, -0.38306633, 1.3080124, 0.926283..."
4,4,917,enron boss 168m payout eighteen former enron d...,business,"[-0.21187346, 0.5938589, 0.0014157373, 0.48634..."


### Add articles without preprocessing

To be fed to the keywords extractor that will handle this task.

In [12]:
#df.set_index(["Unnamed: 0"],inplace=True)
# Get the initial DataFrame with the articles before processing
articles = pd.read_csv("./data/BBC_News_Train.csv")

# add articles to the vectorized df
df["Article"] = articles["Text"]

In [57]:
df["Article"].loc[0]



### Keywords extraction

We extract the key words (using yake keywords extractor: https://github.com/LIAAD/yake) as an evaluation material for the recommendation.

In [13]:
kw_extractor = yk.KeywordExtractor(top=10, n=2)

In [14]:
def extract_keywords_only(article):
    keywords = []
    keywords_and_weight = kw_extractor.extract_keywords(article)

    for couple in keywords_and_weight :
        keywords.append(couple[0])
    
    return keywords

In [15]:
df["Keywords"] = df["Article"].apply(lambda article: extract_keywords_only(article))

In [16]:
df.head(3)

Unnamed: 0.1,Unnamed: 0,ArticleId,Text,Category,vectors,Article,Keywords
0,0,1833,worldcom exboss launch defence lawyer defendin...,business,"[0.061915487, 0.74867785, 0.22579156, 0.434003...",worldcom ex-boss launches defence lawyers defe...,"[ex-boss launches, launches defence, worldcom,..."
1,1,154,german business confidence slide german busine...,business,"[-0.14426371, 0.1524045, -0.07887332, 0.255450...",german business confidence slides german busin...,"[knocking hopes, speedy recovery, business con..."
2,2,1101,bbc poll indicates economic gloom citizen majo...,business,"[0.06966864, -0.25320932, 0.37633792, 0.388539...",bbc poll indicates economic gloom citizens in ...,"[world economy, world, countries, economy, glo..."


### Export the dataset

For easier use of the preprocessed data later...

In [17]:
dump(df, "./assets/BBC_News_Preprocessed.joblib")
dump(model, "./assets/model_Doc2Vec.joblib")

['./assets/model_Doc2Vec.joblib']

### Compute most similiar article 

Function to preprocess an article.

In [18]:
def preprocess_article(article):
    # Tokenize the article
    words = article.split()

    # Remove punctuation
    punctuation = string.punctuation
    words = [word for word in words if word not in punctuation]

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word.lower() not in stop_words]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    
    return words

Function to compute the cosine similarity between an article and all the articles of the test dataset.

In [19]:
def compute_similarity_scores(article, articles_vec, model):
    article = preprocess_article(article)
    article_vector = model.infer_vector(article)

    vectors = articles_vec['vectors']
    i = 0
    similarity_score_dict = dict()
    for vector in vectors:

        # Reshape the vectors to be 2D arrays with a single row
        vector = vector.reshape(1, -1)
        article_vector = article_vector.reshape(1, -1)

        similarity_score = cosine_similarity(vector, article_vector)[0, 0]
        
        similarity_score_dict[i] = similarity_score
        i += 1

    return similarity_score_dict

Functions to get the top 3 most similar articles.

In [20]:
def most_similar_articles(article, category):
    articles_vec = load("./assets/BBC_News_Preprocessed.joblib")
    model = load("./assets/model_Doc2Vec.joblib")
    
    # Filter articles by the specified category
    articles_vec = articles_vec[articles_vec["Category"] == category]

    scores = compute_similarity_scores(article, articles_vec, model)
    sorted_scores = dict(sorted(scores.items(), key=lambda item: item[1], reverse=True))

    # Get the top three keys (indices)
    top_three_keys = list(sorted_scores.keys())[:3]
    top_three_keys = articles_vec.index[top_three_keys]

    # Extract the corresponding articles
    top_three_articles = articles_vec.loc[top_three_keys, ["Article", "Category"]]  

    return top_three_articles

In [21]:
def get_recommendations(article):
    svc = load("./assets/classifier.joblib")
    category = svc.predict([article])[0]
    print(f"Initial article categorized as {category}.")

    recommendations = most_similar_articles(article, category)
    return recommendations

### Testing the model

In [22]:
articles_test = pd.read_csv("./data/BBC_News_Test.csv")
article = articles_test.iloc[1]["Text"]
print(article)

software watching while you work software that can not only monitor every keystroke and action performed at a pc but also be used as legally binding evidence of wrong-doing has been unveiled.  worries about cyber-crime and sabotage have prompted many employers to consider monitoring employees. the developers behind the system claim it is a break-through in the way data is monitored and stored. but privacy advocates are concerned by the invasive nature of such software.  the system is a joint venture between security firm 3ami and storage specialists bridgehead software. they have joined forces to create a system which can monitor computer activity  store it and retrieve disputed files within minutes. more and more firms are finding themselves in deep water as a result of data misuse. sabotage and data theft are most commonly committed from within an organisation according to the national hi-tech crime unit (nhtcu) a survey conducted on its behalf by nop found evidence that more than 80

In [23]:
get_recommendations(article)

Initial article categorized as tech.


Unnamed: 0,Article,Category
1388,software watching while you work software that...,tech
1007,intel unveils laser breakthrough intel has unv...,tech
1190,rich pickings for hi-tech thieves viruses tro...,tech


## Evaluation (or alternative): keywords extraction

We tried to automate the evaluation of recommendation system using keywords extraction.   
 

For each article in the test dataset, we compare the top 3 articles recommended using cosine similarity and the top 5 articles recommended using the keywords extraction (ranking obtained using the number of keywords in common between the article provided by the user and the others).  
  

The issue with this method is that we're trying to evaluate a method (ranking according to cosine similarity) using another method (ranking according to keywords in common) that is supposed to be less efficient.  
  
This is probably a bad method for evaluation, but we let it here anyway to show out attempt to automate the evaluation process...

### Ranking based on keywords extraction

In [24]:
def rank_article_basedOn_keywords(user_article):
    df = load("./assets/BBC_News_Preprocessed.joblib")
    keywords_user_article = extract_keywords_only(user_article)

    similar_articles = {}

    for idx, article_row in df.iterrows():
        keywords_article = article_row["Keywords"]
        nombre_elements_communs = len(set(keywords_user_article) & set(keywords_article))
        similar_articles[idx] = nombre_elements_communs
        

    sorted_similar_articles = dict(sorted(similar_articles.items(), key=lambda item: item[1], reverse=True))

    # Indices of articles and number of keywords in common with the article provided by the user
    return dict(list(sorted_similar_articles.items())[:5])


In [25]:
# Get recommandation accuracy for one article
def calculate_recommendation_accuracy(recommendations_cosine, top_5_similar_articles_kw):
    # Get top 3 recommended articles indices (using cosine similarity)
    recommended_indices = recommendations_cosine.index.values.tolist()
    
    # Get top 5 recommended articles indices (using keywords)
    top_5_indices = list(top_5_similar_articles_kw.keys())
    
    # Get the intersection 
    common_indices = set(recommended_indices) & set(top_5_indices)
    
    
    # Get "accuracy": if there are 3 articles in common, accuracy is 1. If only 2, accuracy is 2/3, and so on...
    accuracy = len(common_indices) / len(recommended_indices)
    
    return accuracy

### Compute the accuracy of our recommandation system forthe hole dataset

Get test articles (provided by the user)

In [26]:
df_test = pd.read_csv("./data/BBC_News_Test.csv")
articles_test = df_test["Text"]

Get final accuracy for the whole dataset

In [29]:
accuracies = []
for article_test in articles_test :
    recommendations = get_recommendations(article=article_test)
    top_5_similar_articles_indices = rank_article_basedOn_keywords(user_article=article_test)
    accuracies.append(calculate_recommendation_accuracy(recommendations_cosine=recommendations, top_5_similar_articles_kw=top_5_similar_articles_indices))
accuracy = sum(accuracies)/len(accuracies)
print(f"Recommandation System has an accuracy of : {accuracy}")

Initial article categorized as sport.
Initial article categorized as tech.
Initial article categorized as sport.
Initial article categorized as business.
Initial article categorized as sport.
Initial article categorized as sport.
Initial article categorized as politics.
Initial article categorized as politics.
Initial article categorized as entertainment.
Initial article categorized as business.
Initial article categorized as business.
Initial article categorized as tech.
Initial article categorized as politics.
Initial article categorized as tech.
Initial article categorized as entertainment.
Initial article categorized as sport.
Initial article categorized as politics.
Initial article categorized as tech.
Initial article categorized as entertainment.
Initial article categorized as entertainment.
Initial article categorized as business.
Initial article categorized as politics.
Initial article categorized as sport.
Initial article categorized as business.
Initial article categorized as

In [30]:
accuracy

0.1455782312925168

The accuracy is low, but it doesn't really mean something...

## Manual evaluation

To evaluate the recommendation system, we provide one article non classified and take 10 articles in the dataset as the articles to be recommended.  
We individually choose 3 articles that appear to us as the most similar and see if it aligns with what the model predicted.