In [439]:
import pandas as pd
import numpy as np
import yake as yk

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.svm import SVC

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import string

from joblib import dump, load

## Recommending Books using Doc2Vec

### Approach:
1. **Train a Doc2Vec Model:**
   - Utilize Doc2Vec to transform entire articles into continuous vector representations.
   - Train the model to capture contextual information and semantic meaning of each document.

2. **Compute Similarity:**
   - Measure similarity between articles based on the cosine similarity of their Doc2Vec vectors.
   - Cosine similarity provides a metric for how closely the semantic meanings align.

3. **Benefits of Doc2Vec:**
   - Doc2Vec models excel in capturing contextual nuances and semantic relationships within entire documents.

This approach enables the recommendation of article based on their semantic similarity.


## Train a Doc2Vec Model

### Vectorisation of articles 

In [440]:
df = pd.read_csv("../data/BBC_News_Train_PREPROCESSED.csv")
documents = df['Text']

In [441]:
tagged_data = [TaggedDocument(words=doc, tags=[str(i)]) for i, doc in enumerate(documents)]

# Training the Doc2Vec model
vector_size = 100
model = Doc2Vec(vector_size=vector_size, window=5, min_count=1, workers=4, epochs=10)
model.build_vocab(tagged_data)
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

# Retrieving vectors for existing documents
existing_document_index = 0
existing_document_vector = model.dv[existing_document_index]
existing_document_vector

array([-0.18654838,  0.0320508 , -0.10247894,  0.10078338, -0.15181562,
       -0.05612711, -0.01862613,  0.02214451, -0.4313594 , -0.0611221 ,
        0.06436293, -0.0086362 ,  0.01427858,  0.05466851,  0.01077656,
       -0.08681717,  0.2361141 ,  0.11933956, -0.22163433, -0.42892206,
       -0.10107687,  0.04794652,  0.14405523,  0.03136609, -0.04635189,
        0.09047499, -0.30306217, -0.11784234,  0.10618225, -0.06864321,
        0.43203887,  0.10327868, -0.06775154,  0.01380885,  0.05947003,
       -0.04138461, -0.10150874, -0.14420663, -0.02861814, -0.07934901,
        0.0624795 ,  0.0052607 ,  0.0936746 , -0.11308631, -0.09200615,
       -0.20891182, -0.0258419 , -0.10020083,  0.09570782, -0.06443476,
       -0.0603913 , -0.07906148, -0.22281295, -0.04093896,  0.01881555,
        0.10061052,  0.0364387 ,  0.05460335,  0.06212843,  0.10394233,
        0.01302209,  0.0193369 ,  0.11672506,  0.13054305, -0.1261743 ,
        0.05025452,  0.02010757,  0.10371368, -0.21474852,  0.06

To vectorize a new document, we should execute : 

`new_document = ['new', 'document', 'to', 'vectorize']`<br>
`vector = model.infer_vector(new_document)`

In [442]:
def get_vector(row):
    return model.dv[row.name]

# Apply the function to create the 'vectors' column
df['vectors'] = df.apply(get_vector, axis=1)

In [443]:
df['vectors'].loc[0]

array([-0.18654838,  0.0320508 , -0.10247894,  0.10078338, -0.15181562,
       -0.05612711, -0.01862613,  0.02214451, -0.4313594 , -0.0611221 ,
        0.06436293, -0.0086362 ,  0.01427858,  0.05466851,  0.01077656,
       -0.08681717,  0.2361141 ,  0.11933956, -0.22163433, -0.42892206,
       -0.10107687,  0.04794652,  0.14405523,  0.03136609, -0.04635189,
        0.09047499, -0.30306217, -0.11784234,  0.10618225, -0.06864321,
        0.43203887,  0.10327868, -0.06775154,  0.01380885,  0.05947003,
       -0.04138461, -0.10150874, -0.14420663, -0.02861814, -0.07934901,
        0.0624795 ,  0.0052607 ,  0.0936746 , -0.11308631, -0.09200615,
       -0.20891182, -0.0258419 , -0.10020083,  0.09570782, -0.06443476,
       -0.0603913 , -0.07906148, -0.22281295, -0.04093896,  0.01881555,
        0.10061052,  0.0364387 ,  0.05460335,  0.06212843,  0.10394233,
        0.01302209,  0.0193369 ,  0.11672506,  0.13054305, -0.1261743 ,
        0.05025452,  0.02010757,  0.10371368, -0.21474852,  0.06

### Adding articles with no process to the DF

In [444]:
df.set_index(["Unnamed: 0"],inplace=True)
# Get the initial DataFrame with the articles before processing
articles = pd.read_csv("../data/BBC_News_Train.csv")

# add articles to the vectorized df
df["Article"] = articles["Text"]

### Extraction of keywords 

In [445]:
kw_extractor = yk.KeywordExtractor(top=10)

In [446]:
def extract_keywords_only(article):
    keywords = []
    keywords_and_weight = kw_extractor.extract_keywords(article)

    for couple in keywords_and_weight :
        keywords.append(couple[0])
    
    return keywords

In [447]:
df["Keywords"] = df["Article"].apply(lambda article: extract_keywords_only(article))

### Export the dataset

In [448]:
df.head(3)

Unnamed: 0_level_0,ArticleId,Text,Category,vectors,Article,Keywords
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1833,worldcom exboss launch defence lawyer defendin...,business,"[-0.18654838, 0.032050803, -0.102478944, 0.100...",worldcom ex-boss launches defence lawyers defe...,"[ex-boss launches defence, launches defence la..."
1,154,german business confidence slide german busine...,business,"[-0.12748794, 0.0774179, 0.08212813, 0.2218682...",german business confidence slides german busin...,"[february knocking hopes, business confidence ..."
2,1101,bbc poll indicates economic gloom citizen majo...,business,"[-0.24952345, -0.12041191, -0.110438704, -0.02...",bbc poll indicates economic gloom citizens in ...,"[world service poll, world economy, economic g..."


In [449]:
dump(df, "../assets/BBC_News_Processed.joblib")
dump(model, "../assets/model_Doc2Vec.joblib")

['../assets/model_Doc2Vec.joblib']

### Compute most similiar article 

In [450]:
def preprocess_article(article):
    # Tokenize the article
    words = article.split()

    # Remove punctuation
    punctuation = string.punctuation
    words = [word for word in words if word not in punctuation]

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word.lower() not in stop_words]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    
    return words

In [451]:
def compute_similarity_scores(article, articles_vec, model):
    article = preprocess_article(article)
    article_vector = model.infer_vector(article)

    vectors = articles_vec['vectors']
    i = 0
    similarity_score_dict = dict()
    for vector in vectors:

        # Reshape the vectors to be 2D arrays with a single row
        vector = vector.reshape(1, -1)
        article_vector = article_vector.reshape(1, -1)

        similarity_score = cosine_similarity(vector, article_vector)[0, 0]
        
        similarity_score_dict[i] = similarity_score
        i += 1

    return similarity_score_dict

In [452]:
def most_similar_articles(article, category):
    articles_vec = load("../assets/BBC_News_Processed.joblib")
    model = load("../assets/model_Doc2Vec.joblib")
    
    # Filter articles by the specified category
    articles_vec = articles_vec[articles_vec["Category"] == category]

    scores = compute_similarity_scores(article, articles_vec, model)
    sorted_scores = dict(sorted(scores.items(), key=lambda item: item[1], reverse=True))

    # Get the top three keys (indices)
    top_three_keys = list(sorted_scores.keys())[:3]
    top_three_keys = articles_vec.index[top_three_keys]

    # Extract the corresponding articles
    top_three_articles = articles_vec.loc[top_three_keys, ["Article", "Category"]]  

    return top_three_articles

In [453]:
def get_recommendations(article):
    svc = load("../assets/SVC.joblib")
    category = svc.predict([article])[0]
    print(f"Initial article categorized as {category}.")

    recommendations = most_similar_articles(article, category)
    return recommendations

### Testing the model

In [454]:
articles_test = pd.read_csv("../data/BBC_News_Test.csv")
article = articles_test.iloc[1]["Text"]
print(article)

software watching while you work software that can not only monitor every keystroke and action performed at a pc but also be used as legally binding evidence of wrong-doing has been unveiled.  worries about cyber-crime and sabotage have prompted many employers to consider monitoring employees. the developers behind the system claim it is a break-through in the way data is monitored and stored. but privacy advocates are concerned by the invasive nature of such software.  the system is a joint venture between security firm 3ami and storage specialists bridgehead software. they have joined forces to create a system which can monitor computer activity  store it and retrieve disputed files within minutes. more and more firms are finding themselves in deep water as a result of data misuse. sabotage and data theft are most commonly committed from within an organisation according to the national hi-tech crime unit (nhtcu) a survey conducted on its behalf by nop found evidence that more than 80

In [455]:
get_recommendations(article)

Initial article categorized as tech.


Unnamed: 0_level_0,Article,Category
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1
48,halo 2 sells five million copies microsoft is ...,tech
120,apple laptop is greatest gadget the apple po...,tech
91,2d metal slug offers retro fun like some drill...,tech


## Evaluation

Correspondances des mots clés entre l’article entré par le User et le classement des articles recommandés (le nombre de mots-clés correspondant doit )

### Similiarity of keywords

In [464]:
def rank_article_basedOn_keywords(user_article):
    df = load("../assets/BBC_News_Processed.joblib")
    keywords_user_article = extract_keywords_only(user_article)

    similar_articles = {}

    for idx, article_row in df.iterrows():
        keywords_article = article_row["Keywords"]
        nombre_elements_communs = len(set(keywords_user_article) & set(keywords_article))
        similar_articles[idx] = nombre_elements_communs

    sorted_similar_articles = dict(sorted(similar_articles.items(), key=lambda item: item[1], reverse=True))

    return dict(list(sorted_similar_articles.items())[:10])


In [465]:
def calculate_recommendation_accuracy(recommendations, top_5_similar_articles):
    # Obtenez les indices des articles recommandés
    recommended_indices = recommendations.index.values.tolist()
    
    # Obtenez les indices des articles du top 5
    top_5_indices = list(top_5_similar_articles.keys())
    
    # Calculez l'intersection des deux ensembles
    common_indices = set(recommended_indices) & set(top_5_indices)
    
    # Calculez le taux d'exactitude
    accuracy = len(common_indices) / len(recommended_indices)
    
    return accuracy

### Compute the accuracy of our recommandation system

In [466]:
df_test = pd.read_csv("../data/BBC_News_Test.csv")
articles_test = df_test["Text"]

In [467]:
accuracies = []
for article_test in articles_test :
    recommandations = get_recommendations(article=article_test)
    top_5_similar_articles = rank_article_basedOn_keywords(user_article=article_test)
    accuracies.append(calculate_recommendation_accuracy(recommendations=recommandations, top_5_similar_articles=top_5_similar_articles))
accuracy = sum(accuracies)/len(accuracies)
print(f"Recommandation System has an accuracy of : {accuracy}")

Initial article categorized as sport.
Initial article categorized as tech.
Initial article categorized as sport.
Initial article categorized as business.
Initial article categorized as sport.
Initial article categorized as sport.
Initial article categorized as politics.
Initial article categorized as politics.
Initial article categorized as entertainment.
Initial article categorized as business.
Initial article categorized as business.
Initial article categorized as tech.
Initial article categorized as politics.
Initial article categorized as tech.
Initial article categorized as entertainment.
Initial article categorized as sport.
Initial article categorized as politics.
Initial article categorized as tech.
Initial article categorized as entertainment.
Initial article categorized as entertainment.
Initial article categorized as business.
Initial article categorized as politics.
Initial article categorized as sport.
Initial article categorized as business.
Initial article categorized as