In [306]:
import pandas as pd
import numpy as np

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.svm import SVC

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import string

# import ast

from joblib import dump, load

## Recommending Books using Doc2Vec

### Approach:
1. **Train a Doc2Vec Model:**
   - Utilize Doc2Vec to transform entire articles into continuous vector representations.
   - Train the model to capture contextual information and semantic meaning of each document.

2. **Compute Similarity:**
   - Measure similarity between articles based on the cosine similarity of their Doc2Vec vectors.
   - Cosine similarity provides a metric for how closely the semantic meanings align.

3. **Benefits of Doc2Vec:**
   - Doc2Vec models excel in capturing contextual nuances and semantic relationships within entire documents.

This approach enables the recommendation of books based on their semantic similarity, allowing for a more nuanced understanding of content beyond simple keyword matching.


## Train a Doc2Vec Model

### Vectorisation of articles 

In [307]:
df = pd.read_csv("../data/BBC_News_Train_PREPROCESSED.csv")
documents = df['Text']

In [308]:
tagged_data = [TaggedDocument(words=doc, tags=[str(i)]) for i, doc in enumerate(documents)]

# Training the Doc2Vec model
vector_size = 100
model = Doc2Vec(vector_size=vector_size, window=5, min_count=1, workers=4, epochs=10)
model.build_vocab(tagged_data)
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

# Retrieving vectors for existing documents
existing_document_index = 0
existing_document_vector = model.dv[existing_document_index]
existing_document_vector

array([ 0.04412849,  0.04196702, -0.1751237 , -0.05107028, -0.02442005,
       -0.03083043,  0.03644574, -0.03671683, -0.356493  , -0.05342098,
       -0.01799106,  0.07412537,  0.0735252 , -0.00090576,  0.06554895,
       -0.1527184 ,  0.17697322,  0.02930997, -0.11212368, -0.23931308,
       -0.10960032, -0.1447083 , -0.01049796, -0.06360933, -0.07149747,
       -0.12323113, -0.25467908, -0.16970187,  0.08936521, -0.02387957,
        0.23850912,  0.08414102, -0.03859787,  0.10782184,  0.03359393,
        0.20117247, -0.12455205, -0.0595403 , -0.00265377, -0.02435957,
        0.07036934, -0.07602061,  0.04109928,  0.03612635,  0.01235057,
        0.00465285,  0.02722651, -0.0761276 ,  0.02662544, -0.13655598,
       -0.11038294, -0.11001816, -0.07485324, -0.02365346, -0.11764125,
        0.06043214, -0.02589022, -0.00209121,  0.10730504, -0.1039249 ,
       -0.00792989, -0.08163863,  0.13263145, -0.06044759, -0.07666414,
        0.09355614, -0.02641441,  0.05969002, -0.17673804, -0.08

Pour vectoriser un nouveau document, il suffira d'Ã©xecuter les lignes suivantes : 

`new_document = ['new', 'document', 'to', 'vectorize']`<br>
`vector = model.infer_vector(new_document)`

In [309]:
def get_vector(row):
    return model.dv[row.name]

# Apply the function to create the 'vectors' column
df['vectors'] = df.apply(get_vector, axis=1)

In [310]:
df['vectors'].loc[0]

array([ 0.04412849,  0.04196702, -0.1751237 , -0.05107028, -0.02442005,
       -0.03083043,  0.03644574, -0.03671683, -0.356493  , -0.05342098,
       -0.01799106,  0.07412537,  0.0735252 , -0.00090576,  0.06554895,
       -0.1527184 ,  0.17697322,  0.02930997, -0.11212368, -0.23931308,
       -0.10960032, -0.1447083 , -0.01049796, -0.06360933, -0.07149747,
       -0.12323113, -0.25467908, -0.16970187,  0.08936521, -0.02387957,
        0.23850912,  0.08414102, -0.03859787,  0.10782184,  0.03359393,
        0.20117247, -0.12455205, -0.0595403 , -0.00265377, -0.02435957,
        0.07036934, -0.07602061,  0.04109928,  0.03612635,  0.01235057,
        0.00465285,  0.02722651, -0.0761276 ,  0.02662544, -0.13655598,
       -0.11038294, -0.11001816, -0.07485324, -0.02365346, -0.11764125,
        0.06043214, -0.02589022, -0.00209121,  0.10730504, -0.1039249 ,
       -0.00792989, -0.08163863,  0.13263145, -0.06044759, -0.07666414,
        0.09355614, -0.02641441,  0.05969002, -0.17673804, -0.08

In [311]:
df["vectors"].info()

<class 'pandas.core.series.Series'>
RangeIndex: 1490 entries, 0 to 1489
Series name: vectors
Non-Null Count  Dtype 
--------------  ----- 
1490 non-null   object
dtypes: object(1)
memory usage: 11.8+ KB


In [312]:
dump(df, "../assets/BBC_News_Vectorized.joblib")
dump(model, "../assets/model_Doc2Vec.joblib")

['../assets/model_Doc2Vec.joblib']

### Compute most similiar article 

In [313]:
def preprocess_article(article):
    # Tokenize the article
    words = article.split()

    # Remove punctuation
    punctuation = string.punctuation
    words = [word for word in words if word not in punctuation]

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word.lower() not in stop_words]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    
    return words

In [314]:
def compute_similarity_scores(article, articles_vec, model):
    article = preprocess_article(article)
    article_vector = model.infer_vector(article)

    vectors = articles_vec['vectors']
    i = 0
    similarity_score_dict = dict()
    for vector in vectors:

        # Reshape the vectors to be 2D arrays with a single row
        vector = vector.reshape(1, -1)
        article_vector = article_vector.reshape(1, -1)

        similarity_score = cosine_similarity(vector, article_vector)[0, 0]
        
        similarity_score_dict[i] = similarity_score
        i += 1

    return similarity_score_dict

In [315]:
def most_similar_articles(article, category):
    # Load precomputed vectors
    articles_vec = load("../assets/BBC_News_Vectorized.joblib")
    articles_vec.set_index(["Unnamed: 0"],inplace=True)
    
    # Get the initial DataFrame with the articles before processing
    articles = pd.read_csv("../data/BBC_News_Train.csv")

    # Combine vectors with the original DataFrame
    articles_vec["Article"] = articles["Text"]

    print(articles_vec.info())
    
    # Filter articles by the specified category
    articles_vec = articles_vec[articles_vec["Category"] == category]

    # Load the Doc2Vec model
    model = load("../assets/model_Doc2Vec.joblib")

    # Compute similarity scores
    scores = compute_similarity_scores(article, articles_vec, model)
    
    # Sort the scores in descending order
    sorted_scores = dict(sorted(scores.items(), key=lambda item: item[1], reverse=True))

    # Get the top three keys (indices)
    top_three_keys = list(sorted_scores.keys())[:3]
    top_three_keys = articles_vec.index[top_three_keys]

    # Extract the corresponding articles
    top_three_articles = articles_vec.loc[top_three_keys, ["Article", "Category"]]  

    return top_three_articles

### Testing the model

In [316]:
svc = load("../assets/SVC.joblib")
articles_test = pd.read_csv("../data/BBC_News_Test.csv")

In [317]:
article = articles_test.iloc[1]["Text"]
print(article)

software watching while you work software that can not only monitor every keystroke and action performed at a pc but also be used as legally binding evidence of wrong-doing has been unveiled.  worries about cyber-crime and sabotage have prompted many employers to consider monitoring employees. the developers behind the system claim it is a break-through in the way data is monitored and stored. but privacy advocates are concerned by the invasive nature of such software.  the system is a joint venture between security firm 3ami and storage specialists bridgehead software. they have joined forces to create a system which can monitor computer activity  store it and retrieve disputed files within minutes. more and more firms are finding themselves in deep water as a result of data misuse. sabotage and data theft are most commonly committed from within an organisation according to the national hi-tech crime unit (nhtcu) a survey conducted on its behalf by nop found evidence that more than 80

In [318]:
category = svc.predict([article])[0]
category

'tech'

The result seems coherent and correct. Let us continue.

In [319]:
most_similar_articles(article, category)

<class 'pandas.core.frame.DataFrame'>
Index: 1490 entries, 0 to 1489
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ArticleId  1490 non-null   int64 
 1   Text       1490 non-null   object
 2   Category   1490 non-null   object
 3   vectors    1490 non-null   object
 4   Article    1490 non-null   object
dtypes: int64(1), object(4)
memory usage: 69.8+ KB
None


KeyError: "None of [Index([22, 252, 21], dtype='int32', name='Unnamed: 0')] are in the [index]"