In [360]:
import pandas as pd
import numpy as np

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.svm import SVC

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import string

# import ast

from joblib import dump, load

## Recommending Books using Doc2Vec

### Approach:
1. **Train a Doc2Vec Model:**
   - Utilize Doc2Vec to transform entire articles into continuous vector representations.
   - Train the model to capture contextual information and semantic meaning of each document.

2. **Compute Similarity:**
   - Measure similarity between articles based on the cosine similarity of their Doc2Vec vectors.
   - Cosine similarity provides a metric for how closely the semantic meanings align.

3. **Benefits of Doc2Vec:**
   - Doc2Vec models excel in capturing contextual nuances and semantic relationships within entire documents.

This approach enables the recommendation of books based on their semantic similarity, allowing for a more nuanced understanding of content beyond simple keyword matching.


## Train a Doc2Vec Model

### Vectorisation of articles 

In [361]:
df = pd.read_csv("../data/BBC_News_Train_PREPROCESSED.csv")
documents = df['Text']

In [362]:
tagged_data = [TaggedDocument(words=doc, tags=[str(i)]) for i, doc in enumerate(documents)]

# Training the Doc2Vec model
vector_size = 100
model = Doc2Vec(vector_size=vector_size, window=5, min_count=1, workers=4, epochs=10)
model.build_vocab(tagged_data)
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

# Retrieving vectors for existing documents
existing_document_index = 0
existing_document_vector = model.dv[existing_document_index]
existing_document_vector

array([-0.12892829, -0.14085585, -0.06172242,  0.15094805, -0.16697381,
        0.11059604, -0.05945716, -0.02944281, -0.26169553, -0.06190001,
       -0.0919568 , -0.09961358, -0.08542875,  0.0278306 , -0.04654634,
       -0.15304457,  0.15102224,  0.13160914,  0.00130734, -0.23104867,
        0.14889644,  0.05780197,  0.14653105,  0.08296006,  0.00786588,
        0.05391062, -0.36035788, -0.03804403,  0.02786314, -0.04631222,
        0.4705585 ,  0.06089744, -0.04739401, -0.14474416, -0.16000086,
       -0.12576482, -0.16102254, -0.03453794, -0.15830232,  0.01161368,
        0.06650274,  0.01028084,  0.18908545, -0.00852653,  0.04990214,
       -0.16647975,  0.03139812, -0.10031002,  0.08946379, -0.09847921,
       -0.04651788,  0.00546268, -0.06035695, -0.06309605, -0.05641305,
        0.12740244, -0.02256392,  0.0020253 ,  0.02265512,  0.05173049,
        0.04746727,  0.03716122,  0.29919875,  0.16928664,  0.00595477,
        0.10677752,  0.07032375,  0.11482392, -0.04068539,  0.03

Pour vectoriser un nouveau document, il suffira d'éxecuter les lignes suivantes : 

`new_document = ['new', 'document', 'to', 'vectorize']`<br>
`vector = model.infer_vector(new_document)`

In [363]:
def get_vector(row):
    return model.dv[row.name]

# Apply the function to create the 'vectors' column
df['vectors'] = df.apply(get_vector, axis=1)

In [364]:
df['vectors'].loc[0]

array([-0.12892829, -0.14085585, -0.06172242,  0.15094805, -0.16697381,
        0.11059604, -0.05945716, -0.02944281, -0.26169553, -0.06190001,
       -0.0919568 , -0.09961358, -0.08542875,  0.0278306 , -0.04654634,
       -0.15304457,  0.15102224,  0.13160914,  0.00130734, -0.23104867,
        0.14889644,  0.05780197,  0.14653105,  0.08296006,  0.00786588,
        0.05391062, -0.36035788, -0.03804403,  0.02786314, -0.04631222,
        0.4705585 ,  0.06089744, -0.04739401, -0.14474416, -0.16000086,
       -0.12576482, -0.16102254, -0.03453794, -0.15830232,  0.01161368,
        0.06650274,  0.01028084,  0.18908545, -0.00852653,  0.04990214,
       -0.16647975,  0.03139812, -0.10031002,  0.08946379, -0.09847921,
       -0.04651788,  0.00546268, -0.06035695, -0.06309605, -0.05641305,
        0.12740244, -0.02256392,  0.0020253 ,  0.02265512,  0.05173049,
        0.04746727,  0.03716122,  0.29919875,  0.16928664,  0.00595477,
        0.10677752,  0.07032375,  0.11482392, -0.04068539,  0.03

In [365]:
df["vectors"].info()

<class 'pandas.core.series.Series'>
RangeIndex: 1490 entries, 0 to 1489
Series name: vectors
Non-Null Count  Dtype 
--------------  ----- 
1490 non-null   object
dtypes: object(1)
memory usage: 11.8+ KB


In [366]:
dump(df, "../assets/BBC_News_Vectorized.joblib")
dump(model, "../assets/model_Doc2Vec.joblib")

['../assets/model_Doc2Vec.joblib']

### Compute most similiar article 

In [367]:
def preprocess_article(article):
    # Tokenize the article
    words = article.split()

    # Remove punctuation
    punctuation = string.punctuation
    words = [word for word in words if word not in punctuation]

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word.lower() not in stop_words]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    
    return words

In [368]:
def compute_similarity_scores(article, articles_vec, model):
    article = preprocess_article(article)
    article_vector = model.infer_vector(article)

    vectors = articles_vec['vectors']
    i = 0
    similarity_score_dict = dict()
    for vector in vectors:

        # Reshape the vectors to be 2D arrays with a single row
        vector = vector.reshape(1, -1)
        article_vector = article_vector.reshape(1, -1)

        similarity_score = cosine_similarity(vector, article_vector)[0, 0]
        
        similarity_score_dict[i] = similarity_score
        i += 1

    return similarity_score_dict

In [369]:
def most_similar_articles(article, category):
    articles_vec = load("../assets/BBC_News_Vectorized.joblib")
    articles_vec.set_index(["Unnamed: 0"],inplace=True)
    model = load("../assets/model_Doc2Vec.joblib")
    
    # Get the initial DataFrame with the articles before processing
    articles = pd.read_csv("../data/BBC_News_Train.csv")

    # add articles to the vectorized df
    articles_vec["Article"] = articles["Text"]
    
    # Filter articles by the specified category
    articles_vec = articles_vec[articles_vec["Category"] == category]

    scores = compute_similarity_scores(article, articles_vec, model)
    sorted_scores = dict(sorted(scores.items(), key=lambda item: item[1], reverse=True))

    # Get the top three keys (indices)
    top_three_keys = list(sorted_scores.keys())[:3]
    top_three_keys = articles_vec.index[top_three_keys]

    # Extract the corresponding articles
    top_three_articles = articles_vec.loc[top_three_keys, ["Article", "Category"]]  

    return top_three_articles

In [370]:
def get_recommendations(article):
    svc = load("../assets/SVC.joblib")
    category = svc.predict([article])[0]
    print(f"Initial article categorized as {category}.")

    recommendations = most_similar_articles(article, category)
    return recommendations

### Testing the model

In [371]:
articles_test = pd.read_csv("../data/BBC_News_Test.csv")
article = articles_test.iloc[1]["Text"]
print(article)

software watching while you work software that can not only monitor every keystroke and action performed at a pc but also be used as legally binding evidence of wrong-doing has been unveiled.  worries about cyber-crime and sabotage have prompted many employers to consider monitoring employees. the developers behind the system claim it is a break-through in the way data is monitored and stored. but privacy advocates are concerned by the invasive nature of such software.  the system is a joint venture between security firm 3ami and storage specialists bridgehead software. they have joined forces to create a system which can monitor computer activity  store it and retrieve disputed files within minutes. more and more firms are finding themselves in deep water as a result of data misuse. sabotage and data theft are most commonly committed from within an organisation according to the national hi-tech crime unit (nhtcu) a survey conducted on its behalf by nop found evidence that more than 80

In [372]:
get_recommendations(article)

Initial article categroized as tech.


Unnamed: 0_level_0,Article,Category
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1
1072,games maker fights for survival one of britain...,tech
616,web radio takes spanish rap global spin the ra...,tech
899,cyber criminals step up the pace so-called phi...,tech
