In [118]:
import pandas as pd
import numpy as np

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.svm import SVC

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import string

# import ast

from joblib import dump, load

## Recommending Books using Doc2Vec

### Approach:
1. **Train a Doc2Vec Model:**
   - Utilize Doc2Vec to transform entire articles into continuous vector representations.
   - Train the model to capture contextual information and semantic meaning of each document.

2. **Compute Similarity:**
   - Measure similarity between articles based on the cosine similarity of their Doc2Vec vectors.
   - Cosine similarity provides a metric for how closely the semantic meanings align.

3. **Benefits of Doc2Vec:**
   - Doc2Vec models excel in capturing contextual nuances and semantic relationships within entire documents.

This approach enables the recommendation of books based on their semantic similarity, allowing for a more nuanced understanding of content beyond simple keyword matching.


## Train a Doc2Vec Model

### Vectorisation of articles 

In [119]:
df = pd.read_csv("../data/BBC_News_Train_PREPROCESSED.csv")
documents = df['Text']

In [120]:
tagged_data = [TaggedDocument(words=doc, tags=[str(i)]) for i, doc in enumerate(documents)]

# Training the Doc2Vec model
vector_size = 100
model = Doc2Vec(vector_size=vector_size, window=5, min_count=1, workers=4, epochs=10)
model.build_vocab(tagged_data)
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

# Retrieving vectors for existing documents
existing_document_index = 0
existing_document_vector = model.dv[existing_document_index]
existing_document_vector

array([-0.16820695, -0.16386934, -0.11466306,  0.17859745,  0.02471507,
       -0.13082625, -0.02174445,  0.19763207, -0.40533647, -0.1359295 ,
        0.03789003, -0.16246106,  0.03443849,  0.00705553,  0.05489259,
       -0.05401694,  0.00959428,  0.20757201, -0.13236567, -0.43132684,
       -0.05026904,  0.04216481,  0.05836882,  0.0348097 ,  0.07203481,
        0.05889795, -0.24727908, -0.21111465,  0.10123955, -0.08069197,
        0.2861252 ,  0.013252  , -0.02052416, -0.1765319 , -0.0586794 ,
        0.19217052,  0.05463048, -0.23824358, -0.1140677 , -0.00863943,
        0.08923928, -0.17095836, -0.00069714, -0.00612225,  0.1396039 ,
       -0.2001712 , -0.07856872, -0.1913196 ,  0.17930265, -0.09241138,
       -0.0340836 , -0.09832505, -0.2480383 , -0.01755597,  0.00226811,
        0.23376927,  0.00194942,  0.01123526,  0.04868018,  0.04661107,
       -0.03506247,  0.1122677 ,  0.25407228,  0.05757903, -0.21821968,
        0.14918123, -0.06852049,  0.04628516, -0.1304316 ,  0.00

Pour vectoriser un nouveau document, il suffira d'Ã©xecuter les lignes suivantes : 

`new_document = ['new', 'document', 'to', 'vectorize']`<br>
`vector = model.infer_vector(new_document)`

In [121]:
def get_vector(row):
    return model.dv[row.name]

# Apply the function to create the 'vectors' column
df['vectors'] = df.apply(get_vector, axis=1)

In [122]:
df.to_csv("../data/vectorized_articles.csv")
dump(model, "../assets/model_Doc2Vec.joblib")

['../assets/model_Doc2Vec.joblib']

### Compute most similiar article 

In [123]:
def preprocess_article(article):
    # Tokenize the article
    words = article.split()

    # Remove punctuation
    punctuation = string.punctuation
    words = [word for word in words if word not in punctuation]

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word.lower() not in stop_words]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    
    return words

In [124]:
def compute_similarity_scores(article, articles, model):
    article = preprocess_article(article)
    article_vector = model.infer_vector(article)

    vectors = articles['vectors']
    i = 0
    similarity_score_dict = dict()
    for vector in vectors:

        # Reshape the vectors to be 2D arrays with a single row
        vector = vector.reshape(1, -1)
        article_vector = article_vector.reshape(1, -1)

        similarity_score = cosine_similarity(vector, article_vector)[0, 0]

        similarity_score_dict[i] = similarity_score
        i += 1

    return similarity_score_dict

In [125]:
def most_similar_articles(article, category):
    articles = pd.read_csv("../data/vectorized_articles.csv")
    articles = articles[articles["Category"] == category]

    model = load("../assets/model_Doc2Vec.joblib")

    scores = compute_similarity_scores(article,articles, model)
    sorted_scores = dict(sorted(scores.items(), key=lambda item: item[1], reverse=True))

    top_three_keys = list(sorted_scores.keys())[:3]

    top_three_articles_df = df.loc[top_three_keys]

    return top_three_articles_df["Text"].to_list()

In [126]:
articles = pd.read_csv("../data/vectorized_articles.csv")
articles = articles["vectors"]
type(articles.loc[0])

str

In [127]:
articles = pd.read_csv("../data/vectorized_articles.csv")
articles = articles[articles["Category"] == "tech"]
articles.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,ArticleId,Text,Category,vectors
3,3,3,1976,lifestyle governs mobile choice faster better ...,tech,[-0.16623871 -0.12261564 -0.06123098 0.105811...
19,19,19,1552,moving mobile improves golf swing mobile phone...,tech,[ 0.01223124 0.02895926 -0.01085722 0.060944...
24,24,24,405,bt boost broadband package british telecom sai...,tech,[-4.17283475e-02 -2.03106582e-01 5.38718328e-...
26,26,26,702,peertopeer net stay peertopeer p2p network sta...,tech,[-5.94673097e-01 -2.43753478e-01 1.41981453e-...
30,30,30,1951,pompeii get digital makeover oldfashioned audi...,tech,[-0.28353983 0.08798035 -0.1239863 0.418030...


### Testing the model

In [128]:
svc = load("../assets/SVC.joblib")
articles_test = pd.read_csv("../data/BBC_News_Test.csv")

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [129]:
article = articles_test.iloc[1]["Text"]
print(article)

software watching while you work software that can not only monitor every keystroke and action performed at a pc but also be used as legally binding evidence of wrong-doing has been unveiled.  worries about cyber-crime and sabotage have prompted many employers to consider monitoring employees. the developers behind the system claim it is a break-through in the way data is monitored and stored. but privacy advocates are concerned by the invasive nature of such software.  the system is a joint venture between security firm 3ami and storage specialists bridgehead software. they have joined forces to create a system which can monitor computer activity  store it and retrieve disputed files within minutes. more and more firms are finding themselves in deep water as a result of data misuse. sabotage and data theft are most commonly committed from within an organisation according to the national hi-tech crime unit (nhtcu) a survey conducted on its behalf by nop found evidence that more than 80

In [130]:
category = svc.predict([article])[0]
category

'tech'

The result seems coherent and correct. Let us continue.

In [131]:
most_similar_articles(article, category)

AttributeError: 'str' object has no attribute 'reshape'