## PIP INSTALL

In [1]:
#!pip install torch torchvision torchaudio

In [2]:
#!pip install requests beautifulsoup4 gensim nltk spacy transformers
  

In [3]:
#!pip install bert-embedding

In [4]:
#!python -m spacy download fr_core_news_lg

In [5]:
#!pip uninstall -y numpy

In [6]:
#!pip install numpy

## Preprocess

In [7]:
from gensim.parsing.preprocessing import remove_stopwords
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
import spacy
nlp = spacy.load("fr_core_news_lg") 



  "class": algorithms.Blowfish,


In [8]:
from gensim.models import TfidfModel
from gensim.corpora import Dictionary

In [9]:
import pandas as pd


QA_df = pd.read_csv("winamax_faq.csv",index_col=0)
QA_df

Unnamed: 0,questions,answers
0,Comment compléter mon inscription ?,Pour compléter votre inscription et accéder au...
1,Mes documents ont été validés mais mon compte ...,Une fois votre compte validé (pièce d’identité...
2,J’ai renseigné une date de naissance incorrect...,"Afin que vous puissiez vous connecter, nous vo..."
3,Je n'ai pas reçu mon email d'activation,Il se peut qu'un logiciel de type anti-spam ai...
4,J'ai oublié mon mot de passe,"Pour vous connecter à Winamax, vous devez indi..."
...,...,...
59,L’un des paris de mon MyMatch a été annulé,Si une sélection est annulée à l’intérieur d’u...
60,Les paris sportifs permettent-ils de remporter...,"Lorsque vous placez des paris sportifs, vous c..."
61,Que sont les Freebets ?,Les Freebets sont des paris gratuits offerts p...
62,Comment fonctionnent les paris système ?,Les paris système vous permettent de parier en...


In [10]:
## Data Preprocessing
import re
class TextPreprocessor():
    def __init__(self, data_df, column_name=None):
        self.data_df = data_df  
        if not column_name and type(colum_name) == str:
            raise Exception("column name is mandatory. Make sure type is string format")
        self.column = column_name
        self.convert_lowercase()    
        self.applied_stopword = False
        self.processed_column_name = f"processed_{self.column}"
        
    def convert_lowercase(self):
        ## fill empty values into empty
        self.data_df.fillna('',inplace=True)
        ## reduce all the columns to lowercase
        self.data_df = self.data_df.apply(lambda column: column.astype(str).str.lower(), axis=0)    

    def remove_question_no(self):
        ## remove question no        
        self.data_df[self.column] = self.data_df[self.column].apply(lambda row: re.sub(r'^\d+[.]',' ', row))    
        
    def remove_symbols(self):
        ## remove unwanted character          
        self.data_df[self.column] = self.data_df[self.column].apply(lambda row: re.sub(r'[^A-Za-z0-9\sàâäéèêëïîôöùûüÿç]', ' ', row))    

    def remove_stopwords(self):
        ## remove stopwords and create a new column 
        for idx, question in enumerate(self.data_df[self.column]):      
            self.data_df.loc[idx, self.processed_column_name] = remove_stopwords(question)        

    def apply_lemmatization(self, perform_stopword):
        ## get the root words to reduce inflection of words 
        lemmatizer = WordNetLemmatizer()    
        ## get the column name to perform lemma operation whether stopwords removed text or not
        if perform_stopword:
            column_name = self.processed_column_name
        else:
            column_name = self.column
        ## iterate every question, perform tokenize and lemma
        for idx, question in enumerate(self.data_df[column_name]):

            lemmatized_sentence = []
            ## use spacy for lemmatization
            doc = nlp(question.strip())
            for word in doc:       
                lemmatized_sentence.append(word.lemma_)      
                ## update to the same column
                self.data_df.loc[idx, self.processed_column_name] = " ".join(lemmatized_sentence)

    def process(self, perform_stopword = True):
        self.remove_question_no()
        self.remove_symbols()
        if perform_stopword:
            self.remove_stopwords()
        self.apply_lemmatization(perform_stopword)    
        return self.data_df

In [11]:
## pre-process training question data

text_preprocessor = TextPreprocessor(QA_df.copy(), column_name="questions")
processed_QA_df = text_preprocessor.process(perform_stopword=True)
processed_QA_df.head(10)

Unnamed: 0,questions,answers,processed_questions
0,comment compléter mon inscription,pour compléter votre inscription et accéder au...,comment compléter mon inscription
1,mes documents ont été validés mais mon compte ...,une fois votre compte validé (pièce d’identité...,mon document avoir être valider mais mon compt...
2,j ai renseigné une date de naissance incorrect...,"afin que vous puissiez vous connecter, nous vo...",j avoir renseigner un date naissance incorrect...
3,je n ai pas reçu mon email d activation,il se peut qu'un logiciel de type anti-spam ai...,je ne avoir pas recevoir mon e-mail d activation
4,j ai oublié mon mot de passe,"pour vous connecter à winamax, vous devez indi...",j avoir oublier mon mot passer
5,j ai oublié le mot de passe de mon compte wina...,"dans ce cas, veuillez contacter le support en ...",j avoir oublier le mot passer mon compte winam...
6,je voudrais modifier mon mot de passe,connectez-vous sur le site avec votre email et...,je vouloir modifier mon mot passer
7,je souhaite modifier l adresse email enregistr...,pour changer votre adresse email d'identificat...,je souhaiter modifier l adresse e-mail enregis...
8,je souhaite changer mon pseudo,connectez-vous sur le site à l’aide de vos ide...,je souhaiter changer mon pseudo
9,je souhaite modifier mon avatar,"il vous suffit de cliquer le bouton ""mon compt...",je souhaiter modifier mon avatar


In [12]:
#from bert_embedding import BertEmbedding

In [13]:
class TF_IDF():
    def __init__(self):
        self.dictionary = None    
        self.model = None
        self.bow_corpus = None

    def create_tf_idf_model(self, data_df, column_name):
        ## create sentence token list
        sentence_token_list = [sentence.split(" ") for sentence in data_df[column_name]]

        ## dataset vocabulary
        self.dictionary = Dictionary(sentence_token_list) 

        ## bow representation of dataset
        self.bow_corpus = [self.dictionary.doc2bow(sentence_tokens) for sentence_tokens in sentence_token_list]

        ## compute TF-IDF score for corpus
        self.model = TfidfModel(self.bow_corpus)

        ## representation of question and respective TF-IDF value
        print(f"First 10 question representation of TF-IDF vector")
        for index, sentence in enumerate(data_df[column_name]):
            if index <= 100:
                print(f"{sentence} {self.model[self.bow_corpus[index]]}")
            else:
                break

    def get_vector_for_test_set(self, test_df, column_name):
        ## store tf-idf vector
        testset_tf_idf_vector = []
        sentence_token_list = [sentence.split(" ") for sentence in test_df[column_name]]
        test_bow_corpus = [self.dictionary.doc2bow(sentence_tokens) for sentence_tokens in sentence_token_list]   
        for test_sentence in test_bow_corpus:
            testset_tf_idf_vector.append(self.model[test_sentence])      

        return testset_tf_idf_vector

    def get_training_QA_vectors(self):
        QA_vectors = []
        for sentence_vector in self.bow_corpus:
            QA_vectors.append(self.model[sentence_vector])      
        return QA_vectors

    def get_train_vocabulary(self):
        vocab = []
        for index in self.dictionary:
            vocab.append(self.dictionary[index])
        return vocab

In [14]:
#!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

In [15]:
import torch

In [16]:
from transformers import BertTokenizer, BertModel

In [17]:
from transformers import AutoModel, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-french-europeana-cased")
model = AutoModel.from_pretrained("dbmdz/bert-base-french-europeana-cased")




Some weights of the model checkpoint at dbmdz/bert-base-french-europeana-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [18]:
def get_bert_embeddings(sentences):
    all_tokens = []
    all_embeddings = []
    
    for sentence in sentences:
        # Tokenize input text
        inputs = tokenizer(sentence, return_tensors='pt', truncation=True, padding=True)
        
        # Get BERT embeddings
        with torch.no_grad():
            outputs = model(**inputs)
        
        # Extract tokens and embeddings
        tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])  # Assuming batch size of 1
        embeddings = outputs.last_hidden_state[0].tolist()
        
        all_tokens.append(tokens)
        all_embeddings.append(embeddings)
    
    # Pair tokens with embeddings for each sentence
    bert_output = list(zip(all_tokens, all_embeddings))
    return bert_output



In [19]:
tf_idf = TF_IDF()
tf_idf.create_tf_idf_model(processed_QA_df, "processed_questions")
## get the tf-idf reprentation 
question_QA_vectors = tf_idf.get_training_QA_vectors()

First 10 question representation of TF-IDF vector
comment compléter mon inscription [(0, 0.21878517146541068), (1, 0.6863715158615069), (2, 0.6863715158615069), (3, 0.09960588925764022)]
mon document avoir être valider mais mon compte être bloquer [(3, 0.13580810698690746), (4, 0.22070726519916853), (5, 0.4679181971749496), (6, 0.1632339964987991), (7, 0.4679181971749496), (8, 0.38993183097912465), (9, 0.4679181971749496), (10, 0.31194546478329976)]
j avoir renseigner un date naissance incorrect et je ne pouvoir pas connecter [(4, 0.17824626524673423), (11, 0.31491438449082665), (12, 0.377897261388992), (13, 0.2519315075926613), (14, 0.377897261388992), (15, 0.2316555500223097), (16, 0.07511621812724871), (17, 0.377897261388992), (18, 0.14483292215222995), (19, 0.15210600952153247), (20, 0.31491438449082665), (21, 0.377897261388992), (22, 0.17824626524673423)]
je ne avoir pas recevoir mon e-mail d activation [(3, 0.08233441498928427), (4, 0.26760999718242456), (16, 0.11277572011711878)

In [20]:
## Get the document vocabulary list from TF-IDF
document_vocabulary = tf_idf.get_train_vocabulary()

In [21]:
question_QA_bert_embeddings_list = get_bert_embeddings(processed_QA_df["questions"].to_list())

In [22]:
from sklearn.metrics.pairwise import cosine_similarity

## Hybride TF

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Load data
df = processed_QA_df.copy()

import spacy
import string
import nltk
from nltk.corpus import stopwords

# Download French stopwords from NLTK
nltk.download('stopwords')
french_stopwords = set(stopwords.words('french'))

def preprocess(text):
    # Tokenization and lowercase
    doc = nlp(text.lower())
    
    # Remove punctuation and stopwords
    tokens = [
        token.lemma_ for token in doc
        if not token.is_punct
        and not token.is_space
        and token.text not in french_stopwords
    ]
    
    # Join tokens back into a single string
    processed_text = ' '.join(tokens)
    
    return processed_text

# Apply preprocessing to all text columns
df['processed_questions'] = df['questions'].apply(preprocess)
df['processed_answers'] = df['answers'].apply(preprocess)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\andy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [24]:
# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform your data
tfidf_matrix = vectorizer.fit_transform(df['processed_questions'].tolist() + df['processed_answers'].tolist())

# Split the matrix back into questions and answers
n = len(df)
tfidf_questions = tfidf_matrix[:n]
tfidf_answers = tfidf_matrix[n:]


In [25]:
def TF_Reco(input_question):
    processed_input_question = preprocess(input_question)
    tfidf_input_question = vectorizer.transform([processed_input_question])
    similarity_scores = cosine_similarity(tfidf_input_question, tfidf_questions)[0]
    top_5_indices = similarity_scores.argsort()[-5:][::-1]
    return top_5_indices.tolist()


## Test

In [26]:
## helps to retrieve similar question based of input vectors/embeddings for test query
def retrieveSimilarFAQ(train_question_vectors, test_question_vectors, train_QA_df, train_column_name, test_QA_df, test_column_name):
    for test_index, test_vector in enumerate(test_question_vectors):
        # Store top 5 similar questions - tuple of (similarity_score, train_index)
        top_5_similar = [(-1, -1) for _ in range(5)]
        
        for train_index, train_vector in enumerate(train_question_vectors):
            sim_score = cosine_similarity(train_vector, test_vector)[0][0]
            
            # Check if this score is higher than the smallest in top_5
            if sim_score > top_5_similar[0][0]:
                top_5_similar[0] = (sim_score, train_index)
                # Sort the top 5 by score to always have the smallest score at 0 index
                top_5_similar.sort(key=lambda x: x[0])
                
        top_5_similar.reverse()
        top_5_similar = [x[1] for x in top_5_similar]
        top_5_TF = TF_Reco(test_QA_df[test_column_name].iloc[test_index])        
        
        for element in top_5_TF:
            if element not in top_5_similar:
                top_5_similar.append(element)
        
        print("######")
        print(f"Query Question: {test_QA_df[test_column_name].iloc[test_index]}")    
        for sim_Q_index in top_5_similar:
            print(f"Retrieved Question: {train_QA_df[train_column_name].iloc[sim_Q_index]}")
        print("######")


In [27]:
test_query_string = ["Je voudrais annuler un pari que je viens de placer", 
                     "changer de mot de passe",
                "Comment s'inscrire aux tournois gratuits ?",
                "Je souhaite retirer mes gains ",
                     "freebet",
                    "annuler un pari"]

test_QA_df = pd.DataFrame(test_query_string, columns=["test_questions"])              
## pre-process testing QA data
text_preprocessor = TextPreprocessor(test_QA_df, column_name="test_questions")
query_QA_df = text_preprocessor.process(perform_stopword=True)

In [28]:
## TF-IDF vector represetation
query_QA_vectors = tf_idf.get_vector_for_test_set(query_QA_df, "processed_test_questions")
query_QA_df.head()

Unnamed: 0,test_questions,processed_test_questions
0,je voudrais annuler un pari que je viens de pl...,je vouloir annuler pari que je venir placer
1,changer de mot de passe,changer mot passer
2,comment s inscrire aux tournois gratuits,comment s inscrire à tournoi gratuit
3,je souhaite retirer mes gains,je souhaiter retirer mon gain
4,freebet,freebet


In [29]:
query_QA_bert_embeddings_list = get_bert_embeddings(test_QA_df["test_questions"].to_list())

In [30]:
## store QA bert embeddings in list
question_QA_bert_embeddings = []
for embeddings in question_QA_bert_embeddings_list:
    question_QA_bert_embeddings.append(embeddings[1])

## store query string bert embeddings in list
query_QA_bert_embeddings = []
for embeddings in query_QA_bert_embeddings_list:
    query_QA_bert_embeddings.append(embeddings[1])

In [31]:
retrieveSimilarFAQ(question_QA_bert_embeddings, query_QA_bert_embeddings, processed_QA_df, "questions", query_QA_df, "test_questions")

######
Query Question: je voudrais annuler un pari que je viens de placer
Retrieved Question: je voudrais annuler un pari que je viens de placer
Retrieved Question: je voudrais modifier mon mot de passe
Retrieved Question: je voudrais créditer mon compte
Retrieved Question: je voudrais créer un autre compte
Retrieved Question: je voudrais modifier mes informations personnelles
Retrieved Question: l un des paris de mon combiné a été annulé
Retrieved Question: l un des paris de mon mymatch a été annulé
######
######
Query Question: changer de mot de passe
Retrieved Question: je voudrais modifier mon mot de passe
Retrieved Question: j ai oublié mon mot de passe
Retrieved Question: comment compléter mon inscription  
Retrieved Question: comment rejoindre une partie de poker  
Retrieved Question: je souhaite modifier l adresse email enregistrée sur mon compte
Retrieved Question: je souhaite changer mon pseudo
Retrieved Question: j ai oublié le mot de passe de mon compte winamax et je n ai p