In [1]:
import numpy as np
import json
import re
import random
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import torch.nn.functional as F

import nltk
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
def tokenize(sentence):
    """
    split sentence into array of words/tokens
    a token can be a word or punctuation character, or number
    """
    return nltk.word_tokenize(sentence)

In [3]:
def stem(word):
    """
    stemming = find the root form of the word
    examples:
    words = ["organize", "organizes", "organizing"]
    words = [stem(w) for w in words]
    -> ["organ", "organ", "organ"]
    """
    return stemmer.stem(word.lower())

In [4]:
def bag_of_words(tokenized_sentence, words):
    """
    return bag of words array:
    1 for each known word that exists in the sentence, 0 otherwise
    example:
    sentence = ["hello", "how", "are", "you"]
    words = ["hi", "hello", "I", "you", "bye", "thank", "cool"]
    bog   = [  0 ,    1 ,    0 ,   1 ,    0 ,    0 ,      0]
    """
    # stem each word
    sentence_words = [stem(word) for word in tokenized_sentence]
    # initialize bag with 0 for each word
    bag = np.zeros(len(words), dtype=np.float32)
    for idx, w in enumerate(words):
        if w in sentence_words: 
            bag[idx] = 1

    return bag

In [5]:
class ChatDataset(Dataset):
    def __init__(self, intents, all_words, tags):
        self.intents = intents
        self.all_words = all_words
        self.tags = tags
        self.xy = self._prepare_data()

    def _prepare_data(self):
        xy = []
        for intent in self.intents:
            tag = intent['tag']
            for pattern in intent['patterns']:
                tokenized_sentence = tokenize(pattern)
                bag = bag_of_words(tokenized_sentence, self.all_words)
                label = self.tags.index(tag)
                xy.append((bag, label))
        return xy

    def __getitem__(self, index):
        bag, label = self.xy[index]
        return bag, label

    def __len__(self):
        return len(self.xy)

In [6]:
class NeuralNet(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(NeuralNet, self).__init__()
        self.l1 = nn.Linear(input_size, hidden_size) 
        self.l2 = nn.Linear(hidden_size, hidden_size) 
        self.l3 = nn.Linear(hidden_size, num_classes)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)  # Dropout avec taux de 0.5

        # Initialisation des poids avec la méthode de He
        torch.nn.init.kaiming_uniform_(self.l1.weight, nonlinearity='relu')
        torch.nn.init.kaiming_uniform_(self.l2.weight, nonlinearity='relu')
        torch.nn.init.kaiming_uniform_(self.l3.weight)

    def forward(self, x):
        out = self.l1(x)
        out = self.relu(out)
        out = self.dropout(out)  # Dropout après la première couche cachée
        out = self.l2(out)
        out = self.relu(out)
        out = self.dropout(out)  # Dropout après la deuxième couche cachée
        out = self.l3(out)
        return out

In [7]:
with open('intents.json', 'r') as f:
    intents = json.load(f)

all_words = []
tags = []
xy = []
for intent in intents['intents']:
    tag = intent['tag']
    tags.append(tag)
    for pattern in intent['patterns']:
        tokenized_sentence = tokenize(pattern)
        all_words.extend(tokenized_sentence)
        xy.append((tokenized_sentence, tag))

ignore_words = ['?', '.', '!']
all_words = [stem(w) for w in all_words if w not in ignore_words]
all_words = sorted(set(all_words))
tags = sorted(set(tags))

X_train = []
y_train = []
for (pattern_sentence, tag) in xy:
    bag = bag_of_words(pattern_sentence, all_words)
    X_train.append(bag)
    label = tags.index(tag)
    y_train.append(label)

X_train = np.array(X_train)
y_train = np.array(y_train)

batch_size = 16
learning_rate = 0.001
input_size = len(X_train[0])
hidden_size = 512
output_size = len(tags)

dataset = ChatDataset(intents['intents'], all_words, tags)
train_loader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True)

model = NeuralNet(input_size, hidden_size, output_size).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)


num_epochs = 2500
correct=0
total=0
for epoch in range(num_epochs):
    for (words, labels) in train_loader:
        words = words.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(words)
        loss = criterion(outputs, labels)
        
        loss.backward()
        optimizer.step()
        total += labels.size(0)
        correct += (outputs.argmax(dim=1) == labels).sum().item()

    if (epoch+1) % 100 == 0:
        accuracy =  100 * correct / total
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}, Accuracy: {accuracy}')

Epoch [100/2500], Loss: 0.0743, Accuracy: 92.48780487804878
Epoch [200/2500], Loss: 0.0019, Accuracy: 95.82926829268293
Epoch [300/2500], Loss: 0.0003, Accuracy: 96.92682926829268
Epoch [400/2500], Loss: 0.0003, Accuracy: 97.48373983739837
Epoch [500/2500], Loss: 0.0001, Accuracy: 97.82764227642276
Epoch [600/2500], Loss: 0.0000, Accuracy: 98.06233062330624
Epoch [700/2500], Loss: 0.0001, Accuracy: 98.20673635307782
Epoch [800/2500], Loss: 0.0000, Accuracy: 98.32825203252033
Epoch [900/2500], Loss: 0.0002, Accuracy: 98.42005420054201
Epoch [1000/2500], Loss: 0.0000, Accuracy: 98.4910569105691
Epoch [1100/2500], Loss: 0.0000, Accuracy: 98.54249815225425
Epoch [1200/2500], Loss: 0.0000, Accuracy: 98.59349593495935
Epoch [1300/2500], Loss: 0.0000, Accuracy: 98.63727329580988
Epoch [1400/2500], Loss: 0.0000, Accuracy: 98.66898954703832
Epoch [1500/2500], Loss: 0.0000, Accuracy: 98.69539295392954
Epoch [1600/2500], Loss: 0.0000, Accuracy: 98.72103658536585
Epoch [1700/2500], Loss: 0.0000, A

In [8]:
data = {
    "model_state": model.state_dict(),
    "input_size": input_size,
    "hidden_size": hidden_size,
    "output_size": output_size,
    "all_words": all_words,
    "tags": tags
    }

FILE = "chatbot_model_base.pth"
torch.save(data, FILE)

print(f'training complete. file saved to {FILE}')

training complete. file saved to chatbot_model_base.pth


## Deploying Q/A model

In [9]:
import json
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from sklearn.metrics.pairwise import cosine_similarity

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import TfidfVectorizer

In [10]:
# Charger le dataset JSON
with open('piaf-v1.1.json', 'r', encoding='utf-8') as file:
    dataset = json.load(file)

# Prétraitement des données
paragraphs = []
questions = []
answers_start = []
answers_end = []

for article in dataset['data']:
    for paragraph in article['paragraphs']:
        context = paragraph['context']
        for qa in paragraph['qas']:
            question = qa['question']
            for answer in qa['answers']:
                answer_start = answer['answer_start']
                answer_end = answer_start + len(answer['text'])

                paragraphs.append(context)
                questions.append(question)
                answers_start.append(answer_start)
                answers_end.append(answer_end)

In [11]:
# Chargement du modèle de question-réponse
model_qa = AutoModelForQuestionAnswering.from_pretrained("etalab-ia/camembert-base-squadFR-fquad-piaf")
tokenizer_qa = AutoTokenizer.from_pretrained("etalab-ia/camembert-base-squadFR-fquad-piaf")

  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(


In [12]:
# Fonction pour générer une réponse à partir d'une question
def generate_answer(question, context):
    # Prétraitement des données
    inputs = tokenizer_qa(question, context, add_special_tokens=True, return_tensors="pt")
    
    # Prédiction de la réponse
    outputs = model_qa(**inputs)
    start_scores = outputs.start_logits
    end_scores = outputs.end_logits
    
    # Obtenir les indices de début et de fin de la réponse prédite
    start_index = torch.argmax(start_scores)
    end_index = torch.argmax(end_scores)
    
    # Convertir les indices en texte
    answer = tokenizer_qa.convert_tokens_to_string(tokenizer_qa.convert_ids_to_tokens(inputs["input_ids"][0][start_index:end_index+1]))
    
    return answer

In [13]:
# Fonction pour trouver le contexte le plus pertinent pour une question donnée
def find_most_relevant_context(question, dataset):
    # Utiliser TF-IDF pour représenter les contextes en vecteurs
    contexts=[]
    for article in dataset['data']:
        for paragraph in article['paragraphs']:
            contexts += [paragraph['context']]
    
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform(contexts)
    
    # Convertir la question en vecteur
    question_vector = vectorizer.transform([question])
    
    # Calculer les similarités entre la question et les contextes
    similarities = cosine_similarity(question_vector, vectors)
    
    # Trouver l'index du contexte le plus similaire
    most_similar_index = similarities.argmax()
    
    # Renvoyer le contexte le plus similaire
    return contexts[most_similar_index]

## Test

In [14]:
# -------------- IMPORT LIBRARY -----------------
import random
import json

import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


# ------ IMPORT DATA & MODEL FOR BASE CASE ------
with open('intents.json', 'r') as json_data:
    intents = json.load(json_data)

FILE = "chatbot_model_base.pth"
data = torch.load(FILE)

input_size = data["input_size"]
hidden_size = data["hidden_size"]
output_size = data["output_size"]
all_words = data['all_words']
tags = data['tags']
model_state = data["model_state"]

model = NeuralNet(input_size, hidden_size, output_size).to(device)
model.load_state_dict(model_state)
model.eval()

NeuralNet(
  (l1): Linear(in_features=146, out_features=512, bias=True)
  (l2): Linear(in_features=512, out_features=512, bias=True)
  (l3): Linear(in_features=512, out_features=32, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.5, inplace=False)
)

In [15]:
def process_question(text):
    cleaned_text = text.lower().strip() 
    return cleaned_text


In [16]:
import random
import json
import torch
import pymongo
import re
from dateutil import parser
from datetime import datetime
from transformers import pipeline
import warnings

import warnings

# Suppress specific warnings
warnings.filterwarnings("ignore", message="No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision")
warnings.filterwarnings("ignore", message="`resume_download` is deprecated and will be removed in version 1.0.0")

from transformers import pipeline

# Fonction pour résumer un article
def summarize_article(content):
    summarizer = pipeline("summarization", min_length=30, max_length=90)
    max_tokens = 1024  # Longueur maximale du modèle à ajuster si nécessaire
    chunks = [content[i:i + max_tokens] for i in range(0, len(content), max_tokens)]
    summary = ""
    for chunk in chunks:
        summary += summarizer(chunk, max_length=150, min_length=30, do_sample=False)[0]['summary_text']
    return summary


# Fonction pour récupérer les articles depuis MongoDB
def fetch_articles():
    client = pymongo.MongoClient("mongodb://localhost:27017/")
    db = client["StageTT"]
    collection = db["TT"]
    articles = list(collection.find({}))
    
    for article in articles:
        if isinstance(article['Publish Date'], str):
            try:
                parsed_date = parser.parse(article['Publish Date'])
                article['Publish Date'] = parsed_date.replace(tzinfo=None)
            except ValueError:
                print(f"Warning: Failed to parse date for article {article['_id']}")
    
    return articles



# Chargement des intentions depuis le fichier JSON
with open('intents.json', 'r') as json_data:
    intents = json.load(json_data)

# Chargement du modèle de chatbot
FILE = "chatbot_model_base.pth"
data = torch.load(FILE)

input_size = data["input_size"]
hidden_size = data["hidden_size"]
output_size = data["output_size"]
all_words = data['all_words']
tags = data['tags']
model_state = data["model_state"]

# Nom du bot
bot_name = "QueryBot"

print("Début de la conversation ('quit' pour finir la conversation...)")
while True:
    sentence = input("User: ")
    if sentence == 'quit':
        break
    
    # Intent pour récupérer la dernière nouvelle sur Tunisie Télécom
    if "tunisie télécom" in sentence.lower():
        articles = fetch_articles()
        if articles:
            latest_article = max(articles, key=lambda x: x['Publish Date'])
            print(f"{bot_name}: La dernière nouvelle sur Tunisie Télécom est : {latest_article['Title']}")
        else:
            print(f"{bot_name}: Il n'y a pas d'articles disponibles sur Tunisie Télécom.")
    
    # Intent pour gérer les questions sur les articles positifs/négatifs par année
    elif any(word in sentence.lower() for word in ["combien d'articles positifs", "combien d'articles négatifs", "année avec le plus d'articles négatifs", "année avec le plus d'articles positifs"]):
        year_match = re.search(r'\b\d{4}\b', sentence)
        articles = fetch_articles()
        if articles:
            articles_by_year = {}
            for article in articles:
                year = article['Publish Date'].year
                sentiment = article['SentimentCamembert']
                if year not in articles_by_year:
                    articles_by_year[year] = {'positive': 0, 'negative': 0}
                if sentiment == 'positive':
                    articles_by_year[year]['positive'] += 1
                elif sentiment == 'negative':
                    articles_by_year[year]['negative'] += 1
            
            if "combien d'articles positifs" in sentence.lower():
                if year_match:
                    year = int(year_match.group(0))
                    positive_count = articles_by_year.get(year, {}).get('positive', 0)
                    print(f"{bot_name}: Il y a {positive_count} articles positifs en {year}.")
                else:
                    print(f"{bot_name}: Veuillez spécifier une année valide pour obtenir le nombre d'articles positifs.")
            elif "combien d'articles négatifs" in sentence.lower():
                if year_match:
                    year = int(year_match.group(0))
                    negative_count = articles_by_year.get(year, {}).get('negative', 0)
                    print(f"{bot_name}: Il y a {negative_count} articles négatifs en {year}.")
                else:
                    print(f"{bot_name}: Veuillez spécifier une année valide pour obtenir le nombre d'articles négatifs.")
            elif "année avec le plus d'articles négatifs" in sentence.lower():
                year = max(articles_by_year, key=lambda x: articles_by_year[x]['negative'])
                count = articles_by_year[year]['negative']
                print(f"{bot_name}: L'année avec le plus d'articles négatifs est {year} avec {count} articles.")
            elif "année avec le plus d'articles positifs" in sentence.lower():
                year = max(articles_by_year, key=lambda x: articles_by_year[x]['positive'])
                count = articles_by_year[year]['positive']
                print(f"{bot_name}: L'année avec le plus d'articles positifs est {year} avec {count} articles.")
        else:
            print(f"{bot_name}: Aucun article trouvé.")
    
    # Intent pour récupérer un article positif ou négatif pour une certaine année
    elif "donne moi un article positif" in sentence.lower() or "donne moi un article négatif" in sentence.lower():
        year_match = re.search(r'\b\d{4}\b', sentence)
        sentiment_filter = 'positive' if "positif" in sentence.lower() else 'negative'
        
        articles = fetch_articles()
        if articles:
            filtered_articles = [article for article in articles if article['SentimentCamembert'] == sentiment_filter]
            
            if year_match:
                year = int(year_match.group(0))
                filtered_articles = [article for article in filtered_articles if article['Publish Date'].year == year]
            
            if filtered_articles:
                selected_article = random.choice(filtered_articles)
                summary = summarize_article(selected_article['Content'])
                print(f"{bot_name}: Voici un article {sentiment_filter} pour l'année {selected_article['Publish Date'].year}: {selected_article['Title']}\nRésumé: {summary}")
            else:
                print(f"{bot_name}: Aucun article {sentiment_filter} trouvé pour cette année.")
        else:
            print(f"{bot_name}: Aucun article trouvé dans la base de données.")
    
    # Intent pour résumer tous les articles positifs/négatifs d'une certaine année
    elif "résume moi tous les articles positifs" in sentence.lower() or "résume moi tous les articles négatifs" in sentence.lower() or "résume moi tous les articles de" in sentence.lower():
        year_match = re.search(r'\b\d{4}\b', sentence)
        sentiment_filter = None
        if "positifs" in sentence.lower():
            sentiment_filter = 'positive'
        elif "négatifs" in sentence.lower():
            sentiment_filter = 'negative'
        
        articles = fetch_articles()
        if articles:
            if year_match:
                year = int(year_match.group(0))
                filtered_articles = [article for article in articles if article['Publish Date'].year == year]
                if sentiment_filter:
                    filtered_articles = [article for article in filtered_articles if article['SentimentCamembert'] == sentiment_filter]
                
                if filtered_articles:
                    for article in filtered_articles:
                        summary = summarize_article(article['Content'])
                        print(f"{bot_name}: {article['Title']}\nRésumé: {summary}\n")
                else:
                    print(f"{bot_name}: Aucun article {sentiment_filter} trouvé pour l'année {year}.")
            else:
                filtered_articles = [article for article in articles if sentiment_filter is None or article['SentimentCamembert'] == sentiment_filter]
                if filtered_articles:
                    for article in filtered_articles:
                        summary = summarize_article(article['Content'])
                        print(f"{bot_name}: {article['Title']}\nRésumé: {summary}\n")
                else:
                    print(f"{bot_name}: Aucun article {sentiment_filter} trouvé.")
        else:
            print(f"{bot_name}: Aucun article trouvé dans la base de données.")
    
    # Si aucune intention spécifique n'est détectée, utiliser le modèle de classification de questions
    else:
        sentence_qa = sentence
        # Utiliser des fonctions appropriées pour tokenizer et transformer la phrase en vecteurs
        sentence = tokenize(sentence)
        X = bag_of_words(sentence, all_words)
        X = X.reshape(1, X.shape[0])
        X = torch.from_numpy(X).to(device)

        output = model(X)
        _, predicted = torch.max(output, dim=1)

        tag = tags[predicted.item()]

        probs = torch.softmax(output, dim=1)
        prob = probs[0][predicted.item()]

        if prob.item() > 0.75:
            for intent in intents['intents']:
                if tag == intent['tag']:
                    response = random.choice(intent['responses'])
                    print(f"{bot_name}: {response}")
        else:
            print(f"{bot_name}: Je ne suis pas sûr de comprendre votre demande. Pouvez-vous reformuler?")

Début de la conversation ('quit' pour finir la conversation...)
User: bonjour
QueryBot: Bonjour
User: ça va?
QueryBot: Tout va bien.. Et vous?
User: bien
QueryBot: Content de le savoir!
User: blague 
QueryBot: Comment appelle-t-on un bonhomme de neige avec un bronzage? Une flaque.
User: devinette
QueryBot: Pourquoi un velo ne peut-il pas tenir tout seul ?.....Il est trop fatigue.
User: combien d'articles positifs en 2022
QueryBot: Il y a 18 articles positifs en 2022.
User: combien d'articles négatifs en 2022
QueryBot: Il y a 3 articles négatifs en 2022.
User: tt
QueryBot: Tunisie Telecom est l'operateur historique de telecommunications en Tunisie. Fondee en 1996, elle fournit une gamme de services de telephonie fixe et mobile, d'acces a Internet et d'autres services de telecommunication a travers le pays. Tunisie Telecom joue un role crucial dans l'infrastructure de communication nationale et dans le developpement des technologies de l'information et de la communication en Tunisie.
Use

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


QueryBot: Voici un article positive pour l'année 2024: Tunisie Télécom et l’AtGmo : “Run of Heroes” pour soutenir les malades d’hémopathies
Résumé:  Tunisie Télécom, partenaire of l’association tunisienne des greffés de la moelle osseuse . L’AtGmo peut assurer l'hébergement de 126 malades et accompagnateurs .
User: un article négatif de 2019
QueryBot: ...
User: donne moi un article négatif de 2019


No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.
Your max_length is set to 150, but your input_length is only 95. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=47)


QueryBot: Voici un article negative pour l'année 2019: Frais élevés pour des services de qualité inférieure chez Tunisie Télécom
Résumé:  Tunisie Télécom facturés facturé des services de qualité inférieure suscitent des critiques parmi les abonnesnes .


KeyboardInterrupt: Interrupted by user

In [None]:
#import random
#import json
#import torch

#device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Charger les données et le modèle pré-entraîné
#with open('intents.json', 'r') as json_data:
#   intents = json.load(json_data)

#FILE = "chatbot_model_base.pth"
#data = torch.load(FILE)

#input_size = data["input_size"]
#hidden_size = data["hidden_size"]
#output_size = data["output_size"]
#all_words = data['all_words']
#tags = data['tags']
#model_state = data["model_state"]

# Charger le modèle
#model = NeuralNet(input_size, hidden_size, output_size).to(device)
#model.load_state_dict(model_state)
#model.eval()

# Nom du chatbot
#bot_name = "QueryBot"

#print("Début de la conversation ('quit' pour finir la conversation...)")
#while True:
    # Saisie utilisateur
    #sentence = input("User: ")
    #if sentence == 'quit':
     #   break
    
    # Prédiction de l'intention
   # sentence_qa = sentence
   # sentence = tokenize(sentence)
   # X = bag_of_words(sentence, all_words)
    #X = X.reshape(1, X.shape[0])
    #X = torch.from_numpy(X).to(device)

    #output = model(X)
    #_, predicted = torch.max(output, dim=1)

    #tag = tags[predicted.item()]

    #probs = torch.softmax(output, dim=1)
    #prob = probs[0][predicted.item()]
    
    # Répondre en fonction de l'intention prédite ou en utilisant la question-réponse
    #if prob.item() > 0.75:  # ajuster le seuil de probabilité selon la précision souhaitée
     #   for intent in intents['intents']:
      #      if tag == intent["tag"]:
       #         print(f"{bot_name}: {random.choice(intent['responses'])}")
        #        break  # sortir après avoir trouvé une réponse appropriée
   # else:
    #    response = process_question(sentence_qa)
     #   if response:
      #      print(f"{bot_name}: {response}")
       # else:
        #    context = find_most_relevant_context(sentence_qa, dataset)
         #   answer = generate_answer(sentence_qa, context)
          #  print(f"{bot_name}: {answer}")


In [None]:
#User: donne moi un article positif de 2023
#QueryBot: Voici un article positif pour l'année 2023: Title Article Positif 2023
#User: combien d'articles positifs en 2022
#QueryBot: Il y a 3 articles positifs en 2022.
#User: année avec le plus d'articles négatifs
#QueryBot: L'année avec le plus d'articles négatifs est 2021 avec 5 articles.
#User: tunisie télécom
#QueryBot: La dernière nouvelle sur Tunisie Télécom est : Title Dernière Nouvelle
#User: donne moi un article négatif
#QueryBot: Voici un article négatif pour l'année 2023: Title Article Négatif 2023
#User: raconte-moi une blague
#QueryBot: Désolé, je n'ai pas compris votre question. Pouvez-vous reformuler ?
#User: quit



In [None]:
#import random
#import json
#import torch
#import pymongo
#import re
#from dateutil import parser
#from datetime import datetime

#device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


#def fetch_articles():
    #client = pymongo.MongoClient("mongodb://localhost:27017/")
    #db = client["StageTT"]
    #collection = db["TT"]
    #articles = list(collection.find({}))
    
  
    #for article in articles:
        #if isinstance(article['Publish Date'], str):
            #try:
               
                #parsed_date = parser.parse(article['Publish Date'])
               
                #article['Publish Date'] = parsed_date.replace(tzinfo=None)
            #except ValueError:
                #print(f"Warning: Failed to parse date for article {article['_id']}")
    
    #return articles


#with open('intents.json', 'r') as json_data:
    #intents = json.load(json_data)

#FILE = "chatbot_model_base.pth"
#data = torch.load(FILE)

#input_size = data["input_size"]
#hidden_size = data["hidden_size"]
#output_size = data["output_size"]
#all_words = data['all_words']
#tags = data['tags']
#model_state = data["model_state"]


#model = NeuralNet(input_size, hidden_size, output_size).to(device)
#model.load_state_dict(model_state)
#model.eval()


#bot_name = "QueryBot"

#print("Début de la conversation ('quit' pour finir la conversation...)")
#while True:
  
    #sentence = input("User: ")
    #if sentence == 'quit':
        #break
    
  
    #if "tunisie télécom" in sentence.lower():
        #articles = fetch_articles()
        #if articles:
            #latest_article = max(articles, key=lambda x: x['Publish Date'])
            #print(f"{bot_name}: La dernière nouvelle sur Tunisie Télécom est : {latest_article['Title']}")
        #else:
            #print(f"{bot_name}: Il n'y a pas d'articles disponibles sur Tunisie Télécom.")
    

    #elif any(word in sentence.lower() for word in ["combien d'articles positifs", "combien d'articles négatifs", "année avec le plus d'articles négatifs", "année avec le plus d'articles positifs"]):
        #year_match = re.search(r'\b\d{4}\b', sentence)
        #articles = fetch_articles()
        #if articles:
            #articles_by_year = {}
            #for article in articles:
                #year = article['Publish Date'].year
                #sentiment = article['SentimentCamembert']
                #if year not in articles_by_year:
                    #articles_by_year[year] = {'positive': 0, 'negative': 0}
                #if sentiment == 'positive':
                    #articles_by_year[year]['positive'] += 1
                #elif sentiment == 'negative':
                    #articles_by_year[year]['negative'] += 1
            
            #if "combien d'articles positifs" in sentence.lower():
                #if year_match:
                    #year = int(year_match.group(0))
                    #positive_count = articles_by_year.get(year, {}).get('positive', 0)
                    #print(f"{bot_name}: Il y a {positive_count} articles positifs en {year}.")
                #else:
                    #print(f"{bot_name}: Veuillez spécifier une année valide pour obtenir le nombre d'articles positifs.")
            #elif "combien d'articles négatifs" in sentence.lower():
                #if year_match:
                    #year = int(year_match.group(0))
                    #negative_count = articles_by_year.get(year, {}).get('negative', 0)
                    #print(f"{bot_name}: Il y a {negative_count} articles négatifs en {year}.")
                #else:
                    #print(f"{bot_name}: Veuillez spécifier une année valide pour obtenir le nombre d'articles négatifs.")
            #elif "année avec le plus d'articles négatifs" in sentence.lower():
                #year = max(articles_by_year, key=lambda x: articles_by_year[x]['negative'])
                #count = articles_by_year[year]['negative']
                #print(f"{bot_name}: L'année avec le plus d'articles négatifs est {year} avec {count} articles.")
            #elif "année avec le plus d'articles positifs" in sentence.lower():
                #year = max(articles_by_year, key=lambda x: articles_by_year[x]['positive'])
                #count = articles_by_year[year]['positive']
                #print(f"{bot_name}: L'année avec le plus d'articles positifs est {year} avec {count} articles.")
        #else:
            #print(f"{bot_name}: Aucun article trouvé.")
    
    
    #else:
        #sentence_qa = sentence
        #sentence = tokenize(sentence)
        #X = bag_of_words(sentence, all_words)
        #X = X.reshape(1, X.shape[0])
        #X = torch.from_numpy(X).to(device)

        #output = model(X)
        #_, predicted = torch.max(output, dim=1)

        #tag = tags[predicted.item()]

        #probs = torch.softmax(output, dim=1)
        #prob = probs[0][predicted.item()]

        
        #if prob.item() > 0.75:  
            #for intent in intents['intents']:
                #if tag == intent["tag"]:
                    #print(f"{bot_name}: {random.choice(intent['responses'])}")
                    #break
        #else:
            #response = process_question(sentence_qa)
            #if response:
                #print(f"{bot_name}: {response}")
            #else:
                #context = find_most_relevant_context(sentence_qa, dataset)
                #answer = generate_answer(sentence_qa, context)
                #print(f"{bot_name}: {answer}")

## Summary Model for the chatbot

In [94]:
from datasets import load_dataset

dataset_orangesum = load_dataset("GEM/OrangeSum", "abstract") # we can also specify "title" to obtain pairs of text-title
#dataset_xlsum = load_dataset("csebuetnlp/xlsum", "french")
#dataset_mlsum = load_dataset("mlsum", "fr")

In [95]:
import pandas as pd
import numpy as np
import tensorflow as tf
import re
from nltk.corpus import stopwords
import time
from tensorflow.python.layers.core import Dense
from tensorflow.python.ops.rnn_cell_impl import _zero_state_tensors
print('TensorFlow Version: {}'.format(tf.__version__))

TensorFlow Version: 2.17.0


In [96]:
def clean_text(text, remove_stopwords = True):
    '''Remove unwanted characters, stopwords, and format the text to create fewer nulls word embeddings'''
    
    # Convert words to lower case
    text = text.lower()
    

    
    # Format words and remove unwanted characters
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'&amp;', '', text) 
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'\'', ' ', text)
    
    # Optionally, remove stop words
    if remove_stopwords:
        text = text.split()
        stops = set(stopwords.words("french"))
        text = [w for w in text if not w in stops]
        text = " ".join(text)

    return text

In [97]:
dataset_orangesum.shape

{'train': (21401, 4), 'test': (1500, 4), 'validation': (1500, 4)}

In [98]:
df_test_OS = pd.DataFrame(dataset_orangesum['test'])
df_train_OS = pd.DataFrame(dataset_orangesum['train'])
df_validation_OS = pd.DataFrame(dataset_orangesum['validation'])

df_train_OS.isnull().sum()

gem_id        0
input         0
target        0
references    0
dtype: int64

In [99]:
df_train_OS.head()

Unnamed: 0,gem_id,input,target,references
0,OrangeSum_abstract-train-0,Thierry Mariani sur la liste du Rassemblement ...,L'information n'a pas été confirmée par l'inté...,[L'information n'a pas été confirmée par l'int...
1,OrangeSum_abstract-train-1,C'est désormais officiel : Alain Juppé n'est p...,Le maire de Bordeaux ne fait plus partie des R...,[Le maire de Bordeaux ne fait plus partie des ...
2,OrangeSum_abstract-train-2,La mesure est décriée par les avocats et les m...,"En 2020, les tribunaux d'instance fusionnent a...","[En 2020, les tribunaux d'instance fusionnent ..."
3,OrangeSum_abstract-train-3,Dans une interview accordée au Figaro mercredi...,"Les médecins jugés ""gros prescripteurs d'arrêt...","[Les médecins jugés ""gros prescripteurs d'arrê..."
4,OrangeSum_abstract-train-4,Le préjudice est estimé à 2 millions d'euros. ...,Il aura fallu mobiliser 90 gendarmes pour cett...,[Il aura fallu mobiliser 90 gendarmes pour cet...


In [100]:
import nltk
nltk.download('stopwords')
  
# Clean the summaries and texts
clean_target = []
for target in df_train_OS.target:
    clean_target.append(clean_text(target, remove_stopwords=False))
print("Summaries are complete.")

clean_input = []
for input in df_train_OS.input:
    clean_input.append(clean_text(input))
print("Texts are complete.")

[nltk_data] Downloading package stopwords to C:\Users\Mega-
[nltk_data]     Pc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Summaries are complete.
Texts are complete.


In [101]:
# Inspect the cleaned summaries and texts to ensure they have been cleaned well
for i in range(3):
    print("Clean News #",i+1)
    print(clean_target[i])
    print(clean_input[i])
    print()

Clean News # 1
l information n a pas été confirmée par l intéressé qui déclare toutefois étudier la question 

thierry mariani liste rassemblement national rn ex fn européennes affirme mardi 11 septembre chez pol nouvelle newsletter politique libération ancien député républicain ministre nicolas sarkozy point rejoindre troupes marine pen élections européennes 2019 ça va faire plus question calendrier obligé annoncer tout suite huit mois européennes ainsi assuré membre influent rn contacté franceinfo mariani a confirmé information élections juin sais numéro 1 liste a répondu ancien ministre transports reconnaît toutefois toujours cité franceinfo nom liste rn fait partie possibilités fréjus ville sympathique prévu rendre week end a ailleurs commenté twitter alors marine pen réunit cadres parti week end cité varoise proximité connue fnla proximité thierry mariani parti frontiste nouvelle sans alliés allons rester opposition longtemps temps renverser table front national a évolué regardons

In [102]:
def count_words(count_dict, text):
    '''Count the number of occurrences of each word in a set of text'''
    for sentence in text:
        for word in sentence.split():
            if word not in count_dict:
                count_dict[word] = 1
            else:
                count_dict[word] += 1

In [103]:
# Find the number of times each word was used and the size of the vocabulary
word_counts = {}

count_words(word_counts, clean_target)
count_words(word_counts, clean_input)
            
print("Size of Vocabulary:", len(word_counts))

Size of Vocabulary: 111529


In [104]:
from torchtext.vocab import FastText
from torchtext.vocab import Vocab


In [105]:
# Load of pretrained embeddings vectors :
pretrained_vectors_fasttext = FastText(language='fr')

In [106]:
print(f'The pre-trained vocabulary contains {pretrained_vectors_fasttext.vectors.shape[0]} words, with embeddings in vectors of size {pretrained_vectors_fasttext.vectors.shape[1]}')

The pre-trained vocabulary contains 1152449 words, with embeddings in vectors of size 300


In [107]:
pretrained_vectors_words = pretrained_vectors_fasttext.stoi.keys()
pretrained_vectors_values = pretrained_vectors_fasttext.stoi.values()

In [108]:
pretrained_vectors_fasttext['maison']

tensor([ 0.1663, -0.2434, -0.1298,  0.2558,  0.2620,  0.4340,  0.2389,  0.2846,
        -0.0285,  0.2952, -0.1806, -0.0203,  0.2095,  0.2392,  0.4044,  0.2178,
         0.3261,  0.1015,  0.1417, -0.1413, -0.1626, -0.6919, -0.1303,  0.5766,
         0.2136, -0.0434, -0.4864,  0.2376, -0.3875,  0.0248,  0.5002,  0.4109,
        -0.2349,  0.2109, -0.1231, -0.1220, -0.2864, -0.2508, -0.2469,  0.0470,
         0.2941, -0.2932, -0.0470, -0.0928,  0.0722, -0.0158,  0.2090,  0.1393,
         0.3059,  0.3177, -0.1812, -0.0239, -0.1266,  0.0802, -0.1903, -0.2608,
        -0.3757, -0.0703,  0.3611,  0.2268, -0.1355,  0.2499, -0.0559, -0.1626,
         0.1937,  0.3333, -0.0398, -0.0106, -0.2556, -0.2036,  0.3537,  0.0297,
         0.0255, -0.1837,  0.1164, -0.3757,  0.2895, -0.2726,  0.0061, -0.2071,
        -0.2901, -0.0297, -0.0647,  0.1851, -0.0209, -0.0855,  0.0574,  0.3292,
        -0.3409, -0.4960, -0.1257, -0.3342,  0.0513, -0.0179,  0.0588,  0.0645,
        -0.2976,  0.0638,  0.2410, -0.25

In [109]:

import numpy as np

import tensorflow as tf
import timeit

embeddings_index = {}

#word_dict = []
#with open(data_path, 'r', encoding='utf-8') as f: 
 #   for line in f:
        #values = line.split(' ')
        #line = re.split(r'fr/',line)
        #values = re.split(" ", line[1])
word = pretrained_vectors_fasttext.stoi.keys()#[12:]
        #values = pretrained_vectors_fasttext.stoi.values()
        #word_dict.append(word) 
for word in word_counts:
    embedding = pretrained_vectors_fasttext[word]
            #embedding = np.asarray(values[1:])#, dtype='float32'
    embeddings_index[word] = embedding
        #print(word)
print('Word embeddings:', len(embeddings_index))

Word embeddings: 111529


In [110]:
# Find the number of words that are missing from CN, and are used more than our threshold.
missing_words = 0
threshold = 20

for word, count in word_counts.items():
    if count > threshold:
        if word not in embeddings_index:
            missing_words += 1
            
missing_ratio = round(missing_words/len(word_counts),4)*100
            
print("Number of words missing from fastText:", missing_words)
print("Percent of words that are missing from vocabulary: {}%".format(missing_ratio))

Number of words missing from fastText: 0
Percent of words that are missing from vocabulary: 0.0%


In [111]:
# Limit the vocab that we will use to words that appear ≥ threshold or are in CN

#dictionary to convert words to integers
vocab_to_int = {} 

value = 0
for word, count in word_counts.items():
    if count >= threshold or word in embeddings_index:
        vocab_to_int[word] = value
        value += 1

# Special tokens that will be added to our vocab
codes = ["<UNK>","<PAD>","<EOS>","<GO>"]   

# Add codes to vocab
for code in codes:
    vocab_to_int[code] = len(vocab_to_int)

# Dictionary to convert integers to words
int_to_vocab = {}
for word, value in vocab_to_int.items():
    int_to_vocab[value] = word

usage_ratio = round(len(vocab_to_int) / len(word_counts),4)*100

print("Total number of unique words:", len(word_counts))
print("Number of words we will use:", len(vocab_to_int))
print("Percent of words we will use: {}%".format(usage_ratio))

Total number of unique words: 111529
Number of words we will use: 111533
Percent of words we will use: 100.0%


In [112]:
# Need to use 300 for embedding dimensions to match CN's vectors.
embedding_dim = 300
nb_words = len(vocab_to_int)

# Create matrix with default values of zero
word_embedding_matrix = np.zeros((nb_words, embedding_dim), dtype=np.float32)
for word, i in vocab_to_int.items():
    if word in embeddings_index:
        word_embedding_matrix[i] = embeddings_index[word]
    else:
        # If word not in CN, create a random embedding for it
        new_embedding = np.array(np.random.uniform(-1.0, 1.0, embedding_dim))
        embeddings_index[word] = new_embedding
        word_embedding_matrix[i] = new_embedding

# Check if value matches len(vocab_to_int)
print(len(word_embedding_matrix))

111533


In [113]:
def convert_to_ints(text, word_count, unk_count, eos=False):
    '''Convert words in text to an integer.
       If word is not in vocab_to_int, use UNK's integer.
       Total the number of words and UNKs.
       Add EOS token to the end of texts'''
    ints = []
    for sentence in text:
        sentence_ints = []
        for word in sentence.split():
            word_count += 1
            if word in vocab_to_int:
                sentence_ints.append(vocab_to_int[word])
            else:
                sentence_ints.append(vocab_to_int["<UNK>"])
                unk_count += 1
        if eos:
            sentence_ints.append(vocab_to_int["<EOS>"])
        ints.append(sentence_ints)
    return ints, word_count, unk_count

In [114]:
# Apply convert_to_ints to clean_target and clean_input
word_count = 0
unk_count = 0

int_target, word_count, unk_count = convert_to_ints(clean_target, word_count, unk_count)
int_input, word_count, unk_count = convert_to_ints(clean_input, word_count, unk_count, eos=True)

unk_percent = round(unk_count/word_count,4)*100

print("Total number of words in headlines:", word_count)
print("Total number of UNKs in headlines:", unk_count)
print("Percent of words that are UNK: {}%".format(unk_percent))

Total number of words in headlines: 5339165
Total number of UNKs in headlines: 0
Percent of words that are UNK: 0.0%


In [115]:
def create_lengths(text):
    '''Create a data frame of the sentence lengths from a text'''
    lengths = []
    for sentence in text:
        lengths.append(len(sentence))
    return pd.DataFrame(lengths, columns=['counts'])

In [116]:
lengths_target = create_lengths(int_target)
lengths_input = create_lengths(int_input)

print("Summaries:")
print(lengths_target.describe())
print()
print("Texts:")
print(lengths_input.describe())

Summaries:
             counts
count  21401.000000
mean      34.393112
std       12.316238
min        3.000000
25%       26.000000
50%       34.000000
75%       42.000000
max      164.000000

Texts:
             counts
count  21401.000000
mean     216.088921
std      106.650884
min       11.000000
25%      142.000000
50%      192.000000
75%      260.000000
max     1884.000000


In [117]:
# Inspect the length of "input"
print(np.percentile(lengths_input.counts, 90))
print(np.percentile(lengths_input.counts, 95))
print(np.percentile(lengths_input.counts, 99))

377.0
417.0
525.0


In [118]:
# Inspect the length of "target"
print(np.percentile(lengths_target.counts, 90))
print(np.percentile(lengths_target.counts, 95))
print(np.percentile(lengths_target.counts, 99))

49.0
54.0
70.0


In [119]:
def unk_counter(sentence):
    '''Counts the number of time UNK appears in a sentence.'''
    unk_count = 0
    for word in sentence:
        if word == vocab_to_int["<UNK>"]:
            unk_count += 1
    return unk_count

In [120]:
sorted_target = []
sorted_input = []
max_input_length = 377
max_target_length = 70
min_length = 2
unk_input_limit = 1
unk_target_limit = 0

for length in range(min(lengths_input.counts), max_input_length): 
    for count, words in enumerate(int_target):
        if (len(int_target[count]) >= min_length and
            len(int_target[count]) <= max_target_length and
            len(int_input[count]) >= min_length and
            unk_counter(int_target[count]) <= unk_target_limit and
            unk_counter(int_input[count]) <= unk_input_limit and
            length == len(int_input[count])
           ):
            sorted_target.append(int_target[count])
            sorted_input.append(int_input[count])
        
# Compare lengths to ensure they match
print(len(sorted_target))
print(len(sorted_input))

19058
19058


## Defining the model

In [None]:
import pymongo
from transformers import pipeline


def summarize_article(content):
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn", tokenizer="facebook/bart-large-cnn")
    max_tokens = 1024  # Ajustez selon la longueur maximale du modèle
    chunks = [content[i:i + max_tokens] for i in range(0, len(content), max_tokens)]
    summary = ""
    for chunk in chunks:
        summary += summarizer(chunk, max_length=150, min_length=30, do_sample=False)[0]['summary_text']
    return summary


def summarize_and_store_articles():
    client = pymongo.MongoClient("mongodb://localhost:27017/")
    db = client["StageTT"]
    collection = db["TT"]

    articles = list(collection.find({}))
    for article in articles:
        content = article.get('Content')
        if content:
            summary = summarize_article(content)
            collection.update_one(
                {"_id": article["_id"]},
                {"$set": {"Resume": summary}},
                upsert=True
            )
    print("Résumé des articles stockés dans la collection 'TT'.")


summarize_and_store_articles()


Your max_length is set to 150, but your input_length is only 75. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=37)
Your max_length is set to 150, but your input_length is only 54. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=27)
Your max_length is set to 150, but your input_length is only 92. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=46)
Your max_length is set to 150, but your input_length is only 139. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=69)
You