In [1]:
import numpy as np
import json
import re
import random
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import torch.nn.functional as F

import nltk
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
def tokenize(sentence):
    """
    split sentence into array of words/tokens
    a token can be a word or punctuation character, or number
    """
    return nltk.word_tokenize(sentence)

In [3]:
def stem(word):
    """
    stemming = find the root form of the word
    examples:
    words = ["organize", "organizes", "organizing"]
    words = [stem(w) for w in words]
    -> ["organ", "organ", "organ"]
    """
    return stemmer.stem(word.lower())

In [4]:
def bag_of_words(tokenized_sentence, words):
    """
    return bag of words array:
    1 for each known word that exists in the sentence, 0 otherwise
    example:
    sentence = ["hello", "how", "are", "you"]
    words = ["hi", "hello", "I", "you", "bye", "thank", "cool"]
    bog   = [  0 ,    1 ,    0 ,   1 ,    0 ,    0 ,      0]
    """
    # stem each word
    sentence_words = [stem(word) for word in tokenized_sentence]
    # initialize bag with 0 for each word
    bag = np.zeros(len(words), dtype=np.float32)
    for idx, w in enumerate(words):
        if w in sentence_words: 
            bag[idx] = 1

    return bag

In [5]:
class ChatDataset(Dataset):
    def __init__(self, intents, all_words, tags):
        self.intents = intents
        self.all_words = all_words
        self.tags = tags
        self.xy = self._prepare_data()

    def _prepare_data(self):
        xy = []
        for intent in self.intents:
            tag = intent['tag']
            for pattern in intent['patterns']:
                tokenized_sentence = tokenize(pattern)
                bag = bag_of_words(tokenized_sentence, self.all_words)
                label = self.tags.index(tag)
                xy.append((bag, label))
        return xy

    def __getitem__(self, index):
        bag, label = self.xy[index]
        return bag, label

    def __len__(self):
        return len(self.xy)

In [6]:
class NeuralNet(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(NeuralNet, self).__init__()
        self.l1 = nn.Linear(input_size, hidden_size) 
        self.l2 = nn.Linear(hidden_size, hidden_size) 
        self.l3 = nn.Linear(hidden_size, num_classes)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)  # Dropout avec taux de 0.5

        # Initialisation des poids avec la méthode de He
        torch.nn.init.kaiming_uniform_(self.l1.weight, nonlinearity='relu')
        torch.nn.init.kaiming_uniform_(self.l2.weight, nonlinearity='relu')
        torch.nn.init.kaiming_uniform_(self.l3.weight)

    def forward(self, x):
        out = self.l1(x)
        out = self.relu(out)
        out = self.dropout(out)  # Dropout après la première couche cachée
        out = self.l2(out)
        out = self.relu(out)
        out = self.dropout(out)  # Dropout après la deuxième couche cachée
        out = self.l3(out)
        return out

In [7]:
with open('intents.json', 'r') as f:
    intents = json.load(f)

all_words = []
tags = []
xy = []
for intent in intents['intents']:
    tag = intent['tag']
    tags.append(tag)
    for pattern in intent['patterns']:
        tokenized_sentence = tokenize(pattern)
        all_words.extend(tokenized_sentence)
        xy.append((tokenized_sentence, tag))

ignore_words = ['?', '.', '!']
all_words = [stem(w) for w in all_words if w not in ignore_words]
all_words = sorted(set(all_words))
tags = sorted(set(tags))

X_train = []
y_train = []
for (pattern_sentence, tag) in xy:
    bag = bag_of_words(pattern_sentence, all_words)
    X_train.append(bag)
    label = tags.index(tag)
    y_train.append(label)

X_train = np.array(X_train)
y_train = np.array(y_train)

batch_size = 16
learning_rate = 0.001
input_size = len(X_train[0])
hidden_size = 512
output_size = len(tags)

dataset = ChatDataset(intents['intents'], all_words, tags)
train_loader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True)

model = NeuralNet(input_size, hidden_size, output_size).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)


num_epochs = 2500
correct=0
total=0
for epoch in range(num_epochs):
    for (words, labels) in train_loader:
        words = words.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(words)
        loss = criterion(outputs, labels)
        
        loss.backward()
        optimizer.step()
        total += labels.size(0)
        correct += (outputs.argmax(dim=1) == labels).sum().item()

    if (epoch+1) % 100 == 0:
        accuracy =  100 * correct / total
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}, Accuracy: {accuracy}')

Epoch [100/2500], Loss: 0.0073, Accuracy: 92.46341463414635
Epoch [200/2500], Loss: 0.0031, Accuracy: 95.79674796747967
Epoch [300/2500], Loss: 0.0004, Accuracy: 96.92140921409214
Epoch [400/2500], Loss: 0.0001, Accuracy: 97.44918699186992
Epoch [500/2500], Loss: 0.0713, Accuracy: 97.80813008130082
Epoch [600/2500], Loss: 0.0020, Accuracy: 98.02439024390245
Epoch [700/2500], Loss: 0.0000, Accuracy: 98.18350754936121
Epoch [800/2500], Loss: 0.0000, Accuracy: 98.3180894308943
Epoch [900/2500], Loss: 0.0000, Accuracy: 98.41824751580849
Epoch [1000/2500], Loss: 0.0000, Accuracy: 98.49512195121952
Epoch [1100/2500], Loss: 0.0000, Accuracy: 98.54988913525499
Epoch [1200/2500], Loss: 0.0000, Accuracy: 98.6050135501355
Epoch [1300/2500], Loss: 0.0000, Accuracy: 98.64602876797998
Epoch [1400/2500], Loss: 0.0000, Accuracy: 98.68524970963995
Epoch [1500/2500], Loss: 0.0395, Accuracy: 98.71978319783197
Epoch [1600/2500], Loss: 0.0000, Accuracy: 98.744918699187
Epoch [1700/2500], Loss: 0.0000, Accu

In [8]:
data = {
    "model_state": model.state_dict(),
    "input_size": input_size,
    "hidden_size": hidden_size,
    "output_size": output_size,
    "all_words": all_words,
    "tags": tags
    }

FILE = "chatbot_model_base.pth"
torch.save(data, FILE)

print(f'training complete. file saved to {FILE}')

training complete. file saved to chatbot_model_base.pth


## Deploying Q/A model

In [9]:
import json
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from sklearn.metrics.pairwise import cosine_similarity

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import TfidfVectorizer




In [10]:
# Charger le dataset JSON
with open('piaf-v1.1.json', 'r', encoding='utf-8') as file:
    dataset = json.load(file)

# Prétraitement des données
paragraphs = []
questions = []
answers_start = []
answers_end = []

for article in dataset['data']:
    for paragraph in article['paragraphs']:
        context = paragraph['context']
        for qa in paragraph['qas']:
            question = qa['question']
            for answer in qa['answers']:
                answer_start = answer['answer_start']
                answer_end = answer_start + len(answer['text'])

                paragraphs.append(context)
                questions.append(question)
                answers_start.append(answer_start)
                answers_end.append(answer_end)

In [11]:
# Chargement du modèle de question-réponse
model_qa = AutoModelForQuestionAnswering.from_pretrained("etalab-ia/camembert-base-squadFR-fquad-piaf")
tokenizer_qa = AutoTokenizer.from_pretrained("etalab-ia/camembert-base-squadFR-fquad-piaf")

  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(


In [12]:
# Fonction pour générer une réponse à partir d'une question
def generate_answer(question, context):
    # Prétraitement des données
    inputs = tokenizer_qa(question, context, add_special_tokens=True, return_tensors="pt")
    
    # Prédiction de la réponse
    outputs = model_qa(**inputs)
    start_scores = outputs.start_logits
    end_scores = outputs.end_logits
    
    # Obtenir les indices de début et de fin de la réponse prédite
    start_index = torch.argmax(start_scores)
    end_index = torch.argmax(end_scores)
    
    # Convertir les indices en texte
    answer = tokenizer_qa.convert_tokens_to_string(tokenizer_qa.convert_ids_to_tokens(inputs["input_ids"][0][start_index:end_index+1]))
    
    return answer

In [13]:
# Fonction pour trouver le contexte le plus pertinent pour une question donnée
def find_most_relevant_context(question, dataset):
    # Utiliser TF-IDF pour représenter les contextes en vecteurs
    contexts=[]
    for article in dataset['data']:
        for paragraph in article['paragraphs']:
            contexts += [paragraph['context']]
    
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform(contexts)
    
    # Convertir la question en vecteur
    question_vector = vectorizer.transform([question])
    
    # Calculer les similarités entre la question et les contextes
    similarities = cosine_similarity(question_vector, vectors)
    
    # Trouver l'index du contexte le plus similaire
    most_similar_index = similarities.argmax()
    
    # Renvoyer le contexte le plus similaire
    return contexts[most_similar_index]

## Test

In [14]:
# -------------- IMPORT LIBRARY -----------------
import random
import json

import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


# ------ IMPORT DATA & MODEL FOR BASE CASE ------
with open('intents.json', 'r') as json_data:
    intents = json.load(json_data)

FILE = "chatbot_model_base.pth"
data = torch.load(FILE)

input_size = data["input_size"]
hidden_size = data["hidden_size"]
output_size = data["output_size"]
all_words = data['all_words']
tags = data['tags']
model_state = data["model_state"]

model = NeuralNet(input_size, hidden_size, output_size).to(device)
model.load_state_dict(model_state)
model.eval()

NeuralNet(
  (l1): Linear(in_features=146, out_features=512, bias=True)
  (l2): Linear(in_features=512, out_features=512, bias=True)
  (l3): Linear(in_features=512, out_features=32, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.5, inplace=False)
)

In [15]:
def process_question(text):
    cleaned_text = text.lower().strip() 
    return cleaned_text


In [19]:
import random
import json
import torch
import pymongo
import re
from dateutil import parser
from datetime import datetime

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def fetch_articles():
    client = pymongo.MongoClient("mongodb://localhost:27017/")
    db = client["StageTT"]
    collection = db["TT"]
    articles = list(collection.find({}))
    
  
    for article in articles:
        if isinstance(article['Publish Date'], str):
            try:
               
                parsed_date = parser.parse(article['Publish Date'])
               
                article['Publish Date'] = parsed_date.replace(tzinfo=None)
            except ValueError:
                print(f"Warning: Failed to parse date for article {article['_id']}")
    
    return articles


with open('intents.json', 'r') as json_data:
    intents = json.load(json_data)

FILE = "chatbot_model_base.pth"
data = torch.load(FILE)

input_size = data["input_size"]
hidden_size = data["hidden_size"]
output_size = data["output_size"]
all_words = data['all_words']
tags = data['tags']
model_state = data["model_state"]


model = NeuralNet(input_size, hidden_size, output_size).to(device)
model.load_state_dict(model_state)
model.eval()


bot_name = "QueryBot"

print("Début de la conversation ('quit' pour finir la conversation...)")
while True:
  
    sentence = input("User: ")
    if sentence == 'quit':
        break
    
  
    if "tunisie télécom" in sentence.lower():
        articles = fetch_articles()
        if articles:
            latest_article = max(articles, key=lambda x: x['Publish Date'])
            print(f"{bot_name}: La dernière nouvelle sur Tunisie Télécom est : {latest_article['Title']}")
        else:
            print(f"{bot_name}: Il n'y a pas d'articles disponibles sur Tunisie Télécom.")
    

    elif any(word in sentence.lower() for word in ["combien d'articles positifs", "combien d'articles négatifs", "année avec le plus d'articles négatifs", "année avec le plus d'articles positifs"]):
        year_match = re.search(r'\b\d{4}\b', sentence)
        articles = fetch_articles()
        if articles:
            articles_by_year = {}
            for article in articles:
                year = article['Publish Date'].year
                sentiment = article['SentimentCamembert']
                if year not in articles_by_year:
                    articles_by_year[year] = {'positive': 0, 'negative': 0}
                if sentiment == 'positive':
                    articles_by_year[year]['positive'] += 1
                elif sentiment == 'negative':
                    articles_by_year[year]['negative'] += 1
            
            if "combien d'articles positifs" in sentence.lower():
                if year_match:
                    year = int(year_match.group(0))
                    positive_count = articles_by_year.get(year, {}).get('positive', 0)
                    print(f"{bot_name}: Il y a {positive_count} articles positifs en {year}.")
                else:
                    print(f"{bot_name}: Veuillez spécifier une année valide pour obtenir le nombre d'articles positifs.")
            elif "combien d'articles négatifs" in sentence.lower():
                if year_match:
                    year = int(year_match.group(0))
                    negative_count = articles_by_year.get(year, {}).get('negative', 0)
                    print(f"{bot_name}: Il y a {negative_count} articles négatifs en {year}.")
                else:
                    print(f"{bot_name}: Veuillez spécifier une année valide pour obtenir le nombre d'articles négatifs.")
            elif "année avec le plus d'articles négatifs" in sentence.lower():
                year = max(articles_by_year, key=lambda x: articles_by_year[x]['negative'])
                count = articles_by_year[year]['negative']
                print(f"{bot_name}: L'année avec le plus d'articles négatifs est {year} avec {count} articles.")
            elif "année avec le plus d'articles positifs" in sentence.lower():
                year = max(articles_by_year, key=lambda x: articles_by_year[x]['positive'])
                count = articles_by_year[year]['positive']
                print(f"{bot_name}: L'année avec le plus d'articles positifs est {year} avec {count} articles.")
        else:
            print(f"{bot_name}: Aucun article trouvé.")
    
    
    else:
        sentence_qa = sentence
        sentence = tokenize(sentence)
        X = bag_of_words(sentence, all_words)
        X = X.reshape(1, X.shape[0])
        X = torch.from_numpy(X).to(device)

        output = model(X)
        _, predicted = torch.max(output, dim=1)

        tag = tags[predicted.item()]

        probs = torch.softmax(output, dim=1)
        prob = probs[0][predicted.item()]

        
        if prob.item() > 0.75:  
            for intent in intents['intents']:
                if tag == intent["tag"]:
                    print(f"{bot_name}: {random.choice(intent['responses'])}")
                    break
        else:
            response = process_question(sentence_qa)
            if response:
                print(f"{bot_name}: {response}")
            else:
                context = find_most_relevant_context(sentence_qa, dataset)
                answer = generate_answer(sentence_qa, context)
                print(f"{bot_name}: {answer}")


Début de la conversation ('quit' pour finir la conversation...)
User: blague
QueryBot: J'ai mange une horloge hier, c'etait tres chronophage
User: devinette
QueryBot: Pourquoi un velo ne peut-il pas tenir tout seul ?.....Il est trop fatigue.
User: bonjour
QueryBot: Content de vous revoir
User: ça va?
QueryBot: Tout va bien.. Et vous?
User: bien
QueryBot: Content de le savoir!
User:  Quelle est la dernière nouvelle sur Tunisie Télécom ?
QueryBot: La dernière nouvelle sur Tunisie Télécom est : Tunisie Télécom et l’AtGmo : “Run of Heroes” pour soutenir les malades d’hémopathies
User: Combien d'articles positifs y a-t-il en 2023 ?
QueryBot: Il y a 9 articles positifs en 2023.
User: Quelle est l'année avec le plus d'articles négatifs ?
QueryBot: L'année avec le plus d'articles négatifs est 2019 avec 7 articles.
User: Combien d'articles négatifs y a-t-il en 2024 ?
QueryBot: Il y a 0 articles négatifs en 2024.


KeyboardInterrupt: Interrupted by user

In [25]:
#interface
import random
import json
import torch
import pymongo
import re
from dateutil import parser
from datetime import datetime
import tkinter as tk
from tkinter import scrolledtext, Entry, END
def send_message():
    user_input = input_field.get()
    input_field.delete(0, END)
    chat_history.config(state=tk.NORMAL)
    chat_history.insert(tk.END, f"User: {user_input}\n", "user")
    
    if user_input.lower() == 'quit':
        root.quit()
    elif "tunisie télécom" in user_input.lower():
        articles = fetch_articles()
        if articles:
            latest_article = max(articles, key=lambda x: x['Publish Date'])
            response = f"{bot_name}: La dernière nouvelle sur Tunisie Télécom est : {latest_article['Title']}\n"
        else:
            response = f"{bot_name}: Il n'y a pas d'articles disponibles sur Tunisie Télécom.\n"
    elif any(word in user_input.lower() for word in ["combien d'articles positifs", "combien d'articles négatifs", "année avec le plus d'articles négatifs", "année avec le plus d'articles positifs"]):
        year_match = re.search(r'\b\d{4}\b', user_input)
        articles = fetch_articles()
        if articles:
            articles_by_year = {}
            for article in articles:
                year = article['Publish Date'].year
                sentiment = article.get('SentimentCamembert', '')
                if year not in articles_by_year:
                    articles_by_year[year] = {'positive': 0, 'negative': 0}
                if sentiment == 'positive':
                    articles_by_year[year]['positive'] += 1
                elif sentiment == 'negative':
                    articles_by_year[year]['negative'] += 1
            
            if "combien d'articles positifs" in user_input.lower():
                if year_match:
                    year = int(year_match.group(0))
                    positive_count = articles_by_year.get(year, {}).get('positive', 0)
                    response = f"{bot_name}: Il y a {positive_count} articles positifs en {year}.\n"
                else:
                    response = f"{bot_name}: Veuillez spécifier une année valide pour obtenir le nombre d'articles positifs.\n"
            elif "combien d'articles négatifs" in user_input.lower():
                if year_match:
                    year = int(year_match.group(0))
                    negative_count = articles_by_year.get(year, {}).get('negative', 0)
                    response = f"{bot_name}: Il y a {negative_count} articles négatifs en {year}.\n"
                else:
                    response = f"{bot_name}: Veuillez spécifier une année valide pour obtenir le nombre d'articles négatifs.\n"
            elif "année avec le plus d'articles négatifs" in user_input.lower():
                year = max(articles_by_year, key=lambda x: articles_by_year[x]['negative'])
                count = articles_by_year[year]['negative']
                response = f"{bot_name}: L'année avec le plus d'articles négatifs est {year} avec {count} articles.\n"
            elif "année avec le plus d'articles positifs" in user_input.lower():
                year = max(articles_by_year, key=lambda x: articles_by_year[x]['positive'])
                count = articles_by_year[year]['positive']
                response = f"{bot_name}: L'année avec le plus d'articles positifs est {year} avec {count} articles.\n"
        else:
            response = f"{bot_name}: Aucun article trouvé.\n"
    else:
        sentence_qa = user_input
        sentence = tokenize(user_input)
        X = bag_of_words(sentence, all_words)
        X = X.reshape(1, X.shape[0])
        X = torch.from_numpy(X).to(device)

        output = model(X)
        _, predicted = torch.max(output, dim=1)

        tag = tags[predicted.item()]

        probs = torch.softmax(output, dim=1)
        prob = probs[0][predicted.item()]

        if prob.item() > 0.75:
            for intent in intents['intents']:
                if tag == intent["tag"]:
                    response = f"{bot_name}: {random.choice(intent['responses'])}\n"
                    break
        else:
            response = process_question(sentence_qa)
            if response:
                response = f"{bot_name}: {response}\n"
            else:
                context = find_most_relevant_context(sentence_qa, dataset)
                answer = generate_answer(sentence_qa, context)
                response = f"{bot_name}: {answer}\n"

    chat_history.insert(tk.END, response, "bot")
    chat_history.config(state=tk.DISABLED)
    chat_history.see(tk.END)


root = tk.Tk()
root.title("Chatbot Interface")

frame = tk.Frame(root)
frame.pack(pady=10)

scrollbar = tk.Scrollbar(frame)
scrollbar.pack(side=tk.RIGHT, fill=tk.Y)

chat_history = scrolledtext.ScrolledText(frame, width=50, height=20, wrap=tk.WORD, state=tk.DISABLED)
chat_history.pack(padx=10, pady=10)

input_field = Entry(root, font=("Helvetica", 14))
input_field.pack(fill=tk.X, padx=20, pady=10)

input_field.bind("<Return>", lambda _: send_message())

send_button = tk.Button(root, text="Envoyer", command=send_message)
send_button.pack(pady=10)

root.mainloop()

In [None]:
#import random
#import json
#import torch

#device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Charger les données et le modèle pré-entraîné
#with open('intents.json', 'r') as json_data:
#   intents = json.load(json_data)

#FILE = "chatbot_model_base.pth"
#data = torch.load(FILE)

#input_size = data["input_size"]
#hidden_size = data["hidden_size"]
#output_size = data["output_size"]
#all_words = data['all_words']
#tags = data['tags']
#model_state = data["model_state"]

# Charger le modèle
#model = NeuralNet(input_size, hidden_size, output_size).to(device)
#model.load_state_dict(model_state)
#model.eval()

# Nom du chatbot
#bot_name = "QueryBot"

#print("Début de la conversation ('quit' pour finir la conversation...)")
#while True:
    # Saisie utilisateur
    #sentence = input("User: ")
    #if sentence == 'quit':
     #   break
    
    # Prédiction de l'intention
   # sentence_qa = sentence
   # sentence = tokenize(sentence)
   # X = bag_of_words(sentence, all_words)
    #X = X.reshape(1, X.shape[0])
    #X = torch.from_numpy(X).to(device)

    #output = model(X)
    #_, predicted = torch.max(output, dim=1)

    #tag = tags[predicted.item()]

    #probs = torch.softmax(output, dim=1)
    #prob = probs[0][predicted.item()]
    
    # Répondre en fonction de l'intention prédite ou en utilisant la question-réponse
    #if prob.item() > 0.75:  # ajuster le seuil de probabilité selon la précision souhaitée
     #   for intent in intents['intents']:
      #      if tag == intent["tag"]:
       #         print(f"{bot_name}: {random.choice(intent['responses'])}")
        #        break  # sortir après avoir trouvé une réponse appropriée
   # else:
    #    response = process_question(sentence_qa)
     #   if response:
      #      print(f"{bot_name}: {response}")
       # else:
        #    context = find_most_relevant_context(sentence_qa, dataset)
         #   answer = generate_answer(sentence_qa, context)
          #  print(f"{bot_name}: {answer}")


In [None]:
#Début de la conversation ('quit' pour finir la conversation...)
#User: Quelle est la dernière nouvelle sur Tunisie Télécom ?
#--------------------------------------------------
#QueryBot: La dernière nouvelle sur Tunisie Télécom est : Tunisie Télécom lance un nouveau service de téléphonie. En savoir plus : http://example.com/article
#==================================================
#User: Combien y a-t-il eu d'articles positifs en 2020 ?
#==================================================
#QueryBot: Il y a 8 articles positifs en 2020.
#--------------------------------------------------
#User: Quelle année a eu le plus d'articles négatifs ?
#==================================================
#QueryBot: L'année avec le plus d'articles négatifs est 2022 avec 15 articles.
#--------------------------------------------------



## Summary Model for the chatbot

In [22]:
from datasets import load_dataset

dataset_orangesum = load_dataset("GEM/OrangeSum", "abstract") # we can also specify "title" to obtain pairs of text-title
#dataset_xlsum = load_dataset("csebuetnlp/xlsum", "french")
#dataset_mlsum = load_dataset("mlsum", "fr")

In [23]:
import pandas as pd
import numpy as np
import tensorflow as tf
import re
from nltk.corpus import stopwords
import time
from tensorflow.python.layers.core import Dense
from tensorflow.python.ops.rnn_cell_impl import _zero_state_tensors
print('TensorFlow Version: {}'.format(tf.__version__))

TensorFlow Version: 2.15.0


In [24]:
def clean_text(text, remove_stopwords = True):
    '''Remove unwanted characters, stopwords, and format the text to create fewer nulls word embeddings'''
    
    # Convert words to lower case
    text = text.lower()
    
    # Replace contractions with their longer forms 
    #if True:
     #   text = text.split()
      #  new_text = []
       # for word in text:
        #    if word in contractions:
         #       new_text.append(contractions[word])
          #  else:
           #     new_text.append(word)
        #text = " ".join(new_text)
    
    # Format words and remove unwanted characters
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'&amp;', '', text) 
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'\'', ' ', text)
    
    # Optionally, remove stop words
    if remove_stopwords:
        text = text.split()
        stops = set(stopwords.words("french"))
        text = [w for w in text if not w in stops]
        text = " ".join(text)

    return text

In [25]:
dataset_orangesum.shape

{'train': (21401, 4), 'test': (1500, 4), 'validation': (1500, 4)}

In [26]:
df_test_OS = pd.DataFrame(dataset_orangesum['test'])
df_train_OS = pd.DataFrame(dataset_orangesum['train'])
df_validation_OS = pd.DataFrame(dataset_orangesum['validation'])

df_train_OS.isnull().sum()

gem_id        0
input         0
target        0
references    0
dtype: int64

In [27]:
df_train_OS.head()

Unnamed: 0,gem_id,input,target,references
0,OrangeSum_abstract-train-0,Thierry Mariani sur la liste du Rassemblement ...,L'information n'a pas été confirmée par l'inté...,[L'information n'a pas été confirmée par l'int...
1,OrangeSum_abstract-train-1,C'est désormais officiel : Alain Juppé n'est p...,Le maire de Bordeaux ne fait plus partie des R...,[Le maire de Bordeaux ne fait plus partie des ...
2,OrangeSum_abstract-train-2,La mesure est décriée par les avocats et les m...,"En 2020, les tribunaux d'instance fusionnent a...","[En 2020, les tribunaux d'instance fusionnent ..."
3,OrangeSum_abstract-train-3,Dans une interview accordée au Figaro mercredi...,"Les médecins jugés ""gros prescripteurs d'arrêt...","[Les médecins jugés ""gros prescripteurs d'arrê..."
4,OrangeSum_abstract-train-4,Le préjudice est estimé à 2 millions d'euros. ...,Il aura fallu mobiliser 90 gendarmes pour cett...,[Il aura fallu mobiliser 90 gendarmes pour cet...


In [28]:
import nltk
nltk.download('stopwords')
  
# Clean the summaries and texts
clean_target = []
for target in df_train_OS.target:
    clean_target.append(clean_text(target, remove_stopwords=False))
print("Summaries are complete.")

clean_input = []
for input in df_train_OS.input:
    clean_input.append(clean_text(input))
print("Texts are complete.")

[nltk_data] Downloading package stopwords to C:\Users\Mega-
[nltk_data]     Pc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Summaries are complete.
Texts are complete.


In [29]:
# Inspect the cleaned summaries and texts to ensure they have been cleaned well
for i in range(3):
    print("Clean News #",i+1)
    print(clean_target[i])
    print(clean_input[i])
    print()

Clean News # 1
l information n a pas été confirmée par l intéressé qui déclare toutefois étudier la question 

thierry mariani liste rassemblement national rn ex fn européennes affirme mardi 11 septembre chez pol nouvelle newsletter politique libération ancien député républicain ministre nicolas sarkozy point rejoindre troupes marine pen élections européennes 2019 ça va faire plus question calendrier obligé annoncer tout suite huit mois européennes ainsi assuré membre influent rn contacté franceinfo mariani a confirmé information élections juin sais numéro 1 liste a répondu ancien ministre transports reconnaît toutefois toujours cité franceinfo nom liste rn fait partie possibilités fréjus ville sympathique prévu rendre week end a ailleurs commenté twitter alors marine pen réunit cadres parti week end cité varoise proximité connue fnla proximité thierry mariani parti frontiste nouvelle sans alliés allons rester opposition longtemps temps renverser table front national a évolué regardons

In [30]:
def count_words(count_dict, text):
    '''Count the number of occurrences of each word in a set of text'''
    for sentence in text:
        for word in sentence.split():
            if word not in count_dict:
                count_dict[word] = 1
            else:
                count_dict[word] += 1

In [31]:
# Find the number of times each word was used and the size of the vocabulary
word_counts = {}

count_words(word_counts, clean_target)
count_words(word_counts, clean_input)
            
print("Size of Vocabulary:", len(word_counts))

Size of Vocabulary: 111529


In [34]:
from torchtext.vocab import FastText
from torchtext import vocab


In [35]:
# Load of pretrained embeddings vectors :
pretrained_vectors_fasttext = FastText(language='fr')

In [36]:
print(f'The pre-trained vocabulary contains {pretrained_vectors_fasttext.vectors.shape[0]} words, with embeddings in vectors of size {pretrained_vectors_fasttext.vectors.shape[1]}')

The pre-trained vocabulary contains 1152449 words, with embeddings in vectors of size 300


In [37]:
pretrained_vectors_words = pretrained_vectors_fasttext.stoi.keys()
pretrained_vectors_values = pretrained_vectors_fasttext.stoi.values()

In [38]:
pretrained_vectors_fasttext['maison']

tensor([ 0.1663, -0.2434, -0.1298,  0.2558,  0.2620,  0.4340,  0.2389,  0.2846,
        -0.0285,  0.2952, -0.1806, -0.0203,  0.2095,  0.2392,  0.4044,  0.2178,
         0.3261,  0.1015,  0.1417, -0.1413, -0.1626, -0.6919, -0.1303,  0.5766,
         0.2136, -0.0434, -0.4864,  0.2376, -0.3875,  0.0248,  0.5002,  0.4109,
        -0.2349,  0.2109, -0.1231, -0.1220, -0.2864, -0.2508, -0.2469,  0.0470,
         0.2941, -0.2932, -0.0470, -0.0928,  0.0722, -0.0158,  0.2090,  0.1393,
         0.3059,  0.3177, -0.1812, -0.0239, -0.1266,  0.0802, -0.1903, -0.2608,
        -0.3757, -0.0703,  0.3611,  0.2268, -0.1355,  0.2499, -0.0559, -0.1626,
         0.1937,  0.3333, -0.0398, -0.0106, -0.2556, -0.2036,  0.3537,  0.0297,
         0.0255, -0.1837,  0.1164, -0.3757,  0.2895, -0.2726,  0.0061, -0.2071,
        -0.2901, -0.0297, -0.0647,  0.1851, -0.0209, -0.0855,  0.0574,  0.3292,
        -0.3409, -0.4960, -0.1257, -0.3342,  0.0513, -0.0179,  0.0588,  0.0645,
        -0.2976,  0.0638,  0.2410, -0.25

In [39]:
# Load Conceptnet Numberbatch's (CN) embeddings, similar to GloVe, but probably better 
# (https://github.com/commonsense/conceptnet-numberbatch)
import numpy as np

import tensorflow as tf
import timeit


#data_path = 'drive/MyDrive/numberbatch-fr.txt'
#data_path = 'drive/MyDrive/Colab Notebooks/numberbatch-fr-clean.txt'
#data_path = 'C:/Users/Giuseppe/Desktop/NLP/Embeddings/cc.fr.300.vec.gz'

embeddings_index = {}#'rb' encoding='utf-8'

#word_dict = []
#with open(data_path, 'r', encoding='utf-8') as f: 
 #   for line in f:
        #values = line.split(' ')
        #line = re.split(r'fr/',line)
        #values = re.split(" ", line[1])
word = pretrained_vectors_fasttext.stoi.keys()#[12:]
        #values = pretrained_vectors_fasttext.stoi.values()
        #word_dict.append(word) 
for word in word_counts:
    embedding = pretrained_vectors_fasttext[word]
            #embedding = np.asarray(values[1:])#, dtype='float32'
    embeddings_index[word] = embedding
        #print(word)
print('Word embeddings:', len(embeddings_index))

Word embeddings: 111529


In [40]:
# Find the number of words that are missing from CN, and are used more than our threshold.
missing_words = 0
threshold = 20

for word, count in word_counts.items():
    if count > threshold:
        if word not in embeddings_index:
            missing_words += 1
            
missing_ratio = round(missing_words/len(word_counts),4)*100
            
print("Number of words missing from fastText:", missing_words)
print("Percent of words that are missing from vocabulary: {}%".format(missing_ratio))

Number of words missing from fastText: 0
Percent of words that are missing from vocabulary: 0.0%


In [41]:
# Limit the vocab that we will use to words that appear ≥ threshold or are in CN

#dictionary to convert words to integers
vocab_to_int = {} 

value = 0
for word, count in word_counts.items():
    if count >= threshold or word in embeddings_index:
        vocab_to_int[word] = value
        value += 1

# Special tokens that will be added to our vocab
codes = ["<UNK>","<PAD>","<EOS>","<GO>"]   

# Add codes to vocab
for code in codes:
    vocab_to_int[code] = len(vocab_to_int)

# Dictionary to convert integers to words
int_to_vocab = {}
for word, value in vocab_to_int.items():
    int_to_vocab[value] = word

usage_ratio = round(len(vocab_to_int) / len(word_counts),4)*100

print("Total number of unique words:", len(word_counts))
print("Number of words we will use:", len(vocab_to_int))
print("Percent of words we will use: {}%".format(usage_ratio))

Total number of unique words: 111529
Number of words we will use: 111533
Percent of words we will use: 100.0%


In [42]:
# Need to use 300 for embedding dimensions to match CN's vectors.
embedding_dim = 300
nb_words = len(vocab_to_int)

# Create matrix with default values of zero
word_embedding_matrix = np.zeros((nb_words, embedding_dim), dtype=np.float32)
for word, i in vocab_to_int.items():
    if word in embeddings_index:
        word_embedding_matrix[i] = embeddings_index[word]
    else:
        # If word not in CN, create a random embedding for it
        new_embedding = np.array(np.random.uniform(-1.0, 1.0, embedding_dim))
        embeddings_index[word] = new_embedding
        word_embedding_matrix[i] = new_embedding

# Check if value matches len(vocab_to_int)
print(len(word_embedding_matrix))

111533


In [43]:
def convert_to_ints(text, word_count, unk_count, eos=False):
    '''Convert words in text to an integer.
       If word is not in vocab_to_int, use UNK's integer.
       Total the number of words and UNKs.
       Add EOS token to the end of texts'''
    ints = []
    for sentence in text:
        sentence_ints = []
        for word in sentence.split():
            word_count += 1
            if word in vocab_to_int:
                sentence_ints.append(vocab_to_int[word])
            else:
                sentence_ints.append(vocab_to_int["<UNK>"])
                unk_count += 1
        if eos:
            sentence_ints.append(vocab_to_int["<EOS>"])
        ints.append(sentence_ints)
    return ints, word_count, unk_count

In [44]:
# Apply convert_to_ints to clean_target and clean_input
word_count = 0
unk_count = 0

int_target, word_count, unk_count = convert_to_ints(clean_target, word_count, unk_count)
int_input, word_count, unk_count = convert_to_ints(clean_input, word_count, unk_count, eos=True)

unk_percent = round(unk_count/word_count,4)*100

print("Total number of words in headlines:", word_count)
print("Total number of UNKs in headlines:", unk_count)
print("Percent of words that are UNK: {}%".format(unk_percent))

Total number of words in headlines: 5339165
Total number of UNKs in headlines: 0
Percent of words that are UNK: 0.0%


In [45]:
def create_lengths(text):
    '''Create a data frame of the sentence lengths from a text'''
    lengths = []
    for sentence in text:
        lengths.append(len(sentence))
    return pd.DataFrame(lengths, columns=['counts'])

In [46]:
lengths_target = create_lengths(int_target)
lengths_input = create_lengths(int_input)

print("Summaries:")
print(lengths_target.describe())
print()
print("Texts:")
print(lengths_input.describe())

Summaries:
             counts
count  21401.000000
mean      34.393112
std       12.316238
min        3.000000
25%       26.000000
50%       34.000000
75%       42.000000
max      164.000000

Texts:
             counts
count  21401.000000
mean     216.088921
std      106.650884
min       11.000000
25%      142.000000
50%      192.000000
75%      260.000000
max     1884.000000


In [47]:
# Inspect the length of "input"
print(np.percentile(lengths_input.counts, 90))
print(np.percentile(lengths_input.counts, 95))
print(np.percentile(lengths_input.counts, 99))

377.0
417.0
525.0


In [48]:
# Inspect the length of "target"
print(np.percentile(lengths_target.counts, 90))
print(np.percentile(lengths_target.counts, 95))
print(np.percentile(lengths_target.counts, 99))

49.0
54.0
70.0


In [49]:
def unk_counter(sentence):
    '''Counts the number of time UNK appears in a sentence.'''
    unk_count = 0
    for word in sentence:
        if word == vocab_to_int["<UNK>"]:
            unk_count += 1
    return unk_count

In [50]:
# takes a long time  , this is normal

# Sort the summaries and texts by the length of the texts, shortest to longest
# Limit the length of summaries and texts based on the min and max ranges.
# Remove texts that include too many UNKs

sorted_target = []
sorted_input = []
max_input_length = 377
max_target_length = 70
min_length = 2
unk_input_limit = 1
unk_target_limit = 0

for length in range(min(lengths_input.counts), max_input_length): 
    for count, words in enumerate(int_target):
        if (len(int_target[count]) >= min_length and
            len(int_target[count]) <= max_target_length and
            len(int_input[count]) >= min_length and
            unk_counter(int_target[count]) <= unk_target_limit and
            unk_counter(int_input[count]) <= unk_input_limit and
            length == len(int_input[count])
           ):
            sorted_target.append(int_target[count])
            sorted_input.append(int_input[count])
        
# Compare lengths to ensure they match
print(len(sorted_target))
print(len(sorted_input))

19058
19058


In [51]:
def model_inputs():
    '''Create placeholders for inputs to the model'''
    
    input_data = tf.placeholder(tf.int32, [None, None], name='input')
    targets = tf.placeholder(tf.int32, [None, None], name='targets')
    lr = tf.placeholder(tf.float32, name='learning_rate')
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')
    summary_length = tf.placeholder(tf.int32, (None,), name='summary_length')
    max_summary_length = tf.reduce_max(summary_length, name='max_dec_len')
    text_length = tf.placeholder(tf.int32, (None,), name='text_length')

    return input_data, targets, lr, keep_prob, summary_length, max_summary_length, text_length

In [52]:
def process_encoding_input(target_data, vocab_to_int, batch_size):
    '''Remove the last word id from each batch and concat the <GO> to the begining of each batch'''
    
    ending = tf.strided_slice(target_data, [0, 0], [batch_size, -1], [1, 1])
    dec_input = tf.concat([tf.fill([batch_size, 1], vocab_to_int['<GO>']), ending], 1)

    return dec_input

In [53]:
def encoding_layer(rnn_size, sequence_length, num_layers, rnn_inputs, keep_prob):
    '''Create the encoding layer'''
    
    for layer in range(num_layers):
        with tf.variable_scope('encoder_{}'.format(layer)):
            cell_fw = tf.contrib.rnn.LSTMCell(rnn_size,
                                              initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
            cell_fw = tf.contrib.rnn.DropoutWrapper(cell_fw, 
                                                    input_keep_prob = keep_prob)

            cell_bw = tf.contrib.rnn.LSTMCell(rnn_size,
                                              initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
            cell_bw = tf.contrib.rnn.DropoutWrapper(cell_bw, 
                                                    input_keep_prob = keep_prob)

            enc_output, enc_state = tf.nn.bidirectional_dynamic_rnn(cell_fw, 
                                                                    cell_bw, 
                                                                    rnn_inputs,
                                                                    sequence_length,
                                                                    dtype=tf.float32)
    # Join outputs since we are using a bidirectional RNN
    enc_output = tf.concat(enc_output,2)
    
    return enc_output, enc_state

In [54]:
def training_decoding_layer(dec_embed_input, summary_length, dec_cell, initial_state, output_layer, 
                            vocab_size, max_summary_length):
    '''Create the training logits'''
    
    training_helper = tf.contrib.seq2seq.TrainingHelper(inputs=dec_embed_input,
                                                        sequence_length=summary_length,
                                                        time_major=False)

    training_decoder = tf.contrib.seq2seq.BasicDecoder(dec_cell,
                                                       training_helper,
                                                       initial_state,
                                                       output_layer) 

    training_logits, _ , _ = tf.contrib.seq2seq.dynamic_decode(training_decoder,
                                                           output_time_major=False,
                                                           impute_finished=True,
                                                           maximum_iterations=max_summary_length)
    return training_decoder

In [55]:
def inference_decoding_layer(embeddings, start_token, end_token, dec_cell, initial_state, output_layer,
                             max_summary_length, batch_size):
    '''Create the inference logits'''
    
    start_tokens = tf.tile(tf.constant([start_token], dtype=tf.int32), [batch_size], name='start_tokens')
    
    inference_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(embeddings,
                                                                start_tokens,
                                                                end_token)
                
    inference_decoder = tf.contrib.seq2seq.BasicDecoder(dec_cell,
                                                        inference_helper,
                                                        initial_state,
                                                        output_layer)
                
    inference_logits, _ , _ = tf.contrib.seq2seq.dynamic_decode(inference_decoder,
                                                            output_time_major=False,
                                                            impute_finished=True,
                                                            maximum_iterations=max_summary_length)
    
    return inference_decoder

In [56]:
def decoding_layer(dec_embed_input, embeddings, enc_output, enc_state, vocab_size, text_length, summary_length, 
                   max_summary_length, rnn_size, vocab_to_int, keep_prob, batch_size, num_layers):
    '''Create the decoding cell and attention for the training and inference decoding layers'''
    
    for layer in range(num_layers):
        with tf.variable_scope('decoder_{}'.format(layer)):
            lstm = tf.contrib.rnn.LSTMCell(rnn_size,
                                           initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
            dec_cell = tf.contrib.rnn.DropoutWrapper(lstm, 
                                                     input_keep_prob = keep_prob)
    
    output_layer = Dense(vocab_size,
                         kernel_initializer = tf.truncated_normal_initializer(mean = 0.0, stddev=0.1))
    
    attn_mech = tf.contrib.seq2seq.BahdanauAttention(rnn_size,
                                                  enc_output,
                                                  text_length,
                                                  normalize=False,
                                                  name='BahdanauAttention')

    dec_cell = tf.contrib.seq2seq.AttentionWrapper(dec_cell,
                                                          attn_mech,
                                                          rnn_size)
            
    #initial_state = tf.contrib.seq2seq.AttentionWrapperState(enc_state[0],
    #                                                                _zero_state_tensors(rnn_size, 
    #                                                                                    batch_size, 
    #                                                                                    tf.float32)) 
    initial_state = dec_cell.zero_state(batch_size=batch_size,dtype=tf.float32).clone(cell_state=enc_state[0])

    with tf.variable_scope("decode"):
        training_decoder = training_decoding_layer(dec_embed_input, 
                                                  summary_length, 
                                                  dec_cell, 
                                                  initial_state,
                                                  output_layer,
                                                  vocab_size, 
                                                  max_summary_length)
        
        training_logits,_ ,_ = tf.contrib.seq2seq.dynamic_decode(training_decoder,
                                  output_time_major=False,
                                  impute_finished=True,
                                  maximum_iterations=max_summary_length)
    with tf.variable_scope("decode", reuse=True):
        inference_decoder = inference_decoding_layer(embeddings,  
                                                    vocab_to_int['<GO>'], 
                                                    vocab_to_int['<EOS>'],
                                                    dec_cell, 
                                                    initial_state, 
                                                    output_layer,
                                                    max_summary_length,
                                                    batch_size)
        
        inference_logits,_ ,_ = tf.contrib.seq2seq.dynamic_decode(inference_decoder,
                                  output_time_major=False,
                                  impute_finished=True,
                                  maximum_iterations=max_summary_length)

    return training_logits, inference_logits

In [57]:
def seq2seq_model(input_data, target_data, keep_prob, text_length, summary_length, max_summary_length, 
                  vocab_size, rnn_size, num_layers, vocab_to_int, batch_size):
    '''Use the previous functions to create the training and inference logits'''
    
    # Use Numberbatch's embeddings and the newly created ones as our embeddings
    embeddings = word_embedding_matrix
    
    enc_embed_input = tf.nn.embedding_lookup(embeddings, input_data)
    enc_output, enc_state = encoding_layer(rnn_size, text_length, num_layers, enc_embed_input, keep_prob)
    
    dec_input = process_encoding_input(target_data, vocab_to_int, batch_size)
    dec_embed_input = tf.nn.embedding_lookup(embeddings, dec_input)
    
    training_logits, inference_logits  = decoding_layer(dec_embed_input, 
                                                        embeddings,
                                                        enc_output,
                                                        enc_state, 
                                                        vocab_size, 
                                                        text_length, 
                                                        summary_length, 
                                                        max_summary_length,
                                                        rnn_size, 
                                                        vocab_to_int, 
                                                        keep_prob, 
                                                        batch_size,
                                                        num_layers)
    
    return training_logits, inference_logits

In [58]:
def pad_sentence_batch(sentence_batch):
    """Pad sentences with <PAD> so that each sentence of a batch has the same length"""
    max_sentence = max([len(sentence) for sentence in sentence_batch])
    return [sentence + [vocab_to_int['<PAD>']] * (max_sentence - len(sentence)) for sentence in sentence_batch]

In [59]:
def get_batches(summaries, texts, batch_size):
    """Batch summaries, texts, and the lengths of their sentences together"""
    for batch_i in range(0, len(texts)//batch_size):
        start_i = batch_i * batch_size
        summaries_batch = summaries[start_i:start_i + batch_size]
        texts_batch = texts[start_i:start_i + batch_size]
        pad_summaries_batch = np.array(pad_sentence_batch(summaries_batch))
        pad_texts_batch = np.array(pad_sentence_batch(texts_batch))
        
        # Need the lengths for the _lengths parameters
        pad_summaries_lengths = []
        for summary in pad_summaries_batch:
            pad_summaries_lengths.append(len(summary))
        
        pad_texts_lengths = []
        for text in pad_texts_batch:
            pad_texts_lengths.append(len(text))
        
        yield pad_summaries_batch, pad_texts_batch, pad_summaries_lengths, pad_texts_lengths

In [60]:
# Set the Hyperparameters
epochs = 100
batch_size = 64
rnn_size = 256
num_layers = 2
learning_rate = 0.005
keep_probability = 0.75

In [63]:
!pip install tensorflow== 1.13.1


ERROR: Could not find a version that satisfies the requirement tensorflow== (from versions: 2.12.0rc0, 2.12.0rc1, 2.12.0, 2.12.1, 2.13.0rc0, 2.13.0rc1, 2.13.0rc2, 2.13.0, 2.13.1, 2.14.0rc0, 2.14.0rc1, 2.14.0, 2.14.1, 2.15.0rc0, 2.15.0rc1, 2.15.0, 2.15.1, 2.16.0rc0, 2.16.1, 2.16.2, 2.17.0rc0, 2.17.0rc1, 2.17.0)
ERROR: No matching distribution found for tensorflow==


In [61]:
# Build the graph
train_graph = tf.Graph()
# Set the graph to default to ensure that it is ready for training
with train_graph.as_default():
    
    # Load the model inputs    
    input_data, targets, lr, keep_prob, summary_length, max_summary_length, text_length = model_inputs()

    # Create the training and inference logits
    training_logits, inference_logits = seq2seq_model(tf.reverse(input_data, [-1]),
                                                      targets, 
                                                      keep_prob,   
                                                      text_length,
                                                      summary_length,
                                                      max_summary_length,
                                                      len(vocab_to_int)+1,
                                                      rnn_size, 
                                                      num_layers, 
                                                      vocab_to_int,
                                                      batch_size)
    
    # Create tensors for the training logits and inference logits
    training_logits = tf.identity(training_logits.rnn_output, 'logits')
    inference_logits = tf.identity(inference_logits.sample_id, name='predictions')
    
    # Create the weights for sequence_loss
    masks = tf.sequence_mask(summary_length, max_summary_length, dtype=tf.float32, name='masks')

    with tf.name_scope("optimization"):
        # Loss function
        cost = tf.contrib.seq2seq.sequence_loss(
            training_logits,
            targets,
            masks)

        # Optimizer
        optimizer = tf.train.AdamOptimizer(learning_rate)

        # Gradient Clipping
        gradients = optimizer.compute_gradients(cost)
        capped_gradients = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in gradients if grad is not None]
        train_op = optimizer.apply_gradients(capped_gradients)
print("Graph is built.")

AttributeError: module 'tensorflow' has no attribute 'placeholder'

In [None]:
# Subset the data for training
start = 10000
end = start + 8000
sorted_target_short = sorted_target[start:end]
sorted_input_short = sorted_input[start:end]
print("The shortest text length:", len(sorted_input_short[0]))
print("The longest text length:",len(sorted_input_short[-1]))

In [None]:
# Train the Model
learning_rate_decay = 0.95
min_learning_rate = 0.01#0.0005
display_step = 20 # Check training loss after every 20 batches
stop_early = 0 
stop = 6 #3 # If the update loss does not decrease in 3 consecutive update checks, stop training
per_epoch = 3 # Make 3 update checks per epoch
update_check = (len(sorted_input_short)//batch_size//per_epoch)-1

update_loss = 0 
batch_loss = 0
summary_update_loss = [] # Record the update losses for saving improvements in the model

  
tf.reset_default_graph()
checkpoint = "C:/Users/Giuseppe/Desktop/NLP/best_model.ckpt"  #300k sentence
with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())
    
    # If we want to continue training a previous session
    # loader = tf.train.import_meta_graph(checkpoint + '.meta')
    # loader.restore(sess, checkpoint)
    #sess.run(tf.local_variables_initializer())

    for epoch_i in range(1, epochs+1):
        update_loss = 0
        batch_loss = 0
        for batch_i, (summaries_batch, texts_batch, summaries_lengths, texts_lengths) in enumerate(
                get_batches(sorted_target_short, sorted_input_short, batch_size)):
            start_time = time.time()
            _, loss = sess.run(
                [train_op, cost],
                {input_data: texts_batch,
                 targets: summaries_batch,
                 lr: learning_rate,
                 summary_length: summaries_lengths,
                 text_length: texts_lengths,
                 keep_prob: keep_probability})

            batch_loss += loss
            update_loss += loss
            end_time = time.time()
            batch_time = end_time - start_time

            if batch_i % display_step == 0 and batch_i > 0:
                print('Epoch {:>3}/{} Batch {:>4}/{} - Loss: {:>6.3f}, Seconds: {:>4.2f}'
                      .format(epoch_i,
                              epochs, 
                              batch_i, 
                              len(sorted_input_short) // batch_size, 
                              batch_loss / display_step, 
                              batch_time*display_step))
                batch_loss = 0
                
                saver = tf.train.Saver() 
                saver.save(sess, checkpoint)
                
            if batch_i % update_check == 0 and batch_i > 0:
                print("Average loss for this update:", round(update_loss/update_check,3))
                summary_update_loss.append(update_loss)
                
              
                  
                # If the update loss is at a new minimum, save the model
                if update_loss <= min(summary_update_loss):
                    print('New Record!') 
                    stop_early = 0
                    saver = tf.train.Saver() 
                    saver.save(sess, checkpoint)

                else:
                    print("No Improvement.")
                    stop_early += 1
                    if stop_early == stop:
                        break
                update_loss = 0
            
                    
        # Reduce learning rate, but not below its minimum value
        learning_rate *= learning_rate_decay
        if learning_rate < min_learning_rate:
            learning_rate = min_learning_rate
        
        if stop_early == stop:
            print("Stopping Training.")
            break
    saver.save(sess, checkpoint)