## Abstract generation

In [None]:
## Libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import re 
from nltk.corpus import stopwords   
from nltk.tokenize import word_tokenize, sent_tokenize 
import heapq
import warnings
import nltk

# Silence every Python warning for the rest of the notebook.
warnings.filterwarnings("ignore")

# NLTK resources used below: the stop-word corpus and the Punkt tokenizer models.
# Recent NLTK releases load the tokenizer from 'punkt_tab' instead of the pickled 'punkt'
# package, so both are fetched to keep word_tokenize working across NLTK versions.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
## Connection to the Drive

# Mount Google Drive at /content/drive so the dataset files can be read from the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')
# NOTE(review): `dataroot` is not referenced by any later cell of this notebook, and it spells
# the folder "Shared drives" whereas the dataset cell reads from "Shareddrives" -- confirm which
# spelling is valid, then either build the dataset path from this variable or remove it.
dataroot = "/content/drive/Shared drives/ING3 IA: Use Case 1 (NLP Patent)"

Mounted at /content/drive


In [None]:
## Open the dataset

# Folder holding the cleaned CSV exports, and the file to read for each language.
DATA_DIR = "/content/drive/Shareddrives/ING3 IA: Use Case 1 (NLP Patent)/data_clean"
DATASET_FILES = {
    "en": "all_en_claim_abstract.csv",
    "de": "all_de_join_publications.csv",
    "fr": "all_fr_join_publications.csv",
}

# Language of the patents to load ("en", "de" or "fr").
# The stop-word list selected further down has to use the same language.
LANGUAGE = "fr"

data = pd.read_csv("{}/{}".format(DATA_DIR, DATASET_FILES[LANGUAGE]))

In [None]:
## Keep important columns

# Keep only the claim / abstract text columns and drop rows where either one is missing.
# Done as one chained expression: calling dropna(inplace=True) on a column selection mutates
# a derived frame and can raise SettingWithCopyWarning.
data = data[['claim', 'abstr']].dropna(axis=0)

In [None]:
## Remove html tags from them

# Replace each HTML fragment by its plain text. The parser is named explicitly ("lxml", the
# same one text_cleaner uses) so the result does not depend on which parsers happen to be
# installed and BeautifulSoup does not emit its "no parser specified" warning.
for text_column in ('abstr', 'claim'):
    data[text_column] = [
        BeautifulSoup(html_fragment, "lxml").get_text()
        for html_fragment in data[text_column]
    ]

In [None]:
## Function to clean a dataframe column of strings

def text_cleaner(text):
    """Normalise one claim or abstract string.

    Steps, in order: lowercase, strip any HTML markup, remove parenthesised
    spans, remove double quotes, remove possessive "'s", remove commas, and
    collapse every run of whitespace into a single space.

    Parameters
    ----------
    text : str
        Raw text of a claim or an abstract.

    Returns
    -------
    str
        The cleaned text, without leading or trailing whitespace.
    """
    cleaned = BeautifulSoup(text.lower(), "lxml").text
    cleaned = re.sub(r'\([^)]*\)', '', cleaned)   # drop "(...)" spans
    cleaned = re.sub('"', '', cleaned)
    cleaned = re.sub(r"'s\b", "", cleaned)         # possessive 's
    cleaned = re.sub(",", "", cleaned)
    # split() without arguments discards all surrounding and repeated whitespace
    return " ".join(cleaned.split())

# Cleaned claims and abstracts, one entry per row of `data` (same order in both lists)
cleaned_claim = [text_cleaner(raw_claim) for raw_claim in data['claim']]
cleaned_abstr = [text_cleaner(raw_abstr) for raw_abstr in data['abstr']]

In [None]:
## Stopwords language selection (has to match with the data language)

# Must name the language of the dataset loaded above, which is the French export.
# Supported values here: 'english', 'german', 'french'.
STOPWORD_LANGUAGE = 'french'

# Stored as a set: the cells below test membership once per token.
stop_words = set(nltk.corpus.stopwords.words(STOPWORD_LANGUAGE))

In [None]:
## Build a list of predicted abstracts (most meaningful sentence of the claim)

def _most_meaningful_sentence(text, stop_words):
    """Return the highest-scoring sentence of ``text`` as a one-sentence summary.

    Every token that is not a stop word is weighted by its frequency in the
    text divided by the frequency of the most frequent token. A sentence's
    score is the sum of the weights of its tokens; sentences are obtained by
    splitting the text on ".".

    Parameters
    ----------
    text : str
        Cleaned claim text.
    stop_words : container of str
        Tokens to ignore when counting frequencies.

    Returns
    -------
    str
        The best-scoring sentence, or '' when no sentence could be scored
        (for example an empty claim or one made only of stop words).
    """
    # Frequency of every token that is not a stop word
    word_frequencies = {}
    for word in nltk.word_tokenize(text):
        if word not in stop_words:
            word_frequencies[word] = word_frequencies.get(word, 0) + 1

    # Nothing to score: return an empty prediction instead of letting max() raise
    # ValueError, so the predictions stay aligned with the reference abstracts.
    if not word_frequencies:
        return ''

    # Normalise the counts by the highest one to get a weight per token
    maximum_frequency = max(word_frequencies.values())
    word_weights = {
        word: count / maximum_frequency
        for word, count in word_frequencies.items()
    }

    # Score each sentence by summing the weights of its tokens
    sentence_scores = {}
    for sentence in text.split("."):
        for word in nltk.word_tokenize(sentence):
            if word in word_weights:
                sentence_scores[sentence] = sentence_scores.get(sentence, 0) + word_weights[word]

    # Keep the single best sentence (empty list -> '' when nothing was scored)
    best_sentences = heapq.nlargest(1, sentence_scores, key=sentence_scores.get)
    return ' '.join(best_sentences)


# One predicted abstract per cleaned claim, in the same order as cleaned_claim
claim_to_abstr = [_most_meaningful_sentence(claim, stop_words) for claim in cleaned_claim]

# Evaluation metrics

In [None]:
from difflib import SequenceMatcher

# Accumulate the character-level similarity ratio of each (prediction, reference) pair
somme = 0
for predicted_abstr, reference_abstr in zip(claim_to_abstr, cleaned_abstr):
    somme += SequenceMatcher(None, predicted_abstr, reference_abstr).ratio()

# Raw text evaluation: mean ratio over all predictions, as a percentage
mean_ratio_percent = 100*somme/len(claim_to_abstr)
print("Predicted abstract similarity : ", "{:.2f}".format(mean_ratio_percent), " %")

Predicted abstract similarity :  20.34  %


In [None]:
## Advanced comparison

similarity_sum = 0

for predicted_abstr, reference_abstr in zip(claim_to_abstr, cleaned_abstr):

    # Keyword sets: distinct tokens of each text that are not stop words
    X_set = {w for w in word_tokenize(predicted_abstr) if w not in stop_words}
    Y_set = {w for w in word_tokenize(reference_abstr) if w not in stop_words}

    # Cosine similarity of the two binary keyword vectors:
    # shared keywords / sqrt(keywords in prediction * keywords in reference)
    shared_keywords = len(X_set & Y_set)
    norm = (len(X_set) * len(Y_set)) ** 0.5

    # An empty keyword set (e.g. an empty prediction) has no overlap with anything:
    # score it 0 instead of dividing by zero.
    cosine = shared_keywords / norm if norm else 0.0
    similarity_sum += cosine

# Meaningful words: mean cosine similarity over all predictions, as a percentage
print("Predicted abstract similarity : ", "{:.2f}".format(100*similarity_sum/len(claim_to_abstr)), " %")

Predicted abstract similarity :  54.26  %


In [None]:
## FR
# Full-text comparison similarity (SequenceMatcher) : 34.98 %
# Advanced comparison similarity (keyword cosine)   : 66.81 %

## DE
# Full-text comparison similarity (SequenceMatcher) : 37.92 %
# Advanced comparison similarity (keyword cosine)   : 59.83 %

## EN
# Full-text comparison similarity (SequenceMatcher) : 20.34 %
# Advanced comparison similarity (keyword cosine)   : 54.26 %