## Test NLP  
by Jeremy Trullier  
#### Trying some NLP libraries and models
#### Writing functions to easily retrieve relevant data from our Word document

In [1]:
import os
import json

from enum import Enum
from pptx.util import Inches
from pptx import Presentation
from pptx.enum.shapes import MSO_SHAPE

In [2]:
from process_json import *
from slide_functions import *

In [3]:
# Relative path of the directory that is listed and read from in the cells below.
datapath = '../data/'

In [4]:
# List the entries of the data directory, one name per line, to see which inputs exist.
files = os.listdir(datapath)
for filename in files:
    print(filename)

connard_de_virus.png
default_picture.jpg
Disney_json_ppt2.json
example.json
lego.json


In [5]:
# Path of the JSON file that is loaded in the next cell.
jsonpath = datapath+"example.json"

In [6]:
# Parse the JSON document. The encoding is given explicitly because open()
# otherwise uses the platform's locale encoding, whereas JSON files are UTF-8.
with open(jsonpath, encoding="utf-8") as fjson:
    data = json.load(fjson)

In [7]:
# 'document_content' holds the top-level elements of the document; print each
# one followed by a blank line to inspect how they are nested.
json_slides = data['document_content']
for elem in json_slides:
    print(elem, "\n") 

{'type': 'level_0', 'content': [{'type': 'title_0', 'content': ['Document title']}, {'type': 'plain_text', 'content': ['Lorem ipsum dolor sit amet, consectetur adipiscing elit. Fusce diam ipsum, aliquam sit amet tempor sed, sodales vitae tellus. Fusce vitae lobortis quam. Proin laoreet efficitur ligula, laoreet congue neque condimentum a']}]} 

{'type': 'level_1', 'content': [{'type': 'title_1', 'content': ['Part 1']}, {'type': 'plain_text', 'content': ['Lorem ipsum dolor sit amet, consectetur adipiscing elit. Fusce diam ipsum, aliquam sit amet tempor sed.']}, {'type': 'level_2', 'content': [{'type': 'title_2', 'content': ['Part 1-1']}, {'type': 'plain_text', 'content': ['Lorem ipsum dolor sit amet, consectetur adipiscing elit. Fusce diam ipsum, aliquam sit amet tempor sed, sodales vitae tellus.', 'Nulla odio nibh, aliquam sit amet eros a, vehicula eleifend tortor. Sed interdum tellus eu convallis pretium. Proin euismod felis id tortor semper, vel vehicula quam dapibus.']}]}, {'type': 

In [8]:
# breakdown_json is not defined in this notebook, so it must come from one of
# the star imports above.
# NOTE(review): presumably it flattens json_slides into slide_elements in place — confirm.
slide_elements = []
breakdown_json(json_slides, slide_elements)

In [9]:
def retrieve_content(list_slides):
    """
    Retrieves the plain texts from slide elements (can get them with breakdown_json)

    The input is scanned in order. Every 'plain_text' marker opens a new group
    and the entries that follow are collected into that group until an entry
    containing one of the structural keywords ('level_', 'title_', 'subtitle',
    'header') is met. Entries met outside of a group (titles, for instance)
    are ignored.

    Parameters:
        - list_slides : list of slide elements, markers and texts mixed
          (presumably a flat list of strings -- confirm against breakdown_json)
    Returns:
        A list holding one list of texts per 'plain_text' marker, in order of
        appearance. Empty when the input contains no marker.
    """
    # NOTE(review): keywords are matched as substrings, so a text that merely
    # contains e.g. 'header' also ends the current group -- confirm this is wanted.
    keywords = ['level_', 'title_', 'subtitle', 'header']
    doc_content = []
    sublist = None     # group currently being filled; None until the first marker
    filling = False    # defined up front so the very first entry can be inspected
    for content in list_slides:
        if any(keyword in content for keyword in keywords):
            # A level/title/header entry ends the current run of texts.
            filling = False
        elif content == 'plain_text':
            # Marker: close the previous group (if any) and open a new one.
            # Checked before collecting so the marker itself never lands in a group.
            if sublist is not None:
                doc_content.append(sublist)
            sublist = []
            filling = True
        elif filling:
            sublist.append(content)

    # The last group has no following marker to close it, so flush it here.
    if sublist is not None:
        doc_content.append(sublist)
    return doc_content

In [10]:
# Show every extracted group together with the number of entries it contains.
for item in retrieve_content(slide_elements):
    print(len(item), item)

1 ['Lorem ipsum dolor sit amet, consectetur adipiscing elit. Fusce diam ipsum, aliquam sit amet tempor sed, sodales vitae tellus. Fusce vitae lobortis quam. Proin laoreet efficitur ligula, laoreet congue neque condimentum a']
1 ['Lorem ipsum dolor sit amet, consectetur adipiscing elit. Fusce diam ipsum, aliquam sit amet tempor sed.']
2 ['Lorem ipsum dolor sit amet, consectetur adipiscing elit. Fusce diam ipsum, aliquam sit amet tempor sed, sodales vitae tellus.', 'Nulla odio nibh, aliquam sit amet eros a, vehicula eleifend tortor. Sed interdum tellus eu convallis pretium. Proin euismod felis id tortor semper, vel vehicula quam dapibus.']
2 ['Lorem ipsum dolor sit amet, consectetur adipiscing elit. Fusce diam ipsum, aliquam sit amet tempor sed, sodales vitae tellus.', 'Nulla odio nibh, aliquam sit amet eros a, vehicula eleifend tortor.']
1 ['Lorem ipsum dolor sit amet, consectetur adipiscing elit. Fusce diam ipsum, aliquam sit amet tempor sed, sodales vitae tellus.']
1 ['Lorem ipsum dol

In [11]:
def retrieve_content_and_titles(list_slides, l):
    """
    Retrieves titles and plain texts from json file

    Walks the nested structure recursively. Strings that are direct items of a
    list are collected; strings that are direct values of a dict are skipped
    (in the document printed above those are the 'type' tags).

    Parameters:
        - list_slides : nested lists / dicts loaded from the json file
        - l : list the strings are appended to, in document order; it is
              modified in place and nothing is returned
    """
    # isinstance, unlike comparing type objects, also accepts subclasses of
    # list / dict / str.
    if isinstance(list_slides, list):
        for value in list_slides:
            if isinstance(value, str):
                l.append(value)
            else:
                retrieve_content_and_titles(value, l)
    elif isinstance(list_slides, dict):
        for value in list_slides.values():
            retrieve_content_and_titles(value, l)
    # Anything else (including a string met as a dict value) is ignored.

# Collect the strings found by retrieve_content_and_titles into testl (filled in place).
testl= []
retrieve_content_and_titles(json_slides, testl)

In [12]:
# Print the collected strings, each followed by a blank line.
for i in testl:
    print(i,"\n")

Document title 

Lorem ipsum dolor sit amet, consectetur adipiscing elit. Fusce diam ipsum, aliquam sit amet tempor sed, sodales vitae tellus. Fusce vitae lobortis quam. Proin laoreet efficitur ligula, laoreet congue neque condimentum a 

Part 1 

Lorem ipsum dolor sit amet, consectetur adipiscing elit. Fusce diam ipsum, aliquam sit amet tempor sed. 

Part 1-1 

Lorem ipsum dolor sit amet, consectetur adipiscing elit. Fusce diam ipsum, aliquam sit amet tempor sed, sodales vitae tellus. 

Nulla odio nibh, aliquam sit amet eros a, vehicula eleifend tortor. Sed interdum tellus eu convallis pretium. Proin euismod felis id tortor semper, vel vehicula quam dapibus. 

Part 1-2 

Lorem ipsum dolor sit amet, consectetur adipiscing elit. Fusce diam ipsum, aliquam sit amet tempor sed, sodales vitae tellus. 

Nulla odio nibh, aliquam sit amet eros a, vehicula eleifend tortor. 

Part 2 

Part 2-1 

Lorem ipsum dolor sit amet, consectetur adipiscing elit. Fusce diam ipsum, aliquam sit amet tempor se

### Extraction-based summarization

In [13]:
# Short sample text passed to _run_article_summary at the end of the notebook.
article = "Peter and Elizabeth took a taxi to attend the night party in the city. While in the party, Elizabeth collapsed and was rushed to the hospital. Since she was diagnosed with a brain injury, the doctor told Peter to stay besides her until she gets well. Therefore, Peter stayed with her at the hospital for 3 days without leaving."

In [14]:
#importing libraries
# https://blog.floydhub.com/gentle-introduction-to-text-summarization-in-machine-learning/
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize

In [15]:
def _create_dictionary_table(text_string) -> dict:
    """
    Builds the word frequency table of a text.

    The text is split with word_tokenize, English stop words are dropped and
    the remaining tokens are reduced to their stem with PorterStemmer.

    Parameters:
        - text_string : the text to analyse
    Returns:
        dict mapping each stem to the number of times it occurs in the text
    """
    stop_words = set(stopwords.words("english"))

    #reducing words to their root form
    stem = PorterStemmer()

    #creating dictionary for the word frequency table
    frequency_table = dict()
    for token in word_tokenize(text_string):
        # Stop words must be recognised on the token itself: stemming alters
        # some of them (e.g. "was" becomes "wa"), after which they no longer
        # match the stop-word list and would be counted.
        if token.lower() in stop_words:
            continue
        wd = stem.stem(token)
        # Kept from the original: a stem that is itself a stop word is dropped too.
        if wd in stop_words:
            continue
        frequency_table[wd] = frequency_table.get(wd, 0) + 1

    return frequency_table

In [16]:
def _calculate_sentence_scores(sentences, frequency_table) -> dict:   

    #algorithm for scoring a sentence by its words
    sentence_weight = dict()

    for sentence in sentences:
        sentence_wordcount = (len(word_tokenize(sentence)))
        sentence_wordcount_without_stop_words = 0
        for word_weight in frequency_table:
            if word_weight in sentence.lower():
                sentence_wordcount_without_stop_words += 1
                if sentence[:7] in sentence_weight:
                    sentence_weight[sentence[:7]] += frequency_table[word_weight]
                else:
                    sentence_weight[sentence[:7]] = frequency_table[word_weight]

        sentence_weight[sentence[:7]] = sentence_weight[sentence[:7]] / sentence_wordcount_without_stop_words      

    return sentence_weight

In [17]:
def _calculate_average_score(sentence_weight) -> int:
   
    #calculating the average score for the sentences
    sum_values = 0
    for entry in sentence_weight:
        sum_values += sentence_weight[entry]

    #getting sentence average value from source text
    average_score = (sum_values / len(sentence_weight))

    return average_score

In [18]:
def _get_article_summary(sentences, sentence_weight, threshold):
    sentence_counter = 0
    article_summary = ''

    for sentence in sentences:
        if sentence[:7] in sentence_weight and sentence_weight[sentence[:7]] >= (threshold):
            article_summary += " " + sentence
            sentence_counter += 1
            
    return article_summary

In [27]:
def _run_article_summary(article):
    """
    Produces an extractive summary of a text.

    The text is split into sentences, each sentence is scored from the word
    frequency table of the whole text, and the sentences scoring at least the
    average score are kept.

    Parameters:
        - article : the text to summarise
    Returns:
        The summary string as built by _get_article_summary, or an empty
        string when no sentence could be scored (e.g. empty text).
    """
    #creating a dictionary for the word frequency table
    frequency_table = _create_dictionary_table(article)

    #tokenizing the sentences
    sentences = sent_tokenize(article)

    #algorithm for scoring a sentence by its words
    sentence_scores = _calculate_sentence_scores(sentences, frequency_table)

    # Without any score there is no average to use as threshold and nothing
    # to select, so return the empty summary directly.
    if not sentence_scores:
        return ''

    #getting the threshold
    threshold = _calculate_average_score(sentence_scores)

    #producing the summary
    return _get_article_summary(sentences, sentence_scores, threshold)

In [28]:
# Summarize the sample article and print the result.
summary_results = _run_article_summary(article)
print(summary_results)

 While in the party, Elizabeth collapsed and was rushed to the hospital. Therefore, Peter stayed with her at the hospital for 3 days without leaving.
