Jacob Duvall

# the_summarizer

In [70]:
import os
import random
import glob
from sklearn.metrics import silhouette_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cluster import KMeans
import json
import pickle
import nltk
import numpy as np
import pandas as pd
import networkx
import os


# look at the format of the json schema
def look_at_the_format_of_the_file(file):
    """Print the entire contents of a text file to stdout.

    Args:
        file: Path of the file to display.
    """
    # The context manager guarantees the handle is closed after reading.
    with open(file, 'r') as handle:
        print(handle.read())


# Takes a list of folders (e.g. the result of glob) and returns a random sample, sized as a percent, of the file names they contain
def choose_documents(folders, percent):
    """Return a random sample of the file names found in ``folders``.

    Args:
        folders: Iterable of directory paths to list.
        percent: Share of all listed names to keep, expressed as a
            percentage (e.g. ``10`` keeps 10 percent).

    Returns:
        A list of bare file names (no directory part) chosen without
        replacement; its length is the percentage of the total, truncated
        to an integer.
    """
    candidates = [name for folder in folders for name in os.listdir(folder)]
    sample_size = int(len(candidates) * (percent * (10 ** -2)))
    return random.sample(candidates, sample_size)


# Takes a list of file names, loads each JSON file, and joins its body_text paragraphs into one string per document
def files_reader(files_list, directory=None):
    """Load the body text of each JSON document as one string.

    Args:
        files_list: Iterable of bare JSON file names.
        directory: Glob pattern (ending in a path separator) that is
            prefixed to each file name to locate it. Defaults to the
            original hard-coded pdf_json location.

    Returns:
        A list with one string per file, formed by concatenating the
        ``'text'`` field of every entry in the document's ``'body_text'``.

    Raises:
        IndexError: If no file matches ``directory + file``.
    """
    if directory is None:
        directory = 'C:\\Users\\jdale\\OneDrive\\School\\Text Analytics\\' \
                    '*\\*\\pdf_json\\'
    documents = []
    for file in files_list:
        matches = glob.glob(directory + file)
        if not matches:
            # Same exception type the bare ``[0]`` lookup raised, but with
            # enough context to see which file was missing.
            raise IndexError('no file matches {!r}'.format(directory + file))
        # JSON is UTF-8 by specification; do not rely on the platform default.
        with open(matches[0], encoding='utf-8') as file_json:
            data = json.load(file_json)
        # join() avoids the quadratic cost of repeated string concatenation.
        documents.append(''.join(line['text'] for line in data['body_text']))
    return documents


# use TfidfVectorizer on the documents to get the TF-IDF matrix
def tfid_vectorize(doc):
    """Fit a TF-IDF vectorizer on ``doc`` and return it with its matrix.

    The resulting matrix is also pickled to ``yummy_pickle.pkl`` in the
    current working directory, overwriting any previous file.

    Args:
        doc: Iterable of strings passed to ``TfidfVectorizer.fit_transform``.

    Returns:
        A ``(vectorizer, matrix)`` tuple.
    """
    tv = TfidfVectorizer(min_df=0., max_df=1., use_idf=True)
    tv_matrix = tv.fit_transform(doc)
    # Close the pickle file deterministically instead of leaking the handle.
    with open('yummy_pickle.pkl', 'wb') as pickle_file:
        pickle.dump(tv_matrix, pickle_file)
    return tv, tv_matrix


# use CountVectorizer on the documents to get the term-count matrix
def count_vectorize(doc):
    """Fit a count vectorizer on ``doc`` and return it with its matrix.

    The resulting matrix is also pickled to ``yummy_pickle_cv.pkl`` in the
    current working directory, overwriting any previous file.

    Args:
        doc: Iterable of strings passed to ``CountVectorizer.fit_transform``.

    Returns:
        A ``(vectorizer, matrix)`` tuple.
    """
    cv = CountVectorizer()
    cv_matrix = cv.fit_transform(doc)
    # Close the pickle file deterministically instead of leaking the handle.
    with open('yummy_pickle_cv.pkl', 'wb') as pickle_file:
        pickle.dump(cv_matrix, pickle_file)
    return cv, cv_matrix


# Takes raw document strings, count-vectorizes them and clusters them with KMeans for 2 to 7 clusters -- uses the Silhouette Coefficient to measure cluster quality.
# Pickles the cluster label assigned to each document for every cluster count tried
def cluster_documents(doc_list):
    """Cluster documents with KMeans for 2 through 7 clusters.

    The documents are vectorized with ``count_vectorize``. For every
    cluster count the predicted labels are pickled to
    ``pickle_km_<n>.pkl`` and the silhouette score is printed.

    Args:
        doc_list: Iterable of document strings.

    Returns:
        A ``(vectorizer, matrix)`` tuple as returned by ``count_vectorize``.
    """
    cv, tokenized_files = count_vectorize(doc_list)
    for n_clusters in range(2, 8):
        km = KMeans(n_clusters=n_clusters)
        km_predicts = km.fit_predict(tokenized_files)
        pickle_save = 'pickle_km_' + str(n_clusters) + '.pkl'
        # Close each pickle file deterministically instead of leaking it.
        with open(pickle_save, 'wb') as pickle_file:
            pickle.dump(km_predicts, pickle_file)

        score = silhouette_score(tokenized_files, km_predicts)
        print("Number of clusters: {}, Silhouette Score: {}".format(n_clusters, score))
    return cv, tokenized_files


# fits the best cluster size and shows the feature names 
def best_cluster(cv, tokenized_files, size):
    """Fit KMeans with ``size`` clusters and print each cluster's top terms.

    For every cluster, the ten features with the largest centroid weights
    are printed in descending order of weight.

    Args:
        cv: Fitted vectorizer that produced ``tokenized_files``.
        tokenized_files: Document-term matrix to cluster.
        size: Number of clusters to fit.
    """
    km = KMeans(n_clusters=size).fit(tokenized_files)
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back for older installations.
    if hasattr(cv, 'get_feature_names_out'):
        feature_names = cv.get_feature_names_out()
    else:
        feature_names = cv.get_feature_names()
    # Column indices of every centroid, sorted from heaviest to lightest.
    ordered_centroids = km.cluster_centers_.argsort()[:, ::-1]
    # Iterate over every fitted cluster rather than a fixed two, so size 1
    # no longer indexes out of range and larger sizes print all clusters.
    for cluster_num in range(size):
        print('CLUSTER #' + str(cluster_num + 1))
        # str() keeps the printed list plain when names are numpy strings.
        feature_list = [str(feature_names[i])
                        for i in ordered_centroids[cluster_num, :10]]
        print(feature_list)


# summarize the documents looking at their top 8 sentences through TextRank
def summarize_documents(string_list, num_sentences=8):
    """Write a TextRank summary of each document to the summary file.

    Each document is split into sentences, the sentences are TF-IDF
    vectorized, a sentence-similarity graph is built from the matrix
    product of the TF-IDF matrix with its transpose, and PageRank scores
    the sentences. The top-scoring sentences are written, in their
    original order, via ``write_summary_to_file``.

    Documents with fewer than ``num_sentences`` sentences, and documents
    whose matrices do not fit in memory, are skipped.

    Args:
        string_list: Iterable of document strings.
        num_sentences: Number of sentences kept per summary (default 8).
    """
    count = 0
    for string in string_list:
        sentences = nltk.sent_tokenize(string)
        # Too short to summarize; a plain guard replaces raise-and-catch.
        if len(sentences) < num_sentences:
            continue
        _, tv_matrix = tfid_vectorize(sentences)
        try:
            # Both the dense conversion and the product allocate large
            # arrays, so both are guarded.
            tv_matrix = tv_matrix.toarray()
            similarity_matrix = np.matmul(tv_matrix, tv_matrix.T)
        except (MemoryError, ValueError):
            # ValueError covers numpy's "array is too big" refusal.
            print('too large')
            continue
        try:
            similarity_graph = networkx.from_numpy_array(similarity_matrix)
        except MemoryError:
            print('memory error 1')
            continue
        try:
            scores = networkx.pagerank(similarity_graph)
        except MemoryError:
            print('memory error')
            continue

        # Highest score first; ties fall back to the larger sentence index.
        ranked_sentences = sorted(((score, index) for index, score in scores.items()), reverse=True)
        # Restore document order so the summary reads naturally.
        top_sentenc_indices = sorted(
            index for _, index in ranked_sentences[:num_sentences])

        write_summary_to_file(count, np.array(sentences)[top_sentenc_indices])
        count += 1


# write the summaries generated from summarize_documents() to file
def write_summary_to_file(count, summary_array, path="SUMMARY.md"):
    """Append one summary to the summary file, starting it fresh if needed.

    When ``count`` is less than 1 the file is (re)created and an
    explanatory header is written first. The summary is then appended as
    ``str(summary_array)`` followed by a blank line.

    Args:
        count: Zero-based index of the summary being written.
        summary_array: Object whose ``str()`` form is written to the file.
        path: Destination file (default ``"SUMMARY.md"`` in the current
            working directory).
    """
    if count < 1:
        header = 'This file was generated using TextRank summarization. The process taken was to extract ' \
                 'a random percent sampling of pdf_json files from ' \
                 'https://www.kaggle.com/allen-institute-for-ai/CORD-19-research-challenge. With these ' \
                 'files extracted, I then opened all files, tokenized the sentences of the ' \
                 'files using tfid vectorizer, ' \
                 'and then applied a TextRank algorithm to the tokenized docs. This TextRank allowed me to then ' \
                 'extract the 8 most useful summarized sentences. And voila!'
        # Mode "w" truncates any previous file, so no separate delete (and
        # no blanket exception swallowing around it) is needed.
        with open(path, "w", encoding='utf-8') as file:
            file.write(header)
            file.write('\n\n')

    with open(path, "a", encoding='utf-8') as file:
        file.write(str(summary_array))
        file.write('\n\n')

# main

## Look at the format of the file

In [39]:
# Print the schema file to see how the full-text JSON documents are laid out.
look_at_the_format_of_the_file('C:\\Users\\jdale\\OneDrive\\School\\Text Analytics\\json_schema.txt')

# JSON schema of full text documents


{
    "paper_id": <str>,                      # 40-character sha1 of the PDF
    "metadata": {
        "title": <str>,
        "authors": [                        # list of author dicts, in order
            {
                "first": <str>,
                "middle": <list of str>,
                "last": <str>,
                "suffix": <str>,
                "affiliation": <dict>,
                "email": <str>
            },
            ...
        ],
        "abstract": [                       # list of paragraphs in the abstract
            {
                "text": <str>,
                "cite_spans": [             # list of character indices of inline citations
                                            # e.g. citation "[7]" occurs at positions 151-154 in "text"
                                            #      linked to bibliography entry BIBREF3
                    {
                        "start": 151,
                        "end": 1

## Choose documents

In [45]:
# Sample 10 percent of the file names found in every folder matching the
# pdf_json glob pattern.
sampling_of_files = choose_documents(glob.glob(
    'C:\\Users\\jdale\\OneDrive\\School\\Text Analytics\\*\\*\\pdf_json'), 10)
import pprint
# Show the first ten sampled file names.
pprint.pprint(sampling_of_files[:10])

['4c159330655513984fac5ab8e0c575512d33b514.json',
 'eb265b07935811bc052549f3780c9af843ecf45e.json',
 '8da224b5eba13cee28c3a95cb3c1e3c4d26e5a3a.json',
 '5e5908b9f7ad23dd332a32e1dfa411c4b0e88f59.json',
 'ddb7090314e89cfc001186101093a15896ba3b43.json',
 'b142d0d3130f98fe1bc2cec14d62de4362c80534.json',
 'd3d1b82a318250a2e7193878c11a4e14817b07eb.json',
 '49b80f692b4d3e2ebbd59454e10c29d522d8c2f7.json',
 '9f5625d9287d8c6729215a20add99c8cd4ba08cb.json',
 '0953fa36903063f60627e07f7b4e07f0aec3c4d3.json']


## Write a files reader

In [48]:
# Load the body text of every sampled file as one string per document.
string_of_files_list = files_reader(sampling_of_files)
# Show the first two documents.
pprint.pprint(string_of_files_list[:2])

['The global assurance of safe drinking water and basic sanitation has been '
 'recognised as a United Nations Millennium Development Goal 1 , particularly '
 'in light of the pressures of rising urbanisation, agricultural '
 'intensification and climate change 2,3 . These trends enforce an increasing '
 'demand for freshwater monitoring frameworks that combine cost effectiveness, '
 'fast technology deployability and data transparency 4 . Environmental '
 'metagenomics, the tracing of organisms present in a substrate through '
 'high-throughput DNA sequencing, yields informative measures of relative '
 'taxonomic species occurrence and functional diversity 5 . Microbial '
 'metagenomics studies overcome enrichment biases common to traditional '
 'culturing approaches 5 ; however, they usually depend on expensive and '
 'stationary equipment, highly specialised operational training and '
 'substantial time lags between fieldwork, sample preparation, raw data '
 'generation and access.I

## Cluster documents

In [53]:
# Cluster the documents for 2 through 7 clusters, printing the silhouette
# score of each; keep the fitted vectorizer and matrix for the next step.
cv, tokenized_files = cluster_documents(string_of_files_list)

Number of clusters: 2, Silhouette Score: 0.9270759361330172
Number of clusters: 3, Silhouette Score: 0.9207116157180787
Number of clusters: 4, Silhouette Score: 0.4672383758046355
Number of clusters: 5, Silhouette Score: 0.45250612718556243
Number of clusters: 6, Silhouette Score: 0.4552221519281297
Number of clusters: 7, Silhouette Score: 0.35165373560844365


In [54]:
# Refit with 2 clusters and print the top terms of each cluster.
best_cluster(cv, tokenized_files, 2)

CLUSTER #1
['the', 'of', 'and', 'in', 'to', 'is', 'with', 'di', 'or', 'for']
CLUSTER #2
['the', 'of', 'and', 'in', 'to', 'with', 'for', 'is', 'that', 'was']


## Summarize documents
## Write summaries to file

In [None]:
# Summarize every loaded document; summaries are written to SUMMARY.md.
summarize_documents(string_of_files_list)

In [72]:
# Preview the first 20 lines of the generated summary file.
from itertools import islice

# The file is written as UTF-8, so it is read back with the same encoding
# rather than the platform default. The unused, never-closed extra handle
# from the earlier version of this cell is gone.
with open("C:\\Users\\jdale\\OneDrive\\School\\Text Analytics\\project_2\\SUMMARY.md",
          encoding='utf-8') as f:
    # islice simply stops at end-of-file, whereas next(f) raised
    # StopIteration when the file had fewer than 20 lines.
    head = list(islice(f, 20))
for i in head:
    print(i)

This file was generated using TextRank summarization. The process taken was to extract a random percent sampling of pdf_json files from https://www.kaggle.com/allen-institute-for-ai/CORD-19-research-challenge. With these files extracted, I then opened all files, tokenized the sentences of the files using tfid vectorizer, and then applied a TextRank algorithm to the tokenized docs. This TextRank allowed me to then extract the 8 most useful summarized sentences. And voila!



['The protein expressed from the IBV S1 protein multi-epitope cassette and the NDV NP protein were found at 35 and 53 kDa bands in P1 and P20 of rNDV-IBV-T/B lysates, however, the parental virus strain LaSota only had the~53 kDa NP protein band (Figure 2) , confirming that rNDV-IBV-T/B successfully expressed the IBV S1 protein multi-epitope cassette in vitro and that the protein was antigenic, as evidenced by its recognition on a Western blot by anti-IBV serum.'

 'To determine whether inserting the IBV S1 protein m