## **This program performs the following**: 

1. Places 5 articles into a dataframe
2. Cleans the articles
3. Uses Cosine Similarity to generate a summary for all 5 articles from the "cleaned" data
4. Uses TF-IDF to generate a summary for all 5 "cleaned" articles
5. Places all outputs into one dataframe to easily compare

# **Import Data**

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')

In [2]:
# File names of the five articles to load: Article1.txt, Article2.txt, ..., Article5.txt
articles_list = ['Article{}.txt'.format(article_number) for article_number in range(1, 6)]

In [3]:
# Map row position -> file name. The loading loop iterates over these keys and
# uses each key both to look up the file name and as the dataframe row position.
dictOfWords = dict(enumerate(articles_list))


In [4]:
# Build the working dataframe: one row per article file name.
# 'Content' starts out empty and is populated once the files are read.
import pandas as pd
import numpy as np 
df = pd.DataFrame(articles_list, columns=['File name'])

# Object dtype (not the float64 that a bare np.nan would produce) because the
# column is later filled with article text; writing strings into a float column
# is deprecated in recent pandas releases.
df['Content'] = pd.Series(np.nan, index=df.index, dtype=object)

df

Unnamed: 0,File name,Content
0,Article1.txt,
1,Article2.txt,
2,Article3.txt,
3,Article4.txt,
4,Article5.txt,


In [5]:
# Holds the raw text of each article (one string per file, appended in load order)

raw_article_ls = []

In [6]:
# Read every article from disk, keep its raw text, and store it in df['Content'].

ARTICLE_DIR = '/content/Article_data/'          # folder that holds Article1.txt ... Article5.txt
content_col = df.columns.get_loc('Content')     # resolve the column position once, outside the loop

for i in dictOfWords:
  # `with` guarantees the file handle is closed even if reading fails
  with open(ARTICLE_DIR + dictOfWords[i], "r") as file:
    text = file.read()          # whole file as a single string

  raw_article_ls.append(text)   # keep the raw string for later use
  df.iloc[i, content_col] = text    # load raw text into the dataframe row for this article


In [7]:
df

Unnamed: 0,File name,Content
0,Article1.txt,Wi-fi web reaches farmers in Peru\n\nA network...
1,Article2.txt,Hewitt decries 'career sexism'\n\nPlans to ext...
2,Article3.txt,Digital guru floats sub-$100 PC\n\nNicholas Ne...
3,Article4.txt,Technology gets the creative bug\n\nThe hi-tec...
4,Article5.txt,Solutions to net security fears\n\nFake bank e...


# **Clean Data**

Clean Data - Special Characters

In [8]:
# Replace carriage returns and four-space runs with a single space.
# regex=False states the literal match explicitly; the default value of `regex`
# differs between pandas versions.
df['Content_Parsed_1'] = (
    df['Content']
    .str.replace("\r", " ", regex=False)
    .str.replace("    ", " ", regex=False)
)

Clean Data - Convert uppercase to lowercase

In [9]:
# Lowercase the text so later word matching is case-insensitive
df['Content_Parsed_2'] = df['Content_Parsed_1'].str.lower()

Clean Data - Punctuation 

In [10]:
# Strip a fixed set of punctuation marks. Periods are deliberately not in the
# list: the summarizers below split the cleaned text into sentences on them.
punctuation_signs = list("?:!,;")
df['Content_Parsed_3'] = df['Content_Parsed_2']

for punct_sign in punctuation_signs:
    # regex=False: "?" is a regex metacharacter, so each sign must be matched literally
    df['Content_Parsed_3'] = df['Content_Parsed_3'].str.replace(punct_sign, '', regex=False)


Clean data - Possessive pronouns

In [11]:
# Drop possessive "'s" suffixes; matched literally (regex=False) regardless of pandas version
df['Content_Parsed_4'] = df['Content_Parsed_3'].str.replace("'s", "", regex=False)

Clean Data - Stemming and Lemmatization


In [12]:
nltk.download('punkt')
print("------------------------------------------------------------")
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


------------------------------------------------------------


[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [13]:
# Single lemmatizer instance reused for every word in every article
wordnet_lemmatizer = WordNetLemmatizer()


In [14]:
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [15]:
# Lemmatize every article word by word (treating each word as a verb, pos="v")
# and collect one re-joined string per dataframe row.
nrows = len(df)

lemmatized_text_list = [
    " ".join(
        wordnet_lemmatizer.lemmatize(token, pos="v")
        for token in df.loc[row_label, 'Content_Parsed_4'].split(" ")
    )
    for row_label in range(0, nrows)
]

In [16]:
# Store the lemmatized texts (one per row, in row order) as the next cleaning stage
df['Content_Parsed_5'] = lemmatized_text_list


Clean Data - Stop Words

In [17]:
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [18]:
# English stop words from the NLTK corpus, as a plain list
stop_words = list(stopwords.words('english'))

In [19]:
# Remove stop words one at a time, matching whole words only.
df['Content_Parsed_6'] = df['Content_Parsed_5']

for stop_word in stop_words:

    # \b anchors restrict the match to whole words (so "a" does not hit "apple")
    regex_stopword = r"\b" + stop_word + r"\b"
    # regex=True is required: with literal matching (the default in pandas >= 2.0)
    # the "\b...\b" pattern never matches and no stop word is removed.
    df['Content_Parsed_6'] = df['Content_Parsed_6'].str.replace(regex_stopword, '', regex=True)

  


In [20]:
# Placeholder for the final cleaned text. Object dtype because the column is
# filled with strings; a bare np.nan would create a float64 column.
df['Content_Parsed_7'] = pd.Series(np.nan, index=df.index, dtype=object)

Remove extra spaces left from data cleaning 

In [21]:
import re

# Collapse every run of spaces (left behind by the removals above) into one space.
# Columns are addressed by name instead of positions 7/8, so the step keeps
# working if columns are added or reordered, and the whole column is assigned at
# once so its dtype is the string/object dtype from the start.
_space_runs = re.compile(' +')
df['Content_Parsed_7'] = [_space_runs.sub(' ', str(cell)) for cell in df['Content_Parsed_6']]

Rename fully cleaned column to "Content_Cleaned"

In [22]:
# Keep only the identifying columns plus the last cleaning stage, and expose
# that stage under the name 'Content_Cleaned'.
list_columns = ["File name", "Content","Content_Parsed_7"]
df = df[list_columns].rename(columns={'Content_Parsed_7': 'Content_Cleaned'})

In [23]:
df

Unnamed: 0,File name,Content,Content_Cleaned
0,Article1.txt,Wi-fi web reaches farmers in Peru\n\nA network...,wi-fi web reach farmers peru\n\n network commu...
1,Article2.txt,Hewitt decries 'career sexism'\n\nPlans to ext...,hewitt decry 'career sexism'\n\nplans extend p...
2,Article3.txt,Digital guru floats sub-$100 PC\n\nNicholas Ne...,digital guru float sub-$100 pc\n\nnicholas neg...
3,Article4.txt,Technology gets the creative bug\n\nThe hi-tec...,technology get creative bug\n\n hi-tech arts w...
4,Article5.txt,Solutions to net security fears\n\nFake bank e...,solutions net security fears\n\nfake bank e-ma...


# **Summary w/ Cosine Similarity Method**

In [24]:
# Empty placeholder for the cosine-similarity summaries. Object dtype because
# the column will hold text; a bare np.nan would create a float64 column.
df['Cosine-Similarity Summary'] = pd.Series(np.nan, index=df.index, dtype=object)

In [25]:
df

Unnamed: 0,File name,Content,Content_Cleaned,Cosine-Similarity Summary
0,Article1.txt,Wi-fi web reaches farmers in Peru\n\nA network...,wi-fi web reach farmers peru\n\n network commu...,
1,Article2.txt,Hewitt decries 'career sexism'\n\nPlans to ext...,hewitt decry 'career sexism'\n\nplans extend p...,
2,Article3.txt,Digital guru floats sub-$100 PC\n\nNicholas Ne...,digital guru float sub-$100 pc\n\nnicholas neg...,
3,Article4.txt,Technology gets the creative bug\n\nThe hi-tec...,technology get creative bug\n\n hi-tech arts w...,
4,Article5.txt,Solutions to net security fears\n\nFake bank e...,solutions net security fears\n\nfake bank e-ma...,


In [26]:
# Cleaned article texts, one string per dataframe row (filled in the next cell)
cleaned_article_ls = []

In [27]:
# Copy the cleaned text of every row, in row order, into cleaned_article_ls
cleaned_article_ls.extend(
    df.iloc[row_pos]['Content_Cleaned'] for row_pos in range(len(df))
)

In [28]:
# Collects one cosine-similarity summary string per article, in row order
send_to_df_lis = []

In [29]:
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx

# The helpers are defined once, outside the loop, and receive the article text
# explicitly instead of reading the loop index from the enclosing scope.

def read_article(article_text):
    """Split a cleaned article into sentences, each a list of words.

    Sentences are separated on ". " and words on single spaces. The last
    chunk produced by the split is discarded.

    :param article_text: cleaned article text (a single string)
    :return: list of sentences, each sentence being a list of word strings
    """
    sentences = [
        # NOTE(review): str.replace matches the text "[^a-zA-Z]" literally; it is
        # not a regex, so this is effectively a no-op. Kept as-is so the
        # generated summaries do not change — confirm whether stripping
        # non-letters was intended.
        sentence.replace("[^a-zA-Z]", " ").split(" ")
        for sentence in article_text.split(". ")
    ]
    # Drop the final chunk (the text after the last ". ")
    sentences.pop()

    return sentences


def sentence_similarity(sent1, sent2, stopwords=None):
    """Cosine similarity between the word-count vectors of two sentences.

    :param sent1: first sentence as a list of words
    :param sent2: second sentence as a list of words
    :param stopwords: optional collection of words to ignore when counting
    :return: similarity as a float; 0.0 when either sentence has no counted words
    """
    if stopwords is None:
        stopwords = []

    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]

    # Word -> vector position; a dict lookup replaces the linear list.index() scan
    position = {word: idx for idx, word in enumerate(set(sent1 + sent2))}

    vector1 = [0] * len(position)
    vector2 = [0] * len(position)

    # build the vector for the first sentence
    for w in sent1:
        if w not in stopwords:
            vector1[position[w]] += 1

    # build the vector for the second sentence
    for w in sent2:
        if w not in stopwords:
            vector2[position[w]] += 1

    # A sentence made only of ignored words gives an all-zero vector, for which
    # the cosine distance is 0/0 (NaN). Treat that as "no similarity".
    if not any(vector1) or not any(vector2):
        return 0.0

    return 1 - cosine_distance(vector1, vector2)


def build_similarity_matrix(sentences, stop_words):
    """Pairwise sentence-similarity matrix with a zero diagonal.

    :param sentences: list of sentences (lists of words)
    :param stop_words: collection of words ignored by the similarity measure
    :return: square numpy array of shape (len(sentences), len(sentences))
    """
    stop_words = set(stop_words)   # constant-time membership tests in the inner loops
    similarity_matrix = np.zeros((len(sentences), len(sentences)))

    for idx1 in range(len(sentences)):
        for idx2 in range(len(sentences)):
            if idx1 == idx2:  # a sentence is not compared with itself
                continue
            similarity_matrix[idx1][idx2] = sentence_similarity(sentences[idx1], sentences[idx2], stop_words)

    return similarity_matrix


def generate_summary(article_text, top_n=5):
    """Print and return an extractive summary built from the top-ranked sentences.

    Sentences are ranked with PageRank over the sentence-similarity graph and
    the `top_n` highest-scoring ones are joined with ". ".

    :param article_text: cleaned article text (a single string)
    :param top_n: maximum number of sentences in the summary; fewer are used
                  when the article has fewer sentences
    :return: the summary string
    """
    stop_words = stopwords.words('english')

    # Step 1 - Split the text into sentences
    sentences = read_article(article_text)

    # Step 2 - Build the similarity matrix across sentences
    sentence_similarity_matrix = build_similarity_matrix(sentences, stop_words)

    # Step 3 - Rank sentences using the similarity matrix as a weighted graph
    sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_matrix)
    scores = nx.pagerank(sentence_similarity_graph)

    # Step 4 - Sort by score (highest first) and pick the top sentences.
    # Slicing, unlike indexing, cannot run past the end of a short article.
    ranked_sentence = sorted(((scores[idx], s) for idx, s in enumerate(sentences)), reverse=True)
    summarize_text = [" ".join(words) for _, words in ranked_sentence[:max(top_n, 0)]]

    # Step 5 - Output the summary
    lis_to_str = ". ".join(summarize_text)
    print("Summarize Text: \n", lis_to_str)
    print('---------------------------------------')

    return lis_to_str


# Summarize every cleaned article with its two top-ranked sentences
for i in range(len(df)):
    send_to_df_lis.append(generate_summary(cleaned_article_ls[i], 2))

Summarize Text: 
 agricultural information project farmers chancay-huaral valley also provide vital link local organisations charge water irrigation enable coordinate actions. "throughout last three years people provide vital thrust project feel belong " say mr saldarriaga
---------------------------------------
Summarize Text: 
 " also say childcare job really women suitable men. women full-time work earn 19% less men accord equal opportunities commission (eoc).

 minister tell delegate get rid "career sexism" vital close gender pay gap.

"career sexism limit opportunities women age prevent achieve full potential
---------------------------------------
Summarize Text: 
 say one laptop per child could " important development child whole family village neighbourhood". say child could use laptop like text book
---------------------------------------
Summarize Text: 
 art world "fantastically rich" say mr stone creative people ideas mean traditional company like bt want get . " hop unders

In [30]:
# Store the summaries in their column by name (not by position 3), one per row
# in row order. Assigning the whole column also gives it a text-capable dtype.
df['Cosine-Similarity Summary'] = send_to_df_lis[:len(df)]

In [31]:
df

Unnamed: 0,File name,Content,Content_Cleaned,Cosine-Similarity Summary
0,Article1.txt,Wi-fi web reaches farmers in Peru\n\nA network...,wi-fi web reach farmers peru\n\n network commu...,agricultural information project farmers chanc...
1,Article2.txt,Hewitt decries 'career sexism'\n\nPlans to ext...,hewitt decry 'career sexism'\n\nplans extend p...,""" also say childcare job really women suitable..."
2,Article3.txt,Digital guru floats sub-$100 PC\n\nNicholas Ne...,digital guru float sub-$100 pc\n\nnicholas neg...,"say one laptop per child could "" important dev..."
3,Article4.txt,Technology gets the creative bug\n\nThe hi-tec...,technology get creative bug\n\n hi-tech arts w...,"art world ""fantastically rich"" say mr stone cr..."
4,Article5.txt,Solutions to net security fears\n\nFake bank e...,solutions net security fears\n\nfake bank e-ma...,people become aware online security issue litt...


# **Summary w/ TF-IDF Method**

In [32]:
# Empty placeholder for the TF-IDF summaries. Object dtype because the column
# will hold text; a bare np.nan would create a float64 column.
df['Tf-IDF Summary'] = pd.Series(np.nan, index=df.index, dtype=object)

In [33]:
import math

from nltk import sent_tokenize, word_tokenize, PorterStemmer
from nltk.corpus import stopwords    

In [34]:
def _create_frequency_matrix(sentences):
    """Count stemmed, non-stop-word tokens per sentence.

    :param sentences: list of sentence strings
    :return: dict mapping the first 15 characters of each sentence to a
             {stemmed_word: count} table
    """
    frequency_matrix = {}
    stopWords = set(stopwords.words("english"))
    ps = PorterStemmer()

    for sent in sentences:
        freq_table = {}
        for token in word_tokenize(sent):
            token = token.lower()
            # Filter on the unstemmed word first: stemming rewrites many stop
            # words (e.g. "this" -> "thi", "was" -> "wa"), which would let them
            # slip past a check done only after stemming.
            if token in stopWords:
                continue

            stem = ps.stem(token)
            # Still skip stems that are themselves stop words
            if stem in stopWords:
                continue

            freq_table[stem] = freq_table.get(stem, 0) + 1

        # NOTE(review): sentences that share their first 15 characters overwrite
        # each other here; _generate_summary looks sentences up by the same key.
        frequency_matrix[sent[:15]] = freq_table

    return frequency_matrix

In [35]:
def _create_tf_matrix(freq_matrix):
    tf_matrix = {}

    for sent, f_table in freq_matrix.items():
        tf_table = {}

        count_words_in_sentence = len(f_table)
        for word, count in f_table.items():
            tf_table[word] = count / count_words_in_sentence

        tf_matrix[sent] = tf_table

    return tf_matrix

In [36]:
def _create_documents_per_words(freq_matrix):
    word_per_doc_table = {}

    for sent, f_table in freq_matrix.items():
        for word, count in f_table.items():
            if word in word_per_doc_table:
                word_per_doc_table[word] += 1
            else:
                word_per_doc_table[word] = 1

    return word_per_doc_table

In [37]:
def _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents):
    idf_matrix = {}

    for sent, f_table in freq_matrix.items():
        idf_table = {}

        for word in f_table.keys():
            idf_table[word] = math.log10(total_documents / float(count_doc_per_words[word]))

        idf_matrix[sent] = idf_table

    return idf_matrix

In [38]:
def _create_tf_idf_matrix(tf_matrix, idf_matrix):
    tf_idf_matrix = {}

    for (sent1, f_table1), (sent2, f_table2) in zip(tf_matrix.items(), idf_matrix.items()):

        tf_idf_table = {}

        for (word1, value1), (word2, value2) in zip(f_table1.items(),
                                                    f_table2.items()):  # here, keys are the same in both the table
            tf_idf_table[word1] = float(value1 * value2)

        tf_idf_matrix[sent1] = tf_idf_table

    return tf_idf_matrix

In [39]:
def _score_sentences(tf_idf_matrix) -> dict:
    """
    score a sentence by its word's TF
    Basic algorithm: adding the TF frequency of every non-stop word in a sentence divided by total no of words in a sentence.
    :rtype: dict
    """

    sentenceValue = {}

    for sent, f_table in tf_idf_matrix.items():
        total_score_per_sentence = 0

        count_words_in_sentence = len(f_table)
        for word, score in f_table.items():
            total_score_per_sentence += score

        sentenceValue[sent] = total_score_per_sentence / count_words_in_sentence

    return sentenceValue

In [40]:
def _find_average_score(sentenceValue) -> int:
    """
    Find the average score from the sentence value dictionary
    :rtype: int
    """
    sumValues = 0
    for entry in sentenceValue:
        sumValues += sentenceValue[entry]

    # Average value of a sentence from original summary_text
    average = (sumValues / len(sentenceValue))

    return average

In [41]:
def _generate_summary(sentences, sentenceValue, threshold):
    sentence_count = 0
    summary = ''

    for sentence in sentences:
        if sentence[:15] in sentenceValue and sentenceValue[sentence[:15]] >= (threshold):
            summary += " " + sentence
            sentence_count += 1

    return summary

In [42]:
import math

from nltk import sent_tokenize, word_tokenize, PorterStemmer
from nltk.corpus import stopwords 

import nltk
nltk.download('punkt')
nltk.download('stopwords')
    

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [43]:
# Collects one TF-IDF summary string per article, in row order
clean_sent_sum = []

In [None]:

# Build one TF-IDF summary per cleaned article and collect it in clean_sent_sum.
# Within an article every sentence is treated as a "document" for the IDF step.
for i in range(len(df)):

  # 1 Split the cleaned article into sentences; their count is the document total for IDF
  sentences = sent_tokenize(cleaned_article_ls[i])
  total_documents = len(sentences)

  # 2 Word counts per sentence (keyed by the first 15 characters of the sentence)
  freq_matrix = _create_frequency_matrix(sentences)

  # 3 Term frequency of each word within its sentence
  tf_matrix = _create_tf_matrix(freq_matrix)

  # 4 For each word, the number of sentences that contain it
  count_doc_per_words = _create_documents_per_words(freq_matrix)

  # 5 Inverse document frequency of each word
  idf_matrix = _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents)

  # 6 TF-IDF: product of the two matrices
  tf_idf_matrix = _create_tf_idf_matrix(tf_matrix, idf_matrix)

  # 7 Score each sentence by the average TF-IDF value of its words
  sentence_scores = _score_sentences(tf_idf_matrix)

  # 8 The average sentence score is the base threshold
  threshold = _find_average_score(sentence_scores)

  # 9 Keep only sentences scoring at least 1.3x the average
  summary = _generate_summary(sentences, sentence_scores, 1.3 * threshold)

  clean_sent_sum.append(summary)

In [46]:
# Store the summaries in their column by name (not by position 4), one per row
# in row order. Assigning the whole column also gives it a text-capable dtype.
df['Tf-IDF Summary'] = clean_sent_sum[:len(df)]

In [47]:
df

Unnamed: 0,File name,Content,Content_Cleaned,Cosine-Similarity Summary,Tf-IDF Summary
0,Article1.txt,Wi-fi web reaches farmers in Peru\n\nA network...,wi-fi web reach farmers peru\n\n network commu...,agricultural information project farmers chanc...,network three years make officially inaugurat...
1,Article2.txt,Hewitt decries 'career sexism'\n\nPlans to ext...,hewitt decry 'career sexism'\n\nplans extend p...,""" also say childcare job really women suitable...",look quickly obviously cost implications taxp...
2,Article3.txt,Digital guru floats sub-$100 PC\n\nNicholas Ne...,digital guru float sub-$100 pc\n\nnicholas neg...,"say one laptop per child could "" important dev...",device probably export kit part assemble loca...
3,Article4.txt,Technology gets the creative bug\n\nThe hi-tec...,technology get creative bug\n\n hi-tech arts w...,"art world ""fantastically rich"" say mr stone cr...",vice versa. industry grow 6% year . put art g...
4,Article5.txt,Solutions to net security fears\n\nFake bank e...,solutions net security fears\n\nfake bank e-ma...,people become aware online security issue litt...,trust online security fall result. worry shop...
