In [1]:
import pandas as pd
import numpy as np
import nltk
# nltk.download('punkt') # one time execution
import re
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
import math
from tqdm import tqdm
from rouge import Rouge
from sklearn.feature_extraction.text import TfidfVectorizer

## Function Initializations

In [2]:
## Extract Word Vectors ##

def create_word_embeddings(embedding_path):
    """Load whitespace-separated word vectors from a text file into a dict.

    Every non-blank line is split on whitespace; the first field is used as
    the word and the remaining fields are parsed as its coefficients.

    Parameters
    ----------
    embedding_path : str or path-like
        Location of the UTF-8 encoded embedding file.

    Returns
    -------
    dict
        Maps each word to a ``float32`` NumPy array of its coefficients.

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If a coefficient field cannot be parsed as a float.
    """
    word_embedding_dict = {}

    # The context manager closes the file even when a malformed line raises.
    with open(embedding_path, encoding='utf-8') as embedding_file:
        for line in embedding_file:
            values = line.split()
            if not values:
                # Blank line (e.g. a trailing empty line): nothing to parse.
                continue
            word_embedding_dict[values[0]] = np.asarray(values[1:], dtype='float32')

    print("Word embeddings succesfully extracted!")

    return word_embedding_dict

# word_embeddings = create_word_embeddings('glove.6b/glove.6B.100d.txt')

## Remove stopwords from sentence ##

def remove_stopwords(sen):
    """Return the tokens of ``sen`` that are not stop words, joined by spaces.

    ``sen`` is an iterable of tokens. The global ``stop_words`` collection is
    looked up when the function is called, so it must exist by then.
    """
    kept_tokens = []
    for token in sen:
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

In [3]:
def pre_process_single_doc(doc):
    """Split a post into sentences and build a cleaned copy of each one.

    Parameters
    ----------
    doc : mapping or DataFrame
        ``doc['documents']`` is iterated and every item is passed to
        ``sent_tokenize``.

    Returns
    -------
    tuple (list of str, list of str)
        ``sentences`` are the raw sentences in document order.
        ``clean_sentences`` has the same length and order; each entry is
        lower-cased, reduced to letters and digits, and stripped of stop
        words via ``remove_stopwords``.
    """
    # Tokenize every document and flatten the per-document sentence lists.
    sentences = [sentence for s in doc['documents'] for sentence in sent_tokenize(s)]

    # Raw string: the pattern relies on regex escapes such as \w and \/ that
    # are invalid *string* escapes and trigger warnings in a normal literal.
    url_pattern = r'(http|ftp|https):\/\/([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:\/~+#-]*[\w@?^=%&\/~+#-])'

    # Newlines (real and escaped) and the quote/"b" marks that the code
    # strips from every sentence; applied in this exact order.
    markers = ('\n', '"b', '\\n', "'b", 'b"', "b'")

    clean_sentences = []
    for sentence in sentences:
        cleaned = sentence
        for marker in markers:
            cleaned = cleaned.replace(marker, '')

        # Replace URLs with a placeholder token.
        cleaned = re.sub(url_pattern, 'URL', cleaned)

        # Replace the subreddit indicator.
        cleaned = cleaned.replace('r/', 'subreddit ')

        # Collapse every run of characters that are not letters or digits
        # into one space (digits are kept).
        cleaned = re.sub(r"[^a-zA-Z0-9]+", " ", cleaned)

        # Lowercase, then drop stop words token by token.
        cleaned = cleaned.lower()
        clean_sentences.append(remove_stopwords(cleaned.split()))

    return (sentences, clean_sentences)
    
# sentence_list, pre_processed_sentences = pre_process_single_doc(sample_post)
# print(pre_processed_sentences)

In [4]:
## Get Sentence Vectors ##

def create_sentence_vectors(preprocessed_doc, embedding_dim=100):
    """Turn each pre-processed sentence into one averaged word vector.

    Word vectors are looked up in the global ``word_embeddings`` dict at call
    time; words missing from it contribute a zero vector.

    Parameters
    ----------
    preprocessed_doc : iterable of str
        Sentences whose words are separated by whitespace.
    embedding_dim : int, optional
        Length of the zero vector used for unknown words and for sentences
        without any token (default 100). It should match the length of the
        vectors stored in ``word_embeddings``.

    Returns
    -------
    list of numpy.ndarray
        One vector per input sentence, in input order.
    """
    sentence_vectors = []

    for sentence in preprocessed_doc:
        tokens = sentence.split()
        if tokens:
            word_vectors = [word_embeddings.get(token, np.zeros((embedding_dim,))) for token in tokens]
            # The +0.001 in the denominator is kept from the original formula.
            vector = sum(word_vectors) / (len(tokens) + 0.001)
        else:
            # Covers empty AND whitespace-only sentences; testing only the
            # string length would let the latter produce a scalar 0.0.
            vector = np.zeros((embedding_dim,))
        sentence_vectors.append(vector)

    return sentence_vectors

# sentence_vector = create_sentence_vectors(pre_processed_sentences)

In [5]:
## Get longest common substring ## 

def calculate_lcs(string1, string2):
    """Return the length of the longest common (contiguous) substring.

    Parameters
    ----------
    string1, string2 : sequence
        Two sequences (typically str) that support ``len`` and iteration.

    Returns
    -------
    int
        Length of the longest run of consecutive elements that occurs in
        both inputs; 0 when either input is empty or nothing is shared.
    """
    best = 0
    width = len(string2) + 1

    # Dynamic programme over one rolling row: current_row[j] holds the length
    # of the common run that ends at the current element of string1 and at
    # string2[j - 1]. This needs len(string1) * len(string2) comparisons
    # instead of re-scanning every run from each starting pair.
    previous_row = [0] * width
    for item1 in string1:
        current_row = [0] * width
        for j, item2 in enumerate(string2, start=1):
            if item1 == item2:
                run_length = previous_row[j - 1] + 1
                current_row[j] = run_length
                if run_length > best:
                    best = run_length
        previous_row = current_row

    return best

In [6]:
## Create Similarity Matrix based on Cosine Similarity ##

def create_similarity_matrix_cosine_vector(sen_vec,sen_list):
    """Build the pairwise cosine-similarity matrix of the sentence vectors.

    Parameters
    ----------
    sen_vec : sequence of array-like
        One vector per sentence; all vectors must have the same length.
    sen_list : sequence
        The sentences; only its length is used, to size the matrix.

    Returns
    -------
    numpy.ndarray
        Square float matrix of shape ``(len(sen_list), len(sen_list))`` with
        a zero diagonal (a sentence is not compared with itself).
    """
    n_sentences = len(sen_list)
    if n_sentences == 0:
        return np.zeros([0, 0])

    # Stack the vectors as rows. reshape(1, -1) accepts any embedding size
    # instead of assuming 100 dimensions.
    vectors = np.vstack([np.asarray(sen_vec[i]).reshape(1, -1) for i in range(n_sentences)])

    # One batched call computes every pair at once instead of n*n calls
    # on single pairs.
    sim_mat = np.array(cosine_similarity(vectors), dtype=float)
    np.fill_diagonal(sim_mat, 0)

    return sim_mat
    
# similarity_matrix = create_similarity_matrix_cosine_vector(sentence_vector,sentence_list)

In [7]:
## Create Similarity Matrix based on TF-IDF Similarity ##

def create_similarity_matrix_TFIDF(sen_list):
    """Build a sentence-by-sentence similarity matrix from TF-IDF vectors.

    The sentences are vectorized with ``TfidfVectorizer`` (English stop
    words, ``min_df=1``), the TF-IDF matrix is multiplied by its own
    transpose, and the diagonal is zeroed so that no sentence is similar
    to itself.

    Returns a dense square NumPy array with one row/column per sentence.
    """
    vectorizer = TfidfVectorizer(min_df=1, stop_words="english")
    tfidf_matrix = vectorizer.fit_transform(sen_list)

    # Product of the TF-IDF matrix with its transpose, made dense.
    similarity = (tfidf_matrix * tfidf_matrix.T).toarray()
    np.fill_diagonal(similarity, 0)

    return similarity

In [8]:
## Create Similarity Matrix based on LCS Similarity ##

def create_similarity_matrix_lcs(sen_list):
    """Build a similarity matrix from longest-common-substring lengths.

    Parameters
    ----------
    sen_list : sequence of str
        Sentences to compare pairwise with ``calculate_lcs``.

    Returns
    -------
    numpy.ndarray
        Square float matrix of shape ``(len(sen_list), len(sen_list))``;
        entry ``[i][j]`` is the LCS length of sentences ``i`` and ``j`` and
        the diagonal is zero.
    """
    n_sentences = len(sen_list)
    sim_mat = np.zeros([n_sentences, n_sentences])

    for i in range(n_sentences):
        # The measure is symmetric, so each unordered pair is scored once
        # and mirrored; this halves the number of LCS computations.
        for j in range(i + 1, n_sentences):
            score = calculate_lcs(sen_list[i], sen_list[j])
            sim_mat[i][j] = score
            sim_mat[j][i] = score

    return sim_mat
    
# similarity_matrix = create_similarity_matrix_lcs(pre_processed_sentences)

In [9]:
## Generate Graph from Similarity Matrix ##

def create_graph_and_rank(sim_mat,sen_list,plot_graph=False):
    """Rank sentences by running PageRank on the similarity graph.

    Parameters
    ----------
    sim_mat : numpy.ndarray
        Square similarity matrix, used as the adjacency matrix of the graph.
    sen_list : sequence of str
        Sentences; position ``i`` corresponds to graph node ``i``.
    plot_graph : bool, optional
        When true, the graph is drawn with ``nx.draw``.

    Returns
    -------
    list of (score, sentence)
        Sorted in descending order (ties on score fall back to comparing
        the sentence text, as tuples are compared element by element).
    """
    # Create graph
    nx_graph = nx.from_numpy_array(sim_mat)

    # ``pagerank_numpy`` was removed in NetworkX 3.0. Keep using it where it
    # still exists (unchanged results on old installs) and fall back to
    # ``nx.pagerank`` otherwise.
    pagerank = getattr(nx, "pagerank_numpy", None) or nx.pagerank
    scores = pagerank(nx_graph)

    if plot_graph:
        # Plot the graph
        nx.draw(nx_graph)

    # Pair every sentence with its node score, best first.
    return sorted(((scores[i], s) for i, s in enumerate(sen_list)), reverse=True)

# ranking_scores = create_graph_and_rank(similarity_matrix,sentence_list)

In [10]:
## Extract top sentences as the summary ##

def generate_summary(sen_ranks, sen_list, summary_ratio=0.2):
    """Build an extractive summary from the best-ranked sentences.

    Parameters
    ----------
    sen_ranks : sequence of (score, sentence)
        Ranked sentences, best first (the output of ``create_graph_and_rank``).
    sen_list : list of str
        The sentences in document order.
    summary_ratio : float, optional
        Fraction of the sentences to keep (default 0.2). The count is
        rounded down and is never smaller than one.

    Returns
    -------
    str
        The selected sentences, cleaned and joined by spaces in document
        order; an empty string when there is nothing to summarize.

    Raises
    ------
    ValueError
        If a ranked sentence does not occur in ``sen_list``.
    """
    if len(sen_list) == 0 or len(sen_ranks) == 0:
        return ""

    # The previous math.ceil(int(0.2 * n)) truncated before the ceil, so it
    # effectively rounded down; that behavior is kept here, stated plainly.
    sum_size = max(1, int(summary_ratio * len(sen_list)))
    sum_size = min(sum_size, len(sen_ranks))

    # Map each selected sentence to its first position in the document.
    # Identical sentences collapse into a single dict entry.
    positions = {}
    for _, sentence in sen_ranks[:sum_size]:
        positions[sentence] = sen_list.index(sentence)

    # Restore document order.
    selected = sorted(positions, key=positions.get)

    # Raw string: the pattern relies on regex escapes such as \w and \/ that
    # are invalid *string* escapes and trigger warnings in a normal literal.
    url_pattern = r'(http|ftp|https):\/\/([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:\/~+#-]*[\w@?^=%&\/~+#-])'

    # Newlines (real and escaped) and quote/"b" marks, removed in this order.
    markers = ('\n', '"b', '\\n', "'b", 'b"', "b'")

    clean_sentences = []
    for sentence in selected:
        cleaned = sentence
        for marker in markers:
            cleaned = cleaned.replace(marker, '')

        # Replace URLs with a placeholder token.
        cleaned = re.sub(url_pattern, 'URL', cleaned)

        # Replace the subreddit indicator.
        cleaned = cleaned.replace('r/', 'subreddit ')

        # Collapse runs of special characters into one space; letters,
        # digits and the characters % ' , . - are kept.
        cleaned = re.sub(r"[^a-zA-Z0-9%',.-]+", " ", cleaned)

        clean_sentences.append(cleaned)

    return " ".join(clean_sentences)

# summary = generate_summary(ranking_scores,sentence_list)

## Reading in the data

In [11]:
# Load the posts from CSV; the file's "Unnamed: 0" column becomes the row index.
tifu_dataset = pd.read_csv("out/out.csv",index_col="Unnamed: 0")

In [12]:
# Show the columns, non-null counts and dtypes of the loaded frame.
tifu_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 42136 entries, 0 to 42135
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   documents     42136 non-null  object 
 1   num_comments  42136 non-null  int64  
 2   score         42136 non-null  int64  
 3   title         42136 non-null  object 
 4   tldr          42136 non-null  object 
 5   ups           42136 non-null  int64  
 6   upvote_ratio  42136 non-null  float64
dtypes: float64(1), int64(3), object(3)
memory usage: 2.6+ MB


In [13]:
# Preview the first rows of the dataset.
tifu_dataset.head()

Unnamed: 0,documents,num_comments,score,title,tldr,ups,upvote_ratio
0,"b""me and a friend decided to go to the beach l...",1,8,b'liking seafood',b'had delicious seafood. almost flooded a toil...,8,0.76
1,b'obligatory this happened last thursday. \n\n...,4,0,b'joking about being a whore',b'made a joke about being a whore in class. ma...,0,0.5
2,b'this was actually a few years ago (obligator...,24,25,b'burning my finger with molten starburst',b'i burned my finger with a microwaved starbur...,25,0.81
3,"b'this is my first post on reddit, so i\'m sor...",23,152,b'skipping my heart meds. nsfw',"b'a girl tried to give me a banana, smashed a ...",152,0.9
4,"b""unlike most tifu this actually just happened...",10,21,b'overcoming my fears',"b'noticed a spider above my bed,decided to ove...",21,0.84


## Running the Pipeline

In [14]:
# Execute once: both objects are read as globals by the helper functions
# (create_sentence_vectors uses word_embeddings, remove_stopwords uses stop_words).

word_embeddings = create_word_embeddings('glove.6b/glove.6B.100d.txt')

# A set gives constant-time membership tests; remove_stopwords checks every
# token of every sentence against this collection.
stop_words = set(stopwords.words('english'))

Word embeddings succesfully extracted!


In [15]:
def pipeline_textrank_summarization(doc,sim_measure):
    """Summarize one document with TextRank using the chosen similarity.

    Parameters
    ----------
    doc : mapping or DataFrame
        Passed unchanged to ``pre_process_single_doc``.
    sim_measure : str
        One of ``"cosine"``, ``"lcs"`` or ``"tfidf"``.

    Returns
    -------
    str
        The summary produced by ``generate_summary``.

    Raises
    ------
    ValueError
        If ``sim_measure`` is not one of the supported names.
    """
    # Validate first so an unknown name fails with a clear message instead
    # of an unbound-variable error at the return statement.
    if sim_measure not in ("cosine", "lcs", "tfidf"):
        raise ValueError(
            "sim_measure must be 'cosine', 'lcs' or 'tfidf', got %r" % (sim_measure,)
        )

    # Pre-processing is the same for every similarity measure.
    sentence_list, pre_processed_sentences = pre_process_single_doc(doc)

    # Only the construction of the similarity matrix differs.
    if sim_measure == "cosine":
        sentence_vector = create_sentence_vectors(pre_processed_sentences)
        similarity_matrix = create_similarity_matrix_cosine_vector(sentence_vector,sentence_list)
    elif sim_measure == "lcs":
        similarity_matrix = create_similarity_matrix_lcs(pre_processed_sentences)
    else:
        similarity_matrix = create_similarity_matrix_TFIDF(pre_processed_sentences)

    ranking_scores = create_graph_and_rank(similarity_matrix,sentence_list)

    return generate_summary(ranking_scores,sentence_list)

In [16]:
# Select one document: a one-row slice of the dataset, used by the demo cells.
sample_index = 2459
sample_post = tifu_dataset[sample_index:sample_index + 1]

# Look up the post body and its TL;DR by index label.
text = sample_post['documents'][sample_index]
tldr = sample_post['tldr'][sample_index]

print("_____________TEXT______________")
print(text)

print("_____________TLDR______________")
print(tldr)

# 2019 has faulty TLDR

_____________TEXT______________
b'throwaway for obvious reasons. \n\nthis was back when i was in my early teens, 13-15 i don\'t remember exactly. anyways, i was at sleep-away camp for two weeks and it was one of my first experiences away from home for an extended period of time, and for some reason i have a really bad time with taking a shit when i\'m not at a toilet i\'m familiar with (like the one at my house). maybe it was also due to the fact that campground toilets are gross, and outhouses even more so. i don\'t know. anyways i didn\'t take a shit for at least eight days, maybe more. \n\nat this point the struggle is so real. i\'m starting to have the "i really need to take a shit" feeling but somehow i\'m keeping it in. another day passes. i have somehow still managed to keep the shit inside me. my stomach and entire abdominal area is starting to look a little swollen, and it actually hurts to lie on my stomach. still managing to hold it in, god knows how. \n\nin the afternoon, t

In [17]:
# Summarize the sample post once per similarity measure.
summary_lcs = pipeline_textrank_summarization(sample_post, "lcs")
summary_cosine = pipeline_textrank_summarization(sample_post, "cosine")
summary_tfidf = pipeline_textrank_summarization(sample_post, "tfidf")

# Print each summary under its own banner, in the order LCS, cosine, TFIDF.
for method_label, method_summary in (
    ("LCS", summary_lcs),
    ("cosine", summary_cosine),
    ("TFIDF", summary_tfidf),
):
    print("_______ " + method_label + " method _______")
    print(method_summary)

_______ LCS method _______
anyways, i was at sleep-away camp for two weeks and it was one of my first experiences away from home for an extended period of time, and for some reason i have a really bad time with taking a shit when i 'm not at a toilet i 'm familiar with like the one at my house . anyways i didn 't take a shit for at least eight days, maybe more. i 'm starting to have the i really need to take a shit feeling but somehow i 'm keeping it in. i feign being sick, partly because i actually do feel like shit no pun intended  and also because i feel like intense walking does not mix well with a shit-packed stomach. maybe it 's partly psychological - my brain knows i 'm alone now - but i don 't know why...suddenly i really need to take a shit. he was also the type of kid who was probably the most likely to so something weird like take a shit in the woods behind the sleeping area...well, you see where this is going.
_______ cosine method _______
anyways, i was at sleep-away camp 

In [18]:
# Scorer instance; it is reused by the dataset-wide scoring cells further down.
rouge = Rouge()

# Score the TF-IDF summary of the sample post against that post's TL;DR.
rouge.get_scores(summary_tfidf, tldr)

[{'rouge-1': {'r': 0.3, 'p': 0.028037383177570093, 'f': 0.05128204971875233},
  'rouge-2': {'r': 0.1111111111111111,
   'p': 0.005847953216374269,
   'f': 0.011111110161111193},
  'rouge-l': {'r': 0.2, 'p': 0.018691588785046728, 'f': 0.03418803262473526}}]

In [19]:
summary_list_LCS = []
summary_list_Cosine = []
summary_list_TFIDF = []

# (similarity measure, output list) pairs, processed in this order per post.
summary_targets = (
    ("cosine", summary_list_Cosine),
    ("lcs", summary_list_LCS),
    ("tfidf", summary_list_TFIDF),
)

# Iterate by row position: the slice below is positional, so it must not be
# driven by index labels (those only coincide with positions for a 0..n-1 index).
for position in tqdm(range(len(tifu_dataset)), total=tifu_dataset.shape[0]):
    curr_post = tifu_dataset[position:position + 1] # Select one document

    for sim_measure, summary_list in summary_targets:
        try:
            summary_list.append(pipeline_textrank_summarization(curr_post, sim_measure))
        except Exception:
            # Best effort: keep the lists aligned with the dataset rows by
            # storing a placeholder for posts that could not be summarized.
            summary_list.append("error")

100%|█████████████████████████████████████████████████████████████████████████| 42136/42136 [12:22:06<00:00,  1.06s/it]


In [20]:
# Reference summaries: the TL;DR of every post, in dataset row order.
reference_list = list(tifu_dataset['tldr'])

In [21]:
# Attach the generated summaries as new columns (this modifies tifu_dataset in place).
tifu_dataset["LCS"] = summary_list_LCS
tifu_dataset["COSINE"] = summary_list_Cosine
tifu_dataset["TFIDF"] = summary_list_TFIDF

In [22]:
# Averaged ROUGE scores of the LCS summaries against the TL;DR references.
# Posts whose pipeline failed are scored with the literal placeholder "error".
rouge.get_scores(summary_list_LCS, reference_list,avg=True)

{'rouge-1': {'r': 0.34185330774196,
  'p': 0.10462973284345654,
  'f': 0.14846601011048802},
 'rouge-2': {'r': 0.0684301096125939,
  'p': 0.017297407796005713,
  'f': 0.025020455502450897},
 'rouge-l': {'r': 0.2943874568887026,
  'p': 0.08777490071442022,
  'f': 0.12528979208646296}}

In [23]:
# Averaged ROUGE scores of the cosine summaries against the TL;DR references.
# Posts whose pipeline failed are scored with the literal placeholder "error".
rouge.get_scores(summary_list_Cosine, reference_list,avg=True)

{'rouge-1': {'r': 0.3293588628008475,
  'p': 0.09986659789070577,
  'f': 0.14249724711983686},
 'rouge-2': {'r': 0.06021175277383512,
  'p': 0.015065330239948773,
  'f': 0.021950238359046435},
 'rouge-l': {'r': 0.28269857729130543,
  'p': 0.08352918051864501,
  'f': 0.11987308456461475}}

In [24]:
# Averaged ROUGE scores of the TF-IDF summaries against the TL;DR references.
# Posts whose pipeline failed are scored with the literal placeholder "error".
rouge.get_scores(summary_list_TFIDF, reference_list,avg=True)

{'rouge-1': {'r': 0.31396949702276566,
  'p': 0.1170709075381415,
  'f': 0.15657601097597862},
 'rouge-2': {'r': 0.06168557674975509,
  'p': 0.019239124805426392,
  'f': 0.026236436843573904},
 'rouge-l': {'r': 0.2695735901166072,
  'p': 0.09824747553032066,
  'f': 0.13210622867212768}}

In [25]:
# Write the dataset, including the three summary columns, to a zip archive
# that contains a single CSV file named TextRankOutput.csv.
compression_opts = dict(method='zip', archive_name='TextRankOutput.csv')  # Create a compression method to efficiently export data
tifu_dataset.to_csv('TextRankOutput.zip', index=True, compression=compression_opts) # Export the data

In [27]:
# Re-inspect the frame to confirm the LCS, COSINE and TFIDF columns were added.
tifu_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 42136 entries, 0 to 42135
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   documents     42136 non-null  object 
 1   num_comments  42136 non-null  int64  
 2   score         42136 non-null  int64  
 3   title         42136 non-null  object 
 4   tldr          42136 non-null  object 
 5   ups           42136 non-null  int64  
 6   upvote_ratio  42136 non-null  float64
 7   LCS           42136 non-null  object 
 8   COSINE        42136 non-null  object 
 9   TFIDF         42136 non-null  object 
dtypes: float64(1), int64(3), object(6)
memory usage: 3.5+ MB
