# Imports 

In [69]:
import pandas as pd
import numpy as np

from lexrank import STOPWORDS, LexRank
from sklearn.metrics import pairwise_distances
from sklearn.feature_extraction.text import CountVectorizer

from tqdm import tqdm
tqdm.pandas()

# Settings and Configuration 

In [4]:
# Input: CSV file that is loaded into the `polisum` dataframe with pd.read_csv
POLISUM_PATH = '../../../../data/data_clean/polisum_clean.csv'

# Output: CSV file that the rows with LexRankCOS predictions are written to
RESULTS_PATH = '../../../../results/lexrank_results.csv'

# Data Loading 

In [8]:
# Read the dataset at POLISUM_PATH into a dataframe; later cells use its 'sm_text' column
polisum = pd.read_csv(POLISUM_PATH)

# Model 

In [61]:
# Used https://pypi.org/project/lexrank/
class LexRankCOS():
    '''
        A simple baseline using LexRank and BOW cosine distance to select contrastive sentence summaries.
        
        A pair of sentences is scored by adding a LexRank representativeness score to the
        cosine distance between the bag-of-words vectors of the two sentences; the
        highest-scoring pair is returned.
    '''
    
    def __init__(self):
        '''
            Constructor for LexRankCOS model
        '''
        
        # LexRank model, created in fit()
        self.lexrank = None
        
        # Recent scikit-learn releases validate `stop_words` and only accept a list
        # (or 'english'/None), so the stopword collection is converted to a list.
        self.vect    = CountVectorizer(stop_words = sorted(STOPWORDS['en']))
    
    def fit(self, documents):
        '''
            Fit LexRank and vectorizer to the input set of documents
            
            Parameters:
                documents
                    Corpus to fit on. Either a flat list of sentence strings (each
                    sentence is then treated as its own one-sentence document) or a
                    list of documents where each document is a list of sentence strings.
        '''
        
        # LexRank expects an iterable of documents, each being an iterable of sentences.
        # A bare string would be iterated character by character when the IDF table is
        # built, so every string is wrapped into a one-sentence document first.
        lexrank_docs = [[doc] if isinstance(doc, str) else list(doc) for doc in documents]
        
        # The vectorizer is fitted on the individual sentences
        flat_sents   = [sent for doc in lexrank_docs for sent in doc]
        
        self.lexrank = LexRank(lexrank_docs, stopwords=STOPWORDS['en'])
        self.vect    = self.vect.fit(flat_sents)
    
    def predict(self, src_sents):
        '''
            Make a prediction of two contrasting summaries
            
            Parameters:
                -src_sents
                    Full set of candidate sentences to select from
                    
            Return
                Predicted pair of summaries
                
            Raises
                RuntimeError if the model has not been fitted
                ValueError if src_sents is empty
        '''
        
        if self.lexrank is None:
            raise RuntimeError('LexRankCOS.fit must be called before predict')
        if len(src_sents) == 0:
            raise ValueError('src_sents must contain at least one sentence')
        
        # Calculate LexRank representativeness scores
        sent_scores = np.asarray(self.lexrank.rank_sentences(src_sents))
        
        # Every row is a copy of the score vector, so entry (i, j) holds the score of sentence j.
        # NOTE(review): the score of sentence i is therefore not part of the pair score -
        # confirm whether this asymmetry is intended.
        sent_scores = np.repeat(sent_scores[None, ...], sent_scores.shape[0], axis = 0)
        
        # Transform sentences into BOWS and calculate cosine distances.
        # The metric has to be given explicitly: pairwise_distances defaults to euclidean.
        sent_bows   = self.vect.transform(src_sents)
        sent_diffs  = pairwise_distances(sent_bows, metric = 'cosine')
        
        # Sum distance and LexRank scores for a comprehensive score
        cos_scores = sent_scores + (sent_diffs + sent_diffs.transpose())/2
        
        # Exclude the diagonal so a sentence is never paired with itself when another
        # candidate exists (with a single candidate that sentence is returned twice)
        np.fill_diagonal(cos_scores, -np.inf)
        
        # Select the two summaries with the maximum scores
        sum1_idx, sum2_idx = np.unravel_index(cos_scores.argmax(), cos_scores.shape)
        
        return src_sents[sum1_idx], src_sents[sum2_idx]

# Predictions 

In [65]:
# Place all sentences in PoliSum into a list for fitting LexRank and a vectorizer.
# The separator is the literal '|||'; the pattern is a raw string so the escaped
# pipes reach the regex engine without invalid-escape warnings from Python.
all_sentences = [
    sent
    for sent_list in polisum['sm_text'].str.split(r'\|\|\|').values
    for sent in sent_list
]

In [67]:
# Instantiate the LexRankCOS baseline and fit it on all_sentences,
# the list of sentences collected from polisum['sm_text']
lexrank = LexRankCOS()
lexrank.fit(all_sentences)

In [68]:
def get_lr_preds(model, sm_text):
    '''
        Predict one pair of contrasting summaries from a '|||'-separated string
        
        Parameters:
            -model
                Fitted LexRankCOS Model (anything exposing predict(list_of_sentences))
            -sm_text
                Candidate sentences joined by '|||'
                
        Return
            Tuple holding the two summaries chosen by the model
    '''
    
    # Break the joined text into candidates and let the model pick its pair
    first_sum, second_sum = model.predict(sm_text.split('|||'))
    return first_sum, second_sum

def get_lr_preds_row(model, row):
    '''
        Attach LexRankCOS predictions to a single dataframe row
        
        Parameters:
            -model
                Fitted LexRankCOS Model
            -row:
                Row being processed (for use with apply/progress_apply); its
                'sm_text' entry supplies the candidate sentences
                
        Return:
            The same row object, with 'lexrank_lsum' and 'lexrank_rsum' set
    '''
    
    summary_pair = get_lr_preds(model, row['sm_text'])
    
    # Unpacking validates the pair before either entry is written to the row
    row['lexrank_lsum'], row['lexrank_rsum'] = summary_pair
    return row

In [71]:
# Apply the fitted model to every row (axis = 1) with a tqdm progress bar;
# each resulting row carries the 'lexrank_lsum' and 'lexrank_rsum' predictions
lr_preds = polisum.progress_apply(lambda row: get_lr_preds_row(lexrank, row), axis = 1)

100%|████████████████████████████████████████████████████████████████████████████████| 735/735 [03:09<00:00,  3.88it/s]


In [89]:
# Write the rows together with their predictions to RESULTS_PATH
lr_preds.to_csv(RESULTS_PATH)