# Imports 

In [49]:
from cmos import CMOS

import pandas as pd
import numpy as np

import re

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import pairwise_distances

import spacy

from tqdm import tqdm
tqdm.pandas()

import itertools

from rouge_score import rouge_scorer
from bert_score import score
from nltk.translate import bleu_score

import statistics

from typing import List, Tuple, Union, Optional

# Settings and Configuration

In [34]:
# Input dataset (cleaned PoliSum CSV) and output file for the CMOS predictions
POLISUM_PATH = '../../../../data/data_clean/polisum_clean.csv'

RESULTS_PATH = '../../../../results/cmos_results.csv'

# Literal token that separates the individual text segments within one cell
SENT_SPLIT_TOK = '|||'
# Regex form of SENT_SPLIT_TOK for re.split; '|' is a regex metacharacter and
# must be escaped. Written as a raw string so the backslashes are not treated
# as (invalid) string escape sequences.
SENT_RSPLIT_TOK = r'\|\|\|'

# Contrastiveness weight for CMOS model
LAMBDA       = 0.5

# Data Loading 

In [36]:
# Load the cleaned PoliSum dataset (CSV at POLISUM_PATH) into a DataFrame
polisum = pd.read_csv(POLISUM_PATH)

# Data Processing

Contrastive Max-Sum Opinion Summarization (CMOS) filters out adjectives and negation terms before comparing texts for contrastiveness and representativeness. The data is therefore preprocessed in the same way below.

In [38]:
# spaCy English pipeline; supplies the POS tags and dependency labels that
# remove_sal_terms inspects
nlp = spacy.load('en_core_web_sm')
# Pattern matching http/https URLs up to the next whitespace character.
# NOTE(review): not referenced by the preprocessing functions visible in this
# file -- confirm whether URL stripping was intended
url_regex = r'https?:\/\/\S*'

In [39]:
def remove_sal_terms(text: str) -> str:
    '''
        Strip salient terms from a text: every token tagged as an adjective
        and every token carrying a negation dependency label is dropped

        Parameters:
            -text: str
                Raw text to run through the spaCy pipeline and filter

        Return
            The texts of the surviving tokens joined by single spaces
    '''

    def is_salient(token) -> bool:
        # Salient = adjective POS tag or 'neg' dependency label
        return token.pos_ == 'ADJ' or token.dep_ == 'neg'

    kept = [token.text for token in nlp(text) if not is_salient(token)]
    return ' '.join(kept)

def process_text(text: str) -> str:
    '''
        Preprocess a multi-segment text for CMOS: split it on the segment
        separator, drop empty segments, and remove salient terms (adjectives
        and negation terms) from each remaining segment

        NOTE(review): URLs are not removed here even though `url_regex` is
        defined in this file -- confirm whether URL stripping was intended

        Parameters:
            -text: str
                Text whose segments are separated by SENT_SPLIT_TOK

        Return
            The filtered segments re-joined with SENT_SPLIT_TOK
    '''

    segments = re.split(SENT_RSPLIT_TOK, text)

    # Skip segments that are empty or contain only whitespace; comparing
    # against '' and ' ' alone lets e.g. '  ' or '\n' slip through
    non_empty = [segment for segment in segments if segment.strip()]

    # Re-join with the shared separator constant rather than a repeated literal
    return SENT_SPLIT_TOK.join(remove_sal_terms(segment) for segment in non_empty)

In [40]:
# Run the CMOS preprocessing over both text columns ahead of inference
for text_col in ('h1_text', 'h2_text'):
    polisum[text_col] = polisum[text_col].progress_apply(process_text)

100%|████████████████████████████████████████████████████████████████████████████████| 735/735 [02:53<00:00,  4.24it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 735/735 [02:49<00:00,  4.33it/s]


# Evaluation 

In [43]:
def make_prediction(model, row):
    '''
        Row-wise helper for DataFrame.apply: runs the model on the two texts
        of a row and stores the resulting summary pair on that row

        Parameters:
            -model
                Object exposing predict(h1_text, h2_text) that returns two
                summaries
            -row
                Row being processed; read via 'h1_text' / 'h2_text' and
                updated in place

        Return
            The same row with 'cmos_lsum' and 'cmos_rsum' set
    '''
    summaries = model.predict(row['h1_text'], row['h2_text'])
    left_summary, right_summary = summaries

    row['cmos_lsum'] = left_summary
    row['cmos_rsum'] = right_summary
    return row

### Vectorizer Training 

In [44]:
# Raw term-count vectorizer; min_df / max_df prune terms by document frequency
vect = CountVectorizer(min_df = 5, max_df = 0.8)
# TF-IDF weighted counterpart with the same document-frequency cut-offs
# (previously a second, identical CountVectorizer, which made the tf_* variant
# indistinguishable from the count-based one)
tf_vect = TfidfVectorizer(min_df = 5, max_df = 0.8)

In [45]:
# Fit both vectorizers on the same corpus column
sm_corpus = polisum['sm_text']
vect = vect.fit(sm_corpus)
tf_vect = tf_vect.fit(sm_corpus)

In [46]:
# One CMOS model per fitted vectorizer, both constructed with the LAMBDA
# contrastiveness weight
cmos    = CMOS(LAMBDA, vect)
tf_cmos = CMOS(LAMBDA, tf_vect)

In [47]:
# Apply make_prediction to every row (axis = 1) using the `cmos` model.
# NOTE(review): `tf_cmos` is not used for prediction in the visible code
predictions = polisum.progress_apply(lambda row: make_prediction(cmos, row), axis = 1)

100%|███████████████████████████████████████████████████████████████████████████████| 735/735 [00:03<00:00, 198.93it/s]


In [48]:
# Write the predictions to RESULTS_PATH without the row index; `index` is a
# boolean flag, so pass False explicitly instead of relying on None being falsy
predictions.to_csv(RESULTS_PATH, index = False)