In [1]:
import pandas as pd
import numpy as np

import re

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import pairwise_distances

import spacy

from tqdm import tqdm
tqdm.pandas()

import itertools

from rouge_score import rouge_scorer
from bert_score import score
from nltk.translate import bleu_score

import statistics

# Data Loading 

In [28]:
POLISUM_PATH = '../../../../data/data_clean/polisum_clean.csv'

RESULTS_PATH = '../../../../results/cmos_results.csv'

SENT_SPLIT_TOK = '|||'

LAMBDA       = 0.5

In [11]:
polisum = pd.read_csv(POLISUM_PATH)

# Filter Out Adjectives 

In [12]:
nlp = spacy.load('en_core_web_sm')
url_regex = r'https?:\/\/\S*'

In [16]:
def remove_sal_terms(text):
    doc = nlp(text)
    new_doc = []
    for token in doc:
        if token.pos_ != 'ADJ' and token.dep_ != 'neg':
            new_doc.append(token.text)
    
    return ' '.join(new_doc)

def process_text(text):
    text_l = re.split(SENT_SPLIT_TOK, text)
    text_l = [t for t in text_l if t not in ('', ' ')]
    
    new_text_l = []
    
    for single_text in text_l:
        new_text_l.append(remove_sal_terms(single_text))
    
    return '|||'.join(new_text_l)

In [17]:
polisum['h1_text'] = polisum['h1_text'].progress_apply(process_text)
polisum['h2_text'] = polisum['h2_text'].progress_apply(process_text)

100%|██████████████████████████████████████████████████████████████████████████████| 977/977 [3:04:13<00:00, 11.31s/it]
100%|██████████████████████████████████████████████████████████████████████████████| 977/977 [2:51:11<00:00, 10.51s/it]


# CMOS Model 

In [22]:
class CMOS():
    
    def __init__(self, lambda_w, vectorizer):
        self.lambda_w = lambda_w
        self.vectorizer = vectorizer
        self.SENT_SPLIT_TOK = '\|\|\|'
    
    def predict(self, l_text, r_text):
        '''
            Predict the representative, contrastive summaries using Contrastive Max-Sum Opinion Summarization.
            Text should not contain adjectives or negations
        '''
        
        half1 = self.split_text(l_text) if isinstance(l_text, str) else l_text
        half2 = self.split_text(r_text) if isinstance(r_text, str) else r_text
        
        half1_bows = self.get_vecs(half1)
        half2_bows = self.get_vecs(half2)
        
        half1_cs = self.calc_cos_sims(half1_bows)
        half2_cs = self.calc_cos_sims(half2_bows)
        sum_cs   = self.combine_cos_sims(half1_cs, half2_cs)
        
        dist_cs  = self.calc_cos_dists(half1_bows, half2_bows)
        
        all_scores    = sum_cs + 2 * self.lambda_w * dist_cs
        h1_idx, h2_idx = np.unravel_index(all_scores.argmax(), all_scores.shape)
        
        return half1[h1_idx], half2[h2_idx]
    
    def calc_cos_sims(self, bows):
        sims = 1. - pairwise_distances(bows, metric = 'cosine')
        sims = sims.mean(-1)
        return sims
    
    def combine_cos_sims(self, cs1s, cs2s):
        combos = itertools.product(cs1s, cs2s)
        combos = [a + b for (a, b) in combos]
        combos = np.array(combos).reshape(len(cs1s), len(cs2s))
        return combos
    
    def calc_cos_dists(self, bows1, bows2):
        dists = pairwise_distances(bows1, bows2, metric = 'cosine')
        return dists
    
    def get_vecs(self, text_list):
        return self.vectorizer.transform(text_list)
    
    def split_text(self, text):
        text_l = re.split(self.SENT_SPLIT_TOK, text)
        text_l = [t for t in text_l if t != '']
        return text_l

# Evaluation 

In [23]:
def make_prediction(model, row):
    h1_text, h2_text = row['h1_text'], row['h2_text']
    
    summ_1, summ_2 = model.predict(h1_text, h2_text)
    
    row['cmos_lsum'] = summ_1
    row['cmos_rsum'] = summ_2
    
    return row

### Vectorizer Training 

In [24]:
vect = CountVectorizer(min_df = 5, max_df = 0.8)
tf_vect = CountVectorizer(min_df = 5, max_df = 0.8)

In [26]:
vect = vect.fit(polisum['sm_text'])
tf_vect = tf_vect.fit(polisum['sm_text'])

In [29]:
cmos    = CMOS(LAMBDA, vect)
tf_cmos = CMOS(LAMBDA, tf_vect)

In [31]:
predictions = polisum.progress_apply(lambda row: make_prediction(cmos, row), axis = 1)

100%|████████████████████████████████████████████████████████████████████████████████| 977/977 [32:56<00:00,  2.02s/it]


In [32]:
predictions.to_csv(RESULTS_PATH, index = None)