In [238]:
import pandas as pd
import numpy as np

import re

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import pairwise_distances

import spacy

from tqdm import tqdm
tqdm.pandas()

from nltk.sentiment.vader import SentimentIntensityAnalyzer

import itertools


from rouge_score import rouge_scorer
from bert_score import score

# Data Loading 

In [5]:
DATA_SOURCE = '../../data/data_clean/polisumm_final.csv'

In [6]:
data = pd.read_csv(DATA_SOURCE)

In [29]:
data.iloc[0]['tweet_text']



# Filter Out Adjectives 

In [64]:
nlp = spacy.load('en_core_web_sm')
url_regex = r'https?:\/\/\S*'

In [283]:
def remove_sal_terms(text):
    doc = nlp(text)
    new_doc = []
    for token in doc:
        if token.pos_ != 'ADJ' and token.dep_ != 'neg':
            new_doc.append(token.text)
    
    return ' '.join(new_doc)

def process_text(text):
    text = re.sub(url_regex, '', str(text))
    text_l = re.split(punct_regex, text)
    text_l = [t for t in text_l if t not in ('', ' ')]
    
    new_text_l = []
    
    for single_text in text_l:
        new_text_l.append(remove_sal_terms(single_text))
    
    return ' | '.join(new_text_l)

In [None]:
data['tweet_text_proc'] = data['tweet_text'].progress_apply(process_text)
data['reddit_text_proc'] = data['reddit_text'].progress_apply(process_text)

  2%|█▎                                                                            | 20/1199 [02:52<4:27:27, 13.61s/it]

In [None]:
data.to_csv('../../data/data_clean/polisumm_final_cmos.csv', index = None)

# Sentiment for Sorting

In [None]:
punct_regex = '\s*[\.\?\!\|\n\:]\s*[\.\?\!\|\n\:]*\s*'

In [None]:
sid = SentimentIntensityAnalyzer()

In [None]:
def get_sentiments(text):
    text = re.sub(url_regex, '', str(text))
    text_l = re.split(punct_regex, text)
    text_l = [t for t in text_l if t != '']
    
    sent_scores = []
    for sent in text_l:
        sentiments = sid.polarity_scores(sent)
        sent_scores.append(sentiments)
    
    return sent_scores

In [None]:
data['tweet_sent_scores'] = data['tweet_text'].progress_apply(get_sentiments)
data['reddit_sent_scores'] = data['reddit_text'].progress_apply(get_sentiments)

In [None]:
data.to_csv('../../data/data_clean/polisumm_final_cmos.csv', index = None)

# Combine 

In [None]:
def combine_sents(row):
    sent_tw = row['tweet_sent_scores']
    sent_rd = row['reddit_sent_scores']
    return (sent_tw + sent_rd)

In [None]:
# data['all_texts'] = data['tweet_text'] + ' ' + data['reddit_text']
data['all_texts_proc'] = data['tweet_text_proc'].fillna('') + ' ' + data['reddit_text_proc'].fillna('')
# data['all_sent_scores'] = data.progress_apply(combine_sents, axis = 1)

In [None]:
data['tweet_text_proc']

In [None]:
data['all_texts_proc']

# Vectorizer Training 

In [271]:
vect = CountVectorizer(min_df = 5, max_df = 0.8)
tf_vect = CountVectorizer(min_df = 5, max_df = 0.8)

In [272]:
vect = vect.fit(data['all_texts_proc'])
tf_vect = tf_vect.fit(data['all_texts_proc'])

# CMOS Model 

In [273]:
class CMOS():
    
    def __init__(self, lambda_w, vectorizer):
        self.lambda_w = lambda_w
        self.vectorizer = vectorizer
        self.PUNCT_REGEX = '\s*[\.\?\!\|\n\:]\s*[\.\?\!\|\n\:]*\s*'
    
    def predict(self, text, sentiments):
        '''
            Text should not contain adjectives or negations
        '''
        
        text_l = self.split_text(text) if isinstance(text, str) else text
        
        sent_scores = [sent['compound'] for sent in sentiments]
        
        assert len(text_l) == len(sent_scores), 'Text and sentiment scores are of different lengths'
        
        text_sents = [(stext, ssent) for stext, ssent in zip(text_l, sent_scores)]
        text_sents.sort(key = lambda pair: pair[1])
        
        half1 = [t[0] for t in text_sents[:len(text_sents)//2]]
        half2 = [t[0] for t in text_sents[len(text_sents)//2:]]
        
        half1_bows = self.get_vecs(half1)
        half2_bows = self.get_vecs(half2)
        
        half1_cs = self.calc_cos_sims(half1_bows)
        half2_cs = self.calc_cos_sims(half2_bows)
        sum_cs   = self.combine_cos_sims(half1_cs, half2_cs)
        
        dist_cs  = self.calc_cos_dists(half1_bows, half2_bows)
        
        all_scores    = sum_cs + 2 * self.lambda_w * dist_cs
        h1_idx, h2_idx = np.unravel_index(all_scores.argmax(), all_scores.shape)
        
        return half1[h1_idx], half2[h2_idx]
    
    def calc_cos_sims(self, bows):
        sims = 1. - pairwise_distances(bows, metric = 'cosine')
        sims = sims.mean(-1)
        return sims
    
    def combine_cos_sims(self, cs1s, cs2s):
        combos = itertools.product(cs1s, cs2s)
        combos = [a + b for (a, b) in combos]
        combos = np.array(combos).reshape(len(cs1s), len(cs2s))
        return combos
    
    def calc_cos_dists(self, bows1, bows2):
        dists = pairwise_distances(bows1, bows2, metric = 'cosine')
        return dists
    
    def get_vecs(self, text_list):
        return self.vectorizer.transform(text_list)
    
    def split_text(self, text):
        text_l = re.split(self.PUNCT_REGEX, text)
        text_l = [t for t in text_l if t != '']
        return text_l

# Evaluation 

In [274]:
cmos    = CMOS(0.5, vect)
tf_cmos = CMOS(0.5, tf_vect)

In [275]:
refs = data['left_sum'] + ' | ' + data['right_sum']

In [279]:
def make_prediction(model, row):
    text = row['all_texts_proc']
    sent_scores = row['all_sent_scores']
    
    summ_1, summ_2 = model.predict(text, sent_scores)
    return summ_1 + ' | ' + summ_2

In [280]:
predictions = data.progress_apply(lambda row: make_prediction(cmos, row), axis = 1)

  0%|                                                                                | 1/1199 [00:00<00:08, 133.56it/s]






AssertionError: Text and sentiment scores are of different lengths