In [1]:
import pandas as pd
from nltk.tokenize import sent_tokenize
import time

import Schedule_pipeline_functions as SP

## First filter and structured representation

Here the dataset is loaded and for each sentence we search for the matches according to the patterns "schedule_noun" and "delay_verbs". The output is the structured representation of the matched sentences and it is saved on a csv file.

In [2]:
# load data in chunks from the sample of 100.000 comments
comments_chunks = pd.read_csv('data/sample_of_comments.csv.gz', 
                                chunksize=10000, compression='gzip')
# list of responses (to be transformed in DataFrame)
results = []
n=0 # comments count
n_with_keywords = 0

t0 = time.time()
t_tot = 0

for chunk in comments_chunks:
    
    n += chunk.shape[0]
    
    if n%100000 == 0:
        print('Comments processed: ', n)
        print('Time (min) : ', (time.time() - t0) / 60)
        t_tot += time.time() - t0
        t0 = time.time()
        
    # get useful info and drop comments with no text
    chunk = chunk[['body', 'comment_author', 'thread_id', 'comment_date', 'c_id']].dropna(subset=['body'])
    
    # loop over comments
    for idx, row in chunk.iterrows():
        tx, c_id, comment_author, thread_id, comment_date = row.body, row.c_id, row.comment_author, row.thread_id, row.comment_date
        
        # has one of the keywords?
        n_with_keywords += 0 if ("schedule" not in tx and 'spac' not in tx and 'dela' not in tx and 'split' not in tx) else 1
        # sent tokenizer for comments
        for sent in sent_tokenize(tx):
    
            # apply first filter
            idxs_keywords = SP.first_filter_keywords_syntactic(sent)
            # obtain structured representation
            representation = SP.Structured_representation(idxs_keywords)
            representation = [SP.translate_response(i, c_id, comment_author, thread_id, comment_date) 
                                                                                      for i in representation]

            results.extend(representation)
            
            
results = pd.DataFrame.from_dict(results)

# assign an order to the columns of the df
columns_ordered = ['sent', 'c_id', 'comment_author', 'thread_id', 'comment_date',
                   'pattern_matched', 'text_short', 'negations',
                   'amod_subj_xcomp_lemma', 'amod_subj_xcomp_lower', 
                   'compound_subj_xcomp_lemma', 'compound_subj_xcomp_lower',
                   'pos_subj_xcomp_lemma', 'pos_subj_xcomp_lower',
                   'subject_xcomp_lemma', 'subject_xcomp_lower',
                   'verb_xcomp_lemma', 'verb_xcomp_lower', 'verb_phrase_xcomp', 'verb_tense_xcomp',
                   'amod_subj_lemma', 'amod_subj_lower',
                   'compound_subj_lemma', 'compound_subj_lower',
                   'pos_subj_lemma', 'pos_subj_lower',
                   'subject_lemma', 'subject_lower', 'subject_active',
                   'verb_lemma', 'verb_lower', 'verb_phrase', 'verb_tense',
                   'dobj_amod_lemma', 'dobj_amod_lower',
                   'compound_dobj_lemma', 'compound_dobj_lower',
                   'pos_dobj_lemma', 'pos_dobj_lower',
                   'dobj_lemma', 'dobj_lower']

results = results[columns_ordered]
results.to_csv("results/vaccination_schedule/Structured_representation_schedule.csv")
           
print()
print("Total time (min) : ", t_tot/60)
print("Number of matches : ", results.shape[0])
print()
print('Number of comments containing at least one of the keword : ', n_with_keywords)
print("Number of unique comments retrieved : ", results.groupby(['c_id', 'comment_author', 
                                                           'thread_id', 'comment_date']).apply(lambda rows: 1).sum())
print()
patterns_matched = results.pattern_matched.value_counts()
print('Number of matches with "schedule_noun" pattern : ', patterns_matched['schedule_noun'])
print('Number of matches with "delay_verbs" pattern : ', patterns_matched['delay_verbs'])


Comments processed:  100000
Time (min) :  2.1434847752253217

Total time (min) :  2.143485426902771
Number of matches :  6954

Number of comments containing at least one of the keword :  7902
Number of unique comments retrieved :  5417

Number of matches with "schedule_noun" pattern :  3884
Number of matches with "delay_verbs" pattern :  3070


In [3]:
# show an example of structured representation of a match
example = results.sample().iloc[0]
print('Sentence matched : ', example.sent)
example

Sentence matched :  i avoided vaccines and once dear daughter was born we put her on a delayed schedule.


Unnamed: 0                                                                 136
sent                         i avoided vaccines and once dear daughter was ...
c_id                                                               c2540129100
comment_author                                                littlelivismomma
thread_id                                                            a64086145
comment_date                                                        09/11/2016
pattern_matched                                                  schedule_noun
text_short                                    we put her on a delayed schedule
negations                                                                    1
amod_subj_xcomp_lemma                                                      NaN
amod_subj_xcomp_lower                                                      NaN
compound_subj_xcomp_lemma                                                  NaN
compound_subj_xcomp_lower                           

## FILTER & CLASSIFIER

Now apply the filter to the matched sentences to identify schedule-related comments. The label 'FILTERED_OUT' is assgned to the unrelated matches and the classifier is applied to the related ones.

In [4]:
# here the new column "FILTER" corresponds to the final classification (+1 and -1) or "FILTERED_OUT"
results.loc[:, 'FILTER'] = results.apply(lambda row: SP.Filter(row), axis=1)

# remove filtered out matches
results_filtered_out = results[results.FILTER=='FILTERED_OUT']
results = results[results.FILTER!='FILTERED_OUT']

n_reg_modif = results.FILTER.value_counts()
patterns_matched = results.pattern_matched.value_counts()

print('Number of filtered out matches : ', results_filtered_out.shape[0])
print('Number of filtered in matches : ', results.shape[0])
print('Number of matches filtered in "schedule_noun" pattern : ', patterns_matched['schedule_noun'])
print('Number of matches filtered in "delay_verbs" pattern : ', patterns_matched['delay_verbs'])
print()
print('Number of matches labeled as "recommended" : ', n_reg_modif[1])
print('Number of matches labeled as "alternative" : ', n_reg_modif[-1])
print()
print("Number of unique comments filtered in : ", results.groupby(['c_id', 'comment_author', 
                                                           'thread_id', 'comment_date']).apply(lambda rows: 1).sum())

Number of filtered out matches :  2560
Number of filtered in matches :  4394
Number of matches filtered in "schedule_noun" pattern :  2469
Number of matches filtered in "delay_verbs" pattern :  1925

Number of matches labeled as "recommended" :  1605
Number of matches labeled as "alternative" :  2789

Number of unique comments filtered in :  3746


## Merge Classifications

Once the schedule-related sentences are classified, we proceed by assigning the label to comments by aggregating sentences. We discard:
- comments having more than one sentence with discordant labels
- sentences containing matches in past tenses

Before doing that, we manually modify the tense of all the verbs whose lemma is "split" and identified as "PastSimple". The dependency parser of SpaCy always assign this tense, even if it may refer to the present. We choose to consider all these verbs at present tense.

Finally, we save the file containing the classified comments.

In [5]:
# change tense of split (it is almost always considered as PastSimple)
index_of_comments_split_past = results[(results.verb_lemma=='split')
                                      &(results.verb_tense=='PastSimple')
                                      &(results.verb_phrase=='split')].index

results.loc[index_of_comments_split_past, 'verb_tense'] = 'PresentSimple'
print('CHANGING tense to verbs whose lemma is "split". Number of matches changed : ', len(index_of_comments_split_past))
print()

# discard past tenses
tense_not_to_take = ['PastSimple', 'PastPassive', 'PastContinuous', 'PastPerfect']

n_results = results.shape[0]
results = results[(~results.verb_tense.isin(tense_not_to_take))
                 &(~results.verb_tense_xcomp.isin(tense_not_to_take))]
print('DISCARDING past tenses. Number of matches removed : ', n_results-results.shape[0])
print()

# check how many comments have discordant labels
n_with_more_class = 0
n_with_same_class = 0
n_classes = []

for idx, rows in results.groupby(['c_id', 'comment_author', 'thread_id', 'comment_date']):
    
    n_classes.append(rows.shape[0])
    
    if rows.shape[0] > 1:
        n_with_more_class+=1
        
        if len(set(rows.FILTER.values))==1:
            n_with_same_class+=1
            
print('CHECK COMMENTS WITH DISCORDANT LABELS')
print('Number unique comments ', len(n_classes))
print()
print('Number comments with more classes ', n_with_more_class)
print('Number comments with more classes and homogeneous ', n_with_same_class)
print('Number comments with more classes and discordant ', n_with_more_class-n_with_same_class)

CHANGING tense to verbs whose lemma is "split". Number of matches changed :  46

DISCARDING past tenses. Number of matches removed :  1074

CHECK COMMENTS WITH DISCORDANT LABELS
Number unique comments  2957

Number comments with more classes  317
Number comments with more classes and homogeneous  212
Number comments with more classes and discordant  105


In [6]:
# discard comments with discordant labels and assign class for each comment
comments_classified = results.groupby(['c_id', 'comment_author', 'thread_id', 'comment_date']).filter(lambda rows:
                                                                      len(set(rows.FILTER.values))==1)

comments_classified = comments_classified.groupby(['c_id', 'comment_author', 
                                                   'thread_id', 'comment_date']).apply(lambda rows: 
                                            pd.Series({'CLASS':list(set(rows.FILTER.values))[0]})).reset_index()

print('Discarded comments with discordant labels')
print('Final number of schedule-related comments : ', comments_classified.shape[0])
n_reg_modif = comments_classified.CLASS.value_counts()
print('Number of comments coded as "recommended" : ', n_reg_modif[1])
print('Number of comments coded as "alternative" : ', n_reg_modif[-1])

Discarded comments with discordant labels
Final number of schedule-related comments :  2852
Number of comments coded as "recommended" :  1059
Number of comments coded as "alternative" :  1793


In [7]:
comments_classified.head()

Unnamed: 0,c_id,comment_author,thread_id,comment_date,CLASS
0,c2000186919,Virgo&SparklerMomma,a87195,03/21/2008,1
1,c2000274000,mlbryant_7,a124485,04/04/2008,1
2,c2000380950,pineaple35,a167455,04/16/2008,-1
3,c2000840633,Conshusmama,a363035,05/24/2008,-1
4,c2000841389,~domestic&tattooed~,a363035,05/24/2008,1


In [8]:
comments_classified.to_csv("results/vaccination_schedule/comments_classified.csv")