In [1]:
import pandas as pd
from nltk.tokenize import sent_tokenize
import time

import Schedule_pipeline_functions as SP

## First filter and structured representation

In [2]:
# Stream the 100.000-comment sample in chunks of 10.000 rows.
comments_chunks = pd.read_csv('data/sample_of_comments.csv.gz',
                              chunksize=10000, compression='gzip')

# Substrings used only for the "comments containing a keyword" count printed
# below; the actual matching is delegated to SP.first_filter_keywords_syntactic.
keyword_stems = ('schedule', 'spac', 'dela', 'split')
# Columns that together identify one comment.
comment_keys = ['c_id', 'comment_author', 'thread_id', 'comment_date']
# A progress line is printed each time this many more comments are processed.
report_every = 100000

# list of responses (one dict per match, transformed into a DataFrame below)
results = []
n = 0                 # comments read so far (before dropping empty bodies)
n_with_keywords = 0   # comments whose text contains at least one keyword stem
next_report = report_every

t_start = time.time()  # start of the whole run
t0 = t_start           # start of the current reporting interval

for chunk in comments_chunks:

    n += chunk.shape[0]

    # get useful info and drop comments with no text
    chunk = chunk[['body', 'comment_author', 'thread_id',
                   'comment_date', 'c_id']].dropna(subset=['body'])

    # loop over comments (itertuples avoids building one Series per row)
    for row in chunk.itertuples(index=False):
        tx = row.body

        # has one of the keywords?
        if any(stem in tx for stem in keyword_stems):
            n_with_keywords += 1

        # sentence tokenizer for comments
        for sent in sent_tokenize(tx):

            # apply first filter
            idxs_keywords = SP.first_filter_keywords_syntactic(sent)
            # obtain structured representation, tagged with the comment metadata
            representation = SP.Structured_representation(idxs_keywords)
            representation = [SP.translate_response(match, row.c_id, row.comment_author,
                                                    row.thread_id, row.comment_date)
                              for match in representation]

            results.extend(representation)

    # Report once the chunk is done and a reporting threshold has been crossed.
    # Comparing with ">=" (rather than testing n % report_every == 0) keeps the
    # report working when the row count is not an exact multiple.
    if n >= next_report:
        print('Comments processed: ', n)
        print('Time (min) : ', (time.time() - t0) / 60)
        t0 = time.time()
        while next_report <= n:
            next_report += report_every

# Total wall-clock time of the loop, independent of the progress reports.
t_tot = time.time() - t_start

results = pd.DataFrame(results)


columns_ordered = ['sent', 'c_id', 'comment_author', 'thread_id', 'comment_date',
                   'pattern_matched', 'text_short', 'negations',
                   'amod_subj_xcomp_lemma', 'amod_subj_xcomp_lower', 
                   'compound_subj_xcomp_lemma', 'compound_subj_xcomp_lower',
                   'pos_subj_xcomp_lemma', 'pos_subj_xcomp_lower',
                   'subject_xcomp_lemma', 'subject_xcomp_lower',
                   'verb_xcomp_lemma', 'verb_xcomp_lower', 'verb_phrase_xcomp', 'verb_tense_xcomp',
                   'amod_subj_lemma', 'amod_subj_lower',
                   'compound_subj_lemma', 'compound_subj_lower',
                   'pos_subj_lemma', 'pos_subj_lower',
                   'subject_lemma', 'subject_lower', 'subject_active',
                   'verb_lemma', 'verb_lower', 'verb_phrase', 'verb_tense',
                   'dobj_amod_lemma', 'dobj_amod_lower',
                   'compound_dobj_lemma', 'compound_dobj_lower',
                   'pos_dobj_lemma', 'pos_dobj_lower',
                   'dobj_lemma', 'dobj_lower']

# reindex orders the columns and creates (as NaN) any column that no match
# happened to produce, instead of failing with a KeyError.
results = results.reindex(columns=columns_ordered)
#results.to_csv("github_sample/Structured_representation_schedule.csv")

print()
print("Total time (min) : ", t_tot/60)
print("Number of matches : ", results.shape[0])
print()
print('Number of comments containing at least one of the keword : ', n_with_keywords)
print("Number of unique comments retrieved : ", len(results.groupby(comment_keys).size()))
print()
patterns_matched = results.pattern_matched.value_counts()
# .get(..., 0) reports zero instead of raising when a pattern never matched
print('Number of matches with "schedule_noun" pattern : ', patterns_matched.get('schedule_noun', 0))
print('Number of matches with "delay_verbs" pattern : ', patterns_matched.get('delay_verbs', 0))


Comments processed:  100000
Time (min) :  2.7876848657925923

Total time (min) :  2.7876885692278544
Number of matches :  6731

Number of comments containing at least one of the keword :  7902
Number of unique comments retrieved :  5254

Number of matches with "schedule_noun" pattern :  3707
Number of matches with "delay_verbs" pattern :  3024


In [3]:
# Preview the first 20 structured matches.
results.head(20)

Unnamed: 0,sent,c_id,comment_author,thread_id,comment_date,pattern_matched,text_short,negations,amod_subj_xcomp_lemma,amod_subj_xcomp_lower,...,verb_phrase,verb_tense,dobj_amod_lemma,dobj_amod_lower,compound_dobj_lemma,compound_dobj_lower,pos_dobj_lemma,pos_dobj_lower,dobj_lemma,dobj_lower
0,"plus he is a brand new baby, i would like to d...",c2420670102,ohbabyLiam,a42201085,05/23/2013,delay_verbs,i would like to delay,1,,,...,to delay,Infinite,,,,,,,,
1,"so far i am only planning on the dtap, polio, ...",c2420670102,ohbabyLiam,a42201085,05/23/2013,delay_verbs,i am spacing them,1,,,...,am spacing,PresentContinuous,,,,,,,them,them
2,you should post this to the none and selected ...,c2471479210,Rach3740,a52858753,10/09/2014,delay_verbs,delayed,1,,,...,delayed,PastParticipe,,,,,,,,
3,i found a pedi favorable to selective and dela...,c2471479210,Rach3740,a52858753,10/09/2014,delay_verbs,delay,1,,,...,delay,Infinite,,,,,,,,
4,japan had the highest incidence in sids until ...,c2063817359,haileycakes,a15243575,09/17/2009,schedule_noun,they delayed their vaccination schedule,1,,,...,delayed,PastSimple,,,vaccination,vaccination,their,their,schedule,schedule
5,japan had the highest incidence in sids until ...,c2063817359,haileycakes,a15243575,09/17/2009,delay_verbs,they delayed their vaccination schedule,1,,,...,delayed,PastSimple,,,vaccination,vaccination,their,their,schedule,schedule
6,i have decided to talk it out with my pedi... ...,c2345789374,dawnnrose,a27819267,05/24/2011,schedule_noun,my biggest issue with the current vax schedule is,1,,,...,is,PresentSimple,"current, vax","current, vax",,,,,schedule,schedule
7,so i know i wont be doing any shots at 2m but ...,c2345789374,dawnnrose,a27819267,05/24/2011,schedule_noun,i wont be doing any shots at 2m but idk my exa...,-1,,,...,idk,Infinite,exact,exact,,,my,my,schedule,schedule
8,i will be vaccinating my baby on schedule.,c2379458968,skipper91,a32712217,04/16/2012,schedule_noun,i will be vaccinating my baby on schedule,1,,,...,will be vaccinating,Future,,,,,,,schedule,schedule
9,i am a nurse and think vaccinating on schedule...,c2365347871,mrosev14,a30400851,11/23/2011,schedule_noun,i am a nurse and think vaccinating on schedule,1,,,...,vaccinating,Gerundive,,,,,,,schedule,schedule


In [6]:
# Take the sentence of the match at position 9 as a worked example.
text = results['sent'].iloc[9]
text

'i am a nurse and think vaccinating on schedule is the right decision for our family--i do not think everyone needs to be so rude to one another on this subject.'

In [7]:
# Re-run the first filter on the example sentence alone. Judging from the
# output recorded below, the result is a dict keyed by pattern name whose
# values hold 'keywords_idxs', the 'text' and a networkx DiGraph under 'G'.
idxs_keywords = SP.first_filter_keywords_syntactic(text)
idxs_keywords

{'schedule_noun': {'keywords_idxs': [38],
  'text': 'i am a nurse and think vaccinating on schedule is the right decision for our family--i do not think everyone needs to be so rude to one another on this subject.',
  'G': <networkx.classes.digraph.DiGraph at 0x1a301f5d90>}}

In [9]:
# List every node of the sentence graph with its lower-cased token
# (in the recorded output the node ids look like character offsets into the sentence).
idxs_keywords['schedule_noun']['G'].nodes(data='lower')

NodeDataView({0: 'i', 2: 'am', 5: 'a', 7: 'nurse', 13: 'and', 17: 'think', 23: 'vaccinating', 35: 'on', 38: 'schedule', 47: 'is', 50: 'the', 60: 'decision', 54: 'right', 69: 'for', 73: 'our', 77: 'family', 83: '--', 94: 'think', 85: 'i', 87: 'do', 90: 'not', 100: 'everyone', 109: 'needs', 115: 'to', 118: 'be', 121: 'so', 124: 'rude', 129: 'to', 132: 'one', 136: 'another', 144: 'on', 147: 'this', 152: 'subject', 159: '.'}, data='lower')

In [10]:
# Outgoing dependency edge of node 38 ('schedule', the matched keyword).
idxs_keywords['schedule_noun']['G'].out_edges(38, data='dep')

OutEdgeDataView([(38, 35, 'pobj')])

In [11]:
# Outgoing dependency edge of node 35 ('on', the preposition governing 'schedule').
idxs_keywords['schedule_noun']['G'].out_edges(35, data='dep')

OutEdgeDataView([(35, 23, 'prep')])

In [12]:
# Outgoing dependency edge of node 23 ('vaccinating').
idxs_keywords['schedule_noun']['G'].out_edges(23, data='dep')

OutEdgeDataView([(23, 17, 'xcomp')])

In [13]:
# Outgoing dependency edge of node 47 ('is').
idxs_keywords['schedule_noun']['G'].out_edges(47, data='dep')

OutEdgeDataView([(47, 17, 'ccomp')])

In [14]:
# Incoming dependency edges of node 47 ('is'), i.e. the tokens attached to it.
idxs_keywords['schedule_noun']['G'].in_edges(nbunch=47, data='dep')

InEdgeDataView([(60, 47, 'attr')])

## FILTER & CLASSIFIER

In [4]:
# Columns that together identify one comment.
comment_keys = ['c_id', 'comment_author', 'thread_id', 'comment_date']

# here the new column "FILTER" corresponds to the final classification (+1 and -1) or "FILTERED_OUT"
results['FILTER'] = results.apply(SP.Filter, axis=1)

# remove filtered out matches
is_filtered_out = results.FILTER == 'FILTERED_OUT'
results_filtered_out = results[is_filtered_out]
# .copy() gives an independent frame, so later in-place edits of `results`
# (e.g. through .loc) do not act on a slice of the unfiltered frame.
results = results[~is_filtered_out].copy()

n_reg_modif = results.FILTER.value_counts()
patterns_matched = results.pattern_matched.value_counts()

# .get(..., 0) reports zero instead of raising when a pattern/label is absent
print('Number of filtered out matches : ', results_filtered_out.shape[0])
print('Number of filtered in matches : ', results.shape[0])
print('Number of matches filtered in "schedule_noun" pattern : ', patterns_matched.get('schedule_noun', 0))
print('Number of matches filtered in "delay_verbs" pattern : ', patterns_matched.get('delay_verbs', 0))
print()
print('Number of matches labeled as "recommended" : ', n_reg_modif.get(1, 0))
print('Number of matches labeled as "alternative" : ', n_reg_modif.get(-1, 0))
print()
print("Number of unique comments filtered in : ", len(results.groupby(comment_keys).size()))

Number of filtered out matches :  2560
Number of filtered in matches :  4394
Number of matches filtered in "schedule_noun" pattern :  2469
Number of matches filtered in "delay_verbs" pattern :  1925

Number of matches labeled as "recommended" :  1605
Number of matches labeled as "alternative" :  2789

Number of unique comments filtered in :  3746


## Merge Classifications

In [5]:
# The bare form "split" is almost always tagged as PastSimple; relabel those
# matches as PresentSimple before the past tenses are discarded below.
is_bare_split = ((results.verb_phrase == 'split')
                 & (results.verb_tense == 'PastSimple')
                 & (results.verb_lemma == 'split'))
index_of_comments_split_past = results.loc[is_bare_split].index

results.loc[index_of_comments_split_past, 'verb_tense'] = 'PresentSimple'
print('CHANGING tense to verbs whose lemma is "split". Number of matches changed : ', len(index_of_comments_split_past))
print()

# Drop every match whose main verb or xcomp verb is in a past tense.
tense_not_to_take = ['PastSimple', 'PastPassive', 'PastContinuous', 'PastPerfect']

n_results = results.shape[0]
has_past_verb = results.verb_tense.isin(tense_not_to_take)
has_past_xcomp = results.verb_tense_xcomp.isin(tense_not_to_take)
results = results[~(has_past_verb | has_past_xcomp)]
print('DISCARDING past tenses. Number of matches removed : ', n_results - results.shape[0])
print()

# For every comment collect (number of matches, set of distinct labels),
# then count the comments with several matches and how many of those agree.
matches_and_labels = [
    (rows.shape[0], set(rows.FILTER.values))
    for _, rows in results.groupby(['c_id', 'comment_author', 'thread_id', 'comment_date'])
]

n_classes = [n_matches for n_matches, _ in matches_and_labels]
n_with_more_class = sum(1 for n_matches in n_classes if n_matches > 1)
n_with_same_class = sum(1 for n_matches, labels in matches_and_labels
                        if n_matches > 1 and len(labels) == 1)

print('CHECK COMMENTS WITH DISCORDANT LABELS')
print('Number unique comments ', len(n_classes))
print()
print('Number comments with more classes ', n_with_more_class)
print('Number comments with more classes and homogeneous ', n_with_same_class)
print('Number comments with more classes and discordant ', n_with_more_class-n_with_same_class)

CHANGING tense to verbs whose lemma is "split". Number of matches changed :  46

DISCARDING past tenses. Number of matches removed :  1074

CHECK COMMENTS WITH DISCORDANT LABELS
Number unique comments  2957

Number comments with more classes  317
Number comments with more classes and homogeneous  212
Number comments with more classes and discordant  105


In [6]:
# Columns that together identify one comment.
comment_keys = ['c_id', 'comment_author', 'thread_id', 'comment_date']

# discard comments with discordant labels: keep only the comments whose
# matches all carry the same FILTER label
comments_classified = results.groupby(comment_keys).filter(
    lambda rows: len(set(rows.FILTER.values)) == 1)

# assign class for each comment: every remaining group is homogeneous, so its
# first label is the label of the comment. Aggregating the FILTER column
# directly avoids a frame-wise groupby.apply and also yields a proper (empty)
# CLASS column when nothing is left.
comments_classified = (comments_classified
                       .groupby(comment_keys)['FILTER']
                       .first()
                       .rename('CLASS')
                       .reset_index())

print('Discarded comments with discordant labels')
print('Final number of schedule-related comments : ', comments_classified.shape[0])
n_reg_modif = comments_classified.CLASS.value_counts()
# .get(..., 0) reports zero instead of raising when a class is absent
print('Number of comments coded as "recommended" : ', n_reg_modif.get(1, 0))
print('Number of comments coded as "alternative" : ', n_reg_modif.get(-1, 0))

Discarded comments with discordant labels
Final number of schedule-related comments :  2852
Number of comments coded as "recommended" :  1059
Number of comments coded as "alternative" :  1793


In [7]:
# Show the first five classified comments.
comments_classified.head(n=5)

Unnamed: 0,c_id,comment_author,thread_id,comment_date,CLASS
0,c2000186919,Virgo&SparklerMomma,a87195,03/21/2008,1
1,c2000274000,mlbryant_7,a124485,04/04/2008,1
2,c2000380950,pineaple35,a167455,04/16/2008,-1
3,c2000840633,Conshusmama,a363035,05/24/2008,-1
4,c2000841389,~domestic&tattooed~,a363035,05/24/2008,1


In [8]:
# Persist the per-comment classification (the row index is written as the first, unnamed column).
comments_classified.to_csv(path_or_buf="github_sample/comments_classified.csv")

In [36]:
# for debugging: run the extraction steps on a single hand-picked comment.
# NOTE: this cell rebinds `results`, `columns_ordered`, `idxs_keywords` and
# `representation`; re-run the pipeline cells before using those names for
# the full sample again.

results = []
for tx in ["we are doing an alternative schedule; dr bob's alternative vaccine schedule 2 months: dtap, rotavirus 3 months: pc, hib 4 months: dtap, rotavirus 5 months: pc, hib 6 months: dtap, rotavirus 7 months: pc, hib 9 months: polio 1 two months: mumps, polio 15 months: pc, hib 18 months: dtap, chickenpox 2 years: rubella, polio 2 1 and 2 years: hep b, hep a 3 years: hep b, measles 3 1 and 2 years: hep b, hep a 4 years: dtap, polio 5 years: mmr 6 years: chickenpox 1 two years: tdap, hpv 1 two years, 2 months: hpv 1 three years: hpv, meningococcal however, our pediatrician does not give the rotovirus vaccine because it was recalled 4 ears ago, and has just recently been re-released."]:

    for sent in sent_tokenize(tx):

        # apply first filter
        idxs_keywords = SP.first_filter_keywords_syntactic(sent)
        # obtain structured representation; the metadata arguments are
        # placeholder strings because there is no real comment record here
        representation = SP.Structured_representation(idxs_keywords)
        representation = [SP.translate_response(i, 'c_id', 'comment_author', 'thread_id', 'comment_date') 
                                                                                  for i in representation]

        results.extend(representation)


results = pd.DataFrame(results)


columns_ordered = ['sent', 
                   'pattern_matched', 'text_short', 'negations',
                   'amod_subj_xcomp_lemma', 'amod_subj_xcomp_lower', 
                   'compound_subj_xcomp_lemma', 'compound_subj_xcomp_lower',
                   'pos_subj_xcomp_lemma', 'pos_subj_xcomp_lower',
                   'subject_xcomp_lemma', 'subject_xcomp_lower',
                   'verb_xcomp_lemma', 'verb_xcomp_lower', 'verb_phrase_xcomp', 'verb_tense_xcomp',
                   'amod_subj_lemma', 'amod_subj_lower',
                   'compound_subj_lemma', 'compound_subj_lower',
                   'pos_subj_lemma', 'pos_subj_lower',
                   'subject_lemma', 'subject_lower', 'subject_active',
                   'verb_lemma', 'verb_lower', 'verb_phrase', 'verb_tense',
                   'dobj_amod_lemma', 'dobj_amod_lower',
                   'compound_dobj_lemma', 'compound_dobj_lower',
                   'pos_dobj_lemma', 'pos_dobj_lower',
                   'dobj_lemma', 'dobj_lower']

# reindex puts the columns in the requested order and fills every column the
# matches did not produce with NaN (replaces the per-column loop that relied
# on `pd.np`, which no longer exists in pandas 2.x).
results = results.reindex(columns=columns_ordered)
#results.to_csv("Vaccination_schedule_outputs/Structured_representation_schedule.csv")
#results.to_csv("Vaccination_schedule_outputs/Structured_representation_schedule.csv")
           


In [37]:
# Translated structured representation left over from the last sentence
# handled by the debugging loop (a list with one dict per match).
representation

[{'sent': "we are doing an alternative schedule; dr bob's alternative vaccine schedule 2 months: dtap, rotavirus 3 months: pc, hib 4 months: dtap, rotavirus 5 months: pc, hib 6 months: dtap, rotavirus 7 months: pc, hib 9 months: polio 1 two months: mumps, polio 15 months: pc, hib 18 months: dtap, chickenpox 2 years: rubella, polio 2 1 and 2 years: hep b, hep a 3 years: hep b, measles 3 1 and 2 years: hep b, hep a 4 years: dtap, polio 5 years: mmr 6 years: chickenpox 1 two years: tdap, hpv 1 two years, 2 months: hpv 1 three years: hpv, meningococcal however, our pediatrician does not give the rotovirus vaccine because it was recalled 4 ears ago, and has just recently been re-released.",
  'pattern_matched': 'schedule_noun',
  'dobj_amod_lemma': 'alternative',
  'dobj_amod_lower': 'alternative',
  'compound_dobj_lemma': nan,
  'compound_dobj_lower': nan,
  'pos_dobj_lemma': nan,
  'pos_dobj_lower': nan,
  'dobj_lemma': 'schedule',
  'dobj_lower': 'schedule',
  'verb_lemma': 'do',
  'verb

In [38]:
# Sentence attached to the second match of the debugging run.
results['sent'].iloc[1]

"we are doing an alternative schedule; dr bob's alternative vaccine schedule 2 months: dtap, rotavirus 3 months: pc, hib 4 months: dtap, rotavirus 5 months: pc, hib 6 months: dtap, rotavirus 7 months: pc, hib 9 months: polio 1 two months: mumps, polio 15 months: pc, hib 18 months: dtap, chickenpox 2 years: rubella, polio 2 1 and 2 years: hep b, hep a 3 years: hep b, measles 3 1 and 2 years: hep b, hep a 4 years: dtap, polio 5 years: mmr 6 years: chickenpox 1 two years: tdap, hpv 1 two years, 2 months: hpv 1 three years: hpv, meningococcal however, our pediatrician does not give the rotovirus vaccine because it was recalled 4 ears ago, and has just recently been re-released."

In [39]:
# First-filter output left over from the last sentence handled by the
# debugging loop (keyword positions, text and dependency graph per pattern).
idxs_keywords

{'schedule_noun': {'keywords_idxs': [28, 67],
  'text': "we are doing an alternative schedule; dr bob's alternative vaccine schedule 2 months: dtap, rotavirus 3 months: pc, hib 4 months: dtap, rotavirus 5 months: pc, hib 6 months: dtap, rotavirus 7 months: pc, hib 9 months: polio 1 two months: mumps, polio 15 months: pc, hib 18 months: dtap, chickenpox 2 years: rubella, polio 2 1 and 2 years: hep b, hep a 3 years: hep b, measles 3 1 and 2 years: hep b, hep a 4 years: dtap, polio 5 years: mmr 6 years: chickenpox 1 two years: tdap, hpv 1 two years, 2 months: hpv 1 three years: hpv, meningococcal however, our pediatrician does not give the rotovirus vaccine because it was recalled 4 ears ago, and has just recently been re-released.",
  'G': <networkx.classes.digraph.DiGraph at 0x7f5fc4741f50>}}

In [41]:
# Incoming dependency edges of node 67 (the second 'schedule' keyword).
idxs_keywords['schedule_noun']['G'].in_edges(nbunch=67, data='dep')

InEdgeDataView([(59, 67, 'compound')])

In [24]:
# List every node of the debugging sentence graph with its lower-cased token.
idxs_keywords['schedule_noun']['G'].nodes(data='lower')

NodeDataView({0: 'we', 7: 'doing', 3: 'are', 13: 'an', 28: 'schedule', 16: 'alternative', 36: ';', 38: 'dr', 41: 'bob', 59: 'vaccine', 44: "'s", 47: 'alternative', 67: 'schedule', 76: '2', 78: 'months', 84: ':', 86: 'dtap', 90: ',', 92: 'rotavirus', 102: '3', 104: 'months', 110: ':', 112: 'pc', 114: ',', 116: 'hib', 120: '4', 122: 'months', 128: ':', 130: 'dtap', 134: ',', 136: 'rotavirus', 146: '5', 148: 'months', 154: ':', 156: 'pc', 158: ',', 160: 'hib', 164: '6', 166: 'months', 172: ':', 174: 'dtap', 178: ',', 180: 'rotavirus', 190: '7', 192: 'months', 198: ':', 200: 'pc', 202: ',', 204: 'hib', 210: 'months', 208: '9', 216: ':', 218: 'polio', 224: '1', 226: 'two', 230: 'months', 236: ':', 238: 'mumps', 243: ',', 245: 'polio', 251: '15', 254: 'months', 260: ':', 262: 'pc', 264: ',', 266: 'hib', 270: '18', 273: 'months', 279: ':', 281: 'dtap', 285: ',', 287: 'chickenpox', 298: '2', 300: 'years', 305: ':', 307: 'rubella', 314: ',', 316: 'polio', 322: '2', 324: '1', 326: 'and', 330: '2

In [26]:
# Outgoing dependency edge of node 41 ('bob').
idxs_keywords['schedule_noun']['G'].out_edges(41, data='dep')

OutEdgeDataView([(41, 59, 'poss')])