## Experiences AEFI : comment_classification

In this notebook we will run the pupeline to classify comemnts related to experiences with adverse reactions following immunization (AEFI).

In [1]:
# kernel "base clone"
import pandas as pd
import time
import re

from nltk.tokenize import sent_tokenize
import AEFI_pipeline_functions as AP

In [2]:
# load data in chunks
comments_chunks = pd.read_csv('data/sample_of_comments.csv.gz', 
                                chunksize=10000, compression='gzip')

results = []

comment_info_non_matched_sentences = []

n=0 # comments count

t0 = time.time()
t_tot = 0

# loop over comments
for chunk in comments_chunks:
    
    n += chunk.shape[0]
    
    if n%100000 == 0:
        print('Comments processed: ', n)
        print("Number of matches : ", len(results))
        print('Time (min) : ', (time.time() - t0) / 60)
        t_tot += time.time() - t0
        t0 = time.time()
        print()
        
    # get useful info and drop comments with no text
    chunk = chunk[['body', 'comment_author', 'thread_id', 'comment_date', 'c_id']].dropna(subset=['body'])

    # loop over comments
    for idx, row in chunk.iterrows():

        tx, c_id, comment_author, thread_id, comment_date = row.body, row.c_id, row.comment_author, row.thread_id, row.comment_date

        comment_info_matched_sentence = []
        representations = []
        is_reaction_to_vaccine_flags = []
        
        for sentence in sent_tokenize(tx):
            for sent in AP.preprocess_sentence(sentence):

                # not interested in questions or 'if'/'in case'/'unless' questions
                if '?' in sent or re.findall(r'(?i)\b(?:if|in case|unless)\b', sent)!=[]:
                    continue

                idxs_keywords = AP.first_filter_keywords_syntactic(sent)

                ######### manage contextual patterns
                if len(idxs_keywords)>0:
                    is_reaction_to_vaccine_flag = []
                    for key, value in idxs_keywords.items():
                        if 'pattern' in key:
                            is_reaction_to_vaccine_flag.append(value['is_reaction_to_vaccine_flag'])

                    comment_info_matched_sentence.append([sent, any(is_reaction_to_vaccine_flag), 
                                         idxs_keywords['after_vaccine_flag'], idxs_keywords['someone_get_vaccine_flag']])
                    is_reaction_to_vaccine_flags.append(any(is_reaction_to_vaccine_flag))

                    #del idxs_keywords['after_vaccine_flag']
                    #del idxs_keywords['someone_get_vaccine_flag']
                ######### 

                representation = AP.Structured_representation(idxs_keywords)
                representation = [AP.translate_response(rep, c_id, comment_author, thread_id, comment_date) 
                                                                                  for rep in representation]
                representations.extend(representation)

        #comment_info_matched_sentences.extend(comment_info_matched_sentence)

        if len(comment_info_matched_sentence)>0:
            if any(is_reaction_to_vaccine_flags):
                comment_info_non_matched_sentences.append([c_id, True, True, pd.np.nan, pd.np.nan, pd.np.nan])
            else:
                is_related, is_reaction_to_vaccine_flag, after_vaccine_flag, someone_got_vaccine_flag, post_related_flag = AP.check_contextual_patterns_whole_comment(tx, comment_info_matched_sentence, thread_id)
                comment_info_non_matched_sentences.append([c_id, is_related, is_reaction_to_vaccine_flag, after_vaccine_flag, someone_got_vaccine_flag, post_related_flag])

            results.extend(representations)

comment_info_non_matched_sentences = pd.DataFrame(comment_info_non_matched_sentences, 
                                                  columns=['c_id', 'is_related_to_AEFI', 
                                                           'is_reaction_to_vaccine_flag', 
                                                           'after_vaccine_flag', 'someone_got_vaccine_flag', 
                                                           'post_related_flag']).set_index('c_id')            

results = pd.DataFrame.from_dict(results)
results.to_csv("output/Experiences_AEFI/Structured_representation_comments_AEFI.csv")

print()
print("Total time (min) : ", t_tot/60)
print("Number of matches : ", results.shape[0])
print()

print("Number of unique comments retrieved : ", results.groupby(['c_id', 'comment_author', 
                                                  'thread_id', 'comment_date']).apply(lambda rows: 1).sum())
print()
patterns_matched = results.pattern_matched.apply(lambda patt: patt.split('_')[0]).value_counts()
print('Number of matches with "pattern1" pattern : ', patterns_matched['pattern1'])
print('Number of matches with "pattern2" pattern : ', patterns_matched['pattern2'])
print('Number of matches with "pattern3" pattern : ', patterns_matched['pattern3'])

Comments processed:  100000
Number of matches :  6114
Time (min) :  6.728704357147217


Total time (min) :  6.728704798221588
Number of matches :  6831

Number of unique comments retrieved :  5613

Number of matches with "pattern1" pattern :  5387
Number of matches with "pattern2" pattern :  1329
Number of matches with "pattern3" pattern :  115


In [3]:
results.head()

Unnamed: 0,sent,pattern_matched,subjects,verb_lower,verb_lemma,verb_phrase,verb_tense,verb_negation,reactions,after_vaccine_flag,someone_get_vaccine_flag,c_id,comment_author,thread_id,comment_date
0,"yes, a vaccine may give you a fever, but it wi...",pattern1_1,"{'subject_1': {'subject_lower': 'vaccine', 'su...",give,give,may give,Conditional,False,"{'reaction_1': {'reaction_lower': 'fever', 're...",False,True,c2542107730,Daephyl,a64374421,10/10/2016
1,"with my odd she was very fussy, did not sleep ...",pattern2_1,"{'subject_1': {'subject_lower': 'she', 'subjec...",was,be,was,PastSimple,False,"{'reaction_1': {'reaction_lower': 'fussy', 're...",False,False,c2432853402,excited4#2wy,a45037870,10/07/2013
2,i do understand people's concerns and the fact...,pattern1_1,"{'subject_1': {'subject_lower': 'children', 's...",have,have,do have,PresentSimple,False,"{'reaction_1': {'reaction_lower': 'reactions',...",False,False,c2072488727,lmfitzz,a16740945,10/17/2009
3,i am sorry you had a bad reaction.,pattern1_1,"{'subject_1': {'subject_lower': 'you', 'subjec...",had,have,had,PastSimple,False,"{'reaction_1': {'reaction_lower': 'reaction', ...",False,False,c2499115033,June&JoJo,a57516217,06/19/2015
4,"i had a sore arm for a few days, but that was it.",pattern1_1,"{'subject_1': {'subject_lower': 'i', 'subject_...",had,have,had,PastSimple,False,"{'reaction_1': {'reaction_lower': 'arm', 'reac...",False,False,c2499115033,June&JoJo,a57516217,06/19/2015


### Filter & Classifier

In [4]:
# apply the filter
results = results.merge(results.apply(lambda row: AP.Filter(row), axis=1),
                                left_index=True, right_index=True)

# remove filtered out matches
results_filtered_out = results[results.is_reaction=='Not_related']
results = results[results.is_reaction!='Not_related']

# remove suppositional matches
tenses_not_to_keep = ['Conditional', 'Future', 'Gerundive']
results = results[(results.verb_phrase.apply(lambda vp: re.findall(r'(?i)(?:\bcan\b)', vp)==[]))
                   &(~results.verb_tense.isin(tenses_not_to_keep))]

# get comments related to AEFI
c_ids_comments_related = comment_info_non_matched_sentences[
                                                comment_info_non_matched_sentences.is_related_to_AEFI==True].index
results = results[results.c_id.isin(c_ids_comments_related)]

n_react_or_not = results.is_reaction.value_counts()
patterns_matched = results.pattern_matched.apply(lambda patt: patt.split('_')[0]).value_counts()

print('Number of filtered out matches : ', results_filtered_out.shape[0])
print('Number of filtered in matches : ', results.shape[0])
print('Number of matches filtered with "pattern1" : ', patterns_matched['pattern1'])
print('Number of matches filtered with "pattern2" : ', patterns_matched['pattern2'])
print('Number of matches filtered with "pattern3" : ', patterns_matched['pattern3'])
print()
print('Number of matches labeled as "Negative experience" : ', n_react_or_not[True])
print('Number of matches labeled as "Positive experience" : ', n_react_or_not[False])
print()
print("Number of unique comments filtered in : ", results.groupby(['c_id', 'comment_author', 
                                                           'thread_id', 'comment_date']).apply(lambda rows: 1).sum())

Number of filtered out matches :  1930
Number of filtered in matches :  2752
Number of matches filtered with "pattern1" :  2061
Number of matches filtered with "pattern2" :  629
Number of matches filtered with "pattern3" :  62

Number of matches labeled as "Negative experience" :  2104
Number of matches labeled as "Positive experience" :  648

Number of unique comments filtered in :  2259


## Merge Classifications

In [5]:
# classify comments
comments_classified_reaction = []
for c_id, rows in results.groupby('c_id'):
    
    is_reaction_comment = rows.is_reaction.any()
    comment_author, thread_id, comment_date = rows.comment_author.values[0], rows.thread_id.values[0], rows.comment_date.values[0]
    
    if is_reaction_comment:
        
        persons_kind = ', '.join(set([i for i in rows.kind_of_persons.dropna().values]))
        # reactions obtained from pattern 3 are discarded
        rows = rows[rows.pattern_matched.apply(lambda p: 'pattern3' not in p)]
        reactions_kind = ', '.join(set([i for i in rows.kind_of_reactions.dropna().values]))
        
        comments_classified_reaction.append([c_id, comment_author, thread_id, comment_date, 
                                             is_reaction_comment, reactions_kind, persons_kind])
        
    else:
        comments_classified_reaction.append([c_id, comment_author, thread_id, comment_date, 
                                             is_reaction_comment, pd.np.nan, pd.np.nan])
        
comments_classified_reaction = pd.DataFrame(comments_classified_reaction,
                                           columns=['c_id', 'user_name', 'thread_id', 'comment_date',
                                                   'is_reaction_comment', 'reactions_kind', 'personas_kind']) 

print('Final number of comments related to experiences with AEFI: ', comments_classified_reaction.shape[0])
n_react_or_not = comments_classified_reaction.is_reaction_comment.value_counts()
print('Number of matches labeled as "Negative experience" : ', n_react_or_not[True])
print('Number of matches labeled as "Positive experience" : ', n_react_or_not[False])

Final number of comments related to experiences with AEFI:  2259
Number of matches labeled as "Negative experience" :  1768
Number of matches labeled as "Positive experience" :  491


In [6]:
comments_classified_reaction.head()

Unnamed: 0,c_id,user_name,thread_id,comment_date,is_reaction_comment,reactions_kind,personas_kind
0,c2001516448,plum82,a611455,07/13/2008,False,,
1,c2001971856,~*NicoleF*~,a800945,08/07/2008,True,lump,author_child
2,c2001990373,Lauren&MichaelsMommy,a810115,08/08/2008,True,"regression, reaction","author_child, author"
3,c2002059062,thebabymonkeysmommy,a828545,08/13/2008,True,reaction,author_child
4,c2002248001,jcailte,a7075,08/20/2008,True,fever,author_child


In [7]:
comments_classified_reaction.to_csv("output/Experiences_AEFI/comments_classified.csv")