In [1]:
import pandas as pd
import numpy as np
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import re
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_json('tweets.json',orient='index')
df.head()

Unnamed: 0,tweet_author,tweet_text
2013-07-18 09:39:46.071961602,Hematopoiesis News,⚕️ Scientists conducted a Phase II study of ac...
2013-07-17 03:40:32.173842437,"Michael Wang, MD",This phase 2 Acalabrutinib-Venetoclax (AV) tri...
2013-07-15 15:41:16.553048065,1stOncology,#NICE backs #AstraZenecas #Calquence for #CLL ...
2013-07-12 19:19:42.367813635,Toby Eyre,#acalabrutinib is a valuable option in pts int...
2013-07-04 12:40:34.334232586,Lymphoma Hub,NICE has recommended the use of acalabrutinib ...


# Preprocessing and Cleaning

### Lower case conversion

In [3]:
df['tweet_text'] = df['tweet_text'].apply(lambda x: x.lower())

### Contraction to Expansion

In [4]:
contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how does",
"i'd": "i would",
"i'd've": "i would have",
"i'll": "i will",
"i'll've": "i will have",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so is",
"that'd": "that would",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
" u ": " you ",
" ur ": " your ",
" n ": " and "}

In [5]:
def cont_to_exp(x):
    if type(x) is str:
        for key in contractions:
            value = contractions[key]
            x = x.replace(key, value)
        return x
    else:
        return x

    
df['tweet_text'] = df['tweet_text'].apply(lambda x: cont_to_exp(x))

### Remove Emails, URL, RT, Multiple Spaces, HTML Tags, Special Character & Accented Character

In [6]:
df['tweet_text'] = df['tweet_text'].apply(lambda x: re.sub(r'([a-zA-Z0-9+._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)', '', x))

In [7]:
df['tweet_text'] = df['tweet_text'].apply(lambda x: re.sub(r'(http|ftp|https)://([\w_-]+ \
                                                           (?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', '', x))

In [8]:
df['tweet_text'] = df['tweet_text'].apply(lambda x: re.sub('RT', "", x))

In [9]:
df['tweet_text'] = df['tweet_text'].apply(lambda x: " ".join(x.split()))

In [10]:
from bs4 import BeautifulSoup
df['tweet_text'] = df['tweet_text'].apply(lambda x: BeautifulSoup(x, 'lxml').get_text())

In [11]:
df['tweet_text'] = df['tweet_text'].apply(lambda x: re.sub('[^A-Z a-z 0-9-]+', '', x))

In [12]:
import unicodedata
def remove_accented_chars(x):
    x = unicodedata.normalize('NFKD', x).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return x

df['tweet_text'] = df['tweet_text'].apply(lambda x: remove_accented_chars(x))

In [13]:
df['tweet_text'] = df['tweet_text'].apply(lambda x: ' '.join(['' if i.startswith('http') or i=='-' or 
                                                              i.isdigit() else i for i in x.split()]))

In [14]:
df.head()

Unnamed: 0,tweet_author,tweet_text
2013-07-18 09:39:46.071961602,Hematopoiesis News,scientists conducted a phase ii study of acala...
2013-07-17 03:40:32.173842437,"Michael Wang, MD",this phase acalabrutinib-venetoclax av trial ...
2013-07-15 15:41:16.553048065,1stOncology,nice backs astrazenecas calquence for cll
2013-07-12 19:19:42.367813635,Toby Eyre,acalabrutinib is a valuable option in pts into...
2013-07-04 12:40:34.334232586,Lymphoma Hub,nice has recommended the use of acalabrutinib ...


# NLP

### Remove Stop Words & Convert to Base Root

In [15]:
df['tweet'] = df['tweet_text'].apply(lambda x: " ".join([t for t in x.split() if t not in STOP_WORDS]))

In [31]:
nlp = spacy.load('en_core_web_lg')

def make_to_base(x):
    x_list = []
    doc = nlp(x)
    
    for token in doc:
        lemma = str(token.lemma_)
        if lemma == '-PRON-' or lemma == 'be':
            lemma = token.text
        x_list.append(lemma)
    return " ".join(x_list)

df['tweet'] = df['tweet'].apply(lambda x: make_to_base(x))

### Entity Recognition

In [32]:
def entity(x):    
    word = []
    doc = nlp(x)
    for ent in doc.ents:
        word.append(str(ent))
        x = re.sub(str(ent),'',x)    
    doc = nlp(x)
    for noun in doc.noun_chunks:
        word.append(str(noun))
    return word

In [33]:
x = 'Pink Pearl Apples are tasty but Empire Apples are not. Empire Apples are very tasty. Pink Pearl Apples are not tasty.\
             Pink Pearl Apples smells really good.'
# x = " ".join([t for t in x.split() if t not in STOP_WORDS])
from spacy import displacy
doc = nlp(x)
displacy.render(doc, style='dep', options={'compact':True, 'distance': 100})

In [34]:
pd.DataFrame(entity(x)).value_counts()

Pink Pearl Apples    3
Empire Apples        2
dtype: int64

## Objective 1

In [35]:
df['entity'] = df['tweet'].apply(lambda x: entity(x))

In [36]:
words = []
for i in df.entity:
    words.extend(i)

In [37]:
df1 = pd.DataFrame(pd.DataFrame(words).value_counts(),columns=['Frequency']).reset_index().rename(columns={0: 'Entity'})  \
                                                                                                  .set_index(['Entity'])

In [134]:
df1.to_csv('objective1.csv')

## Objective 2

In [24]:
from textblob import TextBlob

In [38]:
def polarity(x):
    doc = nlp(x)
    pos = [str(token) for token in doc if 'CONJ' in token.pos_]
    
    if len(pos) == 0:
        return [[x],[TextBlob(x).polarity]]
    else:
        b = []
        for i in range(len(pos)):
            if i==0:
                b.append(x.split(pos[i],maxsplit=1)) 
            else:
                b.append(b[i-1][1].split(pos[i],maxsplit=1))
        f= []
        for i in range(len(b)):
            if i < len(b)-1:
                f.append(b[i][0])
            else:
                f.extend(b[i])
        blob = [TextBlob(i) for i in f]
        return f, [i.polarity for i in blob]
    
def entity_polarity(x,y):
    p=[]
    for i in y:
        for index,j in enumerate(polarity(x)[0]):
            j = " ".join([make_to_base(t) for t in j.split() if t not in STOP_WORDS])
            if i in j:
                p.append(polarity(x)[1][index])
                break
    return p

In [45]:
df['polarity'] = df[['tweet_text','entity']].apply(lambda x: entity_polarity(x['tweet_text'],x['entity']),axis=1)

In [46]:
df.head()

Unnamed: 0,tweet_author,tweet_text,tweet,entity,polarity
2013-07-18 09:39:46.071961602,Hematopoiesis News,scientists conducted a phase ii study of acala...,scientist conduct phase ii study acalabrutinib...,"[scientist conduct phase, acalabrutinib, overa...","[0.0, 0.0, 0.0]"
2013-07-17 03:40:32.173842437,"Michael Wang, MD",this phase acalabrutinib-venetoclax av trial ...,phase acalabrutinib - venetoclax av trial recr...,"[phase, acalabrutinib, work mcl patient relapse]","[0.0, 0.0, 0.0]"
2013-07-15 15:41:16.553048065,1stOncology,nice backs astrazenecas calquence for cll,nice back astrazenecas calquence cll,[nice back astrazenecas calquence cll],[0.6]
2013-07-12 19:19:42.367813635,Toby Eyre,acalabrutinib is a valuable option in pts into...,acalabrutinib valuable option pt intolerant ib...,[valuable option pt intolerant ibrutinib valua...,"[0.05, 0.05]"
2013-07-04 12:40:34.334232586,Lymphoma Hub,nice has recommended the use of acalabrutinib ...,nice recommend use acalabrutinib patient treat...,"[nice recommend, acalabrutinib, patient treatm...","[0.55, 0.55, 0.55, 0.55]"


In [121]:
d=[]
for i in range(df.shape[0]):
    for j in range(min(len(df.polarity[i]),len(df.entity[i]))):
        d.append([df.entity[i][j],df.tweet_author[i],df.polarity[i][j]])

In [131]:
def text_pol(x):
    if x>0:
        return 'Positive'
    elif x<0:
        return 'Negative'
    else:
        return 'Neutral'
df2 = pd.DataFrame(d,columns=['Entity','Author','Polarity1'])
df2['Polarity'] = df2['Polarity1'].apply(lambda x: text_pol(x))
df2.drop(['Polarity1'],inplace=True,axis=1)

In [135]:
df2.to_csv('objective2.csv')