Importing required Libraries

In [1]:
import numpy as np
import pandas as pd
import re

######### Raw Data Dump

In [2]:
text_data = pd.read_csv("Text_Data.csv")

In [3]:
text_data.head(2)

Unnamed: 0,VAX_MANU,Symptoms
0,MODERNA,Dysphagia - Epiglottitis - - -
1,MODERNA,Anxiety - Dyspnoea - - -


In [4]:
text_data = text_data.rename(columns = {'Symptoms':'text'})

In [5]:
text_data.shape

(660415, 2)

Pre Processing - Removing unwanted Characters

Pre Processing
Remove Punctuations

In [6]:
text_data.iloc[:,1] = text_data.iloc[:,1].str.replace(r'https(\S)+', r' ')
text_data.iloc[:,1] = text_data.iloc[:,1].str.replace(r'http ...', r' ')

text_data.iloc[:,1] = text_data.iloc[:,1].str.replace(r'(RT|rt)[ ]*@[ ]*[\S]+',r' ')
text_data.iloc[:,1] = text_data.iloc[:,1].str.replace(r'@[\S]+',r' ')

text_data.iloc[:,1] = text_data.iloc[:,1].str.replace('http', '')

In [7]:
text_data.head(5)

Unnamed: 0,VAX_MANU,text
0,MODERNA,Dysphagia - Epiglottitis - - -
1,MODERNA,Anxiety - Dyspnoea - - -
2,PFIZER\BIONTECH,Chest discomfort - Dysphagia - Pain in extremi...
3,MODERNA,Dizziness - Fatigue - Mobility decreased - -
4,MODERNA,Injection site erythema - Injection site pruri...


In [8]:
PUNCT_TO_REMOVE = '!"#$%&\'()*+,-./:;<=>?@[\\]^_{|}~`'

In [9]:
def remove_punctuation(text):
    return text.translate(str.maketrans(' ', ' ', PUNCT_TO_REMOVE))

In [10]:
text_data.iloc[:,1] = text_data.iloc[:,1].apply(lambda text: remove_punctuation(text))

In [11]:
all_words = list(text_data.iloc[:,1].str.lower().str.split(' ', expand=True).stack().unique())

In [12]:
text_data['text'].head(10)

0                        Dysphagia  Epiglottitis      
1                              Anxiety  Dyspnoea      
2    Chest discomfort  Dysphagia  Pain in extremity...
3           Dizziness  Fatigue  Mobility decreased    
4    Injection site erythema  Injection site prurit...
5                          Pharyngeal swelling        
6           Abdominal pain  Chills  Sleep disorder    
7                    Diarrhoea  Nasal congestion      
8    Vaccination site erythema  Vaccination site pr...
9                                Rash  Urticaria      
Name: text, dtype: object

Remove most occuring words

In [13]:
from collections import Counter
cnt = Counter()
for text in text_data['text'].values:
    for word in text.split():
        cnt[word] += 1

In [14]:
cnt.most_common(16)

[('site', 162886),
 ('Injection', 136989),
 ('Pain', 118228),
 ('pain', 112070),
 ('Headache', 92989),
 ('Pyrexia', 77052),
 ('Fatigue', 76091),
 ('Chills', 69590),
 ('test', 66103),
 ('Dizziness', 55245),
 ('in', 54439),
 ('Nausea', 53891),
 ('swelling', 49820),
 ('Rash', 48707),
 ('extremity', 47606),
 ('Blood', 44000)]

In [15]:
FREQWORDS = set([w for (w, wc) in cnt.most_common(25)])
def remove_freqwords(text):
    """custom function to remove the frequent words"""
    return " ".join([word for word in str(text).split() if word not in FREQWORDS])

In [16]:
text_data['text'] = text_data['text'].apply(lambda text: remove_freqwords(text))

In [17]:
text_data['text'].head(10)

0                               Dysphagia Epiglottitis
1                                     Anxiety Dyspnoea
2               discomfort Dysphagia Visual impairment
3                                   Mobility decreased
4                             erythema pruritus warmth
5                                           Pharyngeal
6                             Abdominal Sleep disorder
7                           Diarrhoea Nasal congestion
8    Vaccination erythema Vaccination pruritus Vacc...
9                                            Urticaria
Name: text, dtype: object

Remove rare words

In [18]:
n_rare_words = 10
RAREWORDS = set([w for (w, wc) in cnt.most_common()[:-n_rare_words-1:-1]])
def remove_rarewords(text):
    return " ".join([word for word in str(text).split() if word not in RAREWORDS])

In [19]:
text_data['text'] = text_data['text'].apply(lambda text: remove_rarewords(text))

In [20]:
text_data['text'].head(10)

0                               Dysphagia Epiglottitis
1                                     Anxiety Dyspnoea
2               discomfort Dysphagia Visual impairment
3                                   Mobility decreased
4                             erythema pruritus warmth
5                                           Pharyngeal
6                             Abdominal Sleep disorder
7                           Diarrhoea Nasal congestion
8    Vaccination erythema Vaccination pruritus Vacc...
9                                            Urticaria
Name: text, dtype: object

Lemmatize

In [21]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

In [22]:
import nltk
!nltk.download('book')

!nltk.download('all-corpora')

In [23]:
lemmatizer = WordNetLemmatizer()
def lemmatize_words(text):
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

In [24]:
text_data['text'] = text_data['text'].apply(lambda text: lemmatize_words(text))

In [25]:
lemmatizer = WordNetLemmatizer()
wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}

In [26]:
def lemmatize_words(text):
    pos_tagged_text = nltk.pos_tag(text.split())
    return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tagged_text])

In [27]:
text_data['text'] = text_data['text'].apply(lambda text: lemmatize_words(text))

In [28]:
text_data.head(10)

Unnamed: 0,VAX_MANU,text
0,MODERNA,Dysphagia Epiglottitis
1,MODERNA,Anxiety Dyspnoea
2,PFIZER\BIONTECH,discomfort Dysphagia Visual impairment
3,MODERNA,Mobility decrease
4,MODERNA,erythema pruritus warmth
5,MODERNA,Pharyngeal
6,MODERNA,Abdominal Sleep disorder
7,MODERNA,Diarrhoea Nasal congestion
8,MODERNA,Vaccination erythema Vaccination pruritus Vacc...
9,MODERNA,Urticaria


Removal of Stopwords

In [29]:
from nltk.corpus import stopwords
", ".join(stopwords.words('english'))

"i, me, my, myself, we, our, ours, ourselves, you, you're, you've, you'll, you'd, your, yours, yourself, yourselves, he, him, his, himself, she, she's, her, hers, herself, it, it's, its, itself, they, them, their, theirs, themselves, what, which, who, whom, this, that, that'll, these, those, am, is, are, was, were, be, been, being, have, has, had, having, do, does, did, doing, a, an, the, and, but, if, or, because, as, until, while, of, at, by, for, with, about, against, between, into, through, during, before, after, above, below, to, from, up, down, in, out, on, off, over, under, again, further, then, once, here, there, when, where, why, how, all, any, both, each, few, more, most, other, some, such, no, nor, not, only, own, same, so, than, too, very, s, t, can, will, just, don, don't, should, should've, now, d, ll, m, o, re, ve, y, ain, aren, aren't, couldn, couldn't, didn, didn't, doesn, doesn't, hadn, hadn't, hasn, hasn't, haven, haven't, isn, isn't, ma, mightn, mightn't, mustn, mus

In [30]:
STOPWORDS = set(stopwords.words('english'))

In [31]:
def remove_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

In [32]:
text_data['text'] = text_data['text'].apply(lambda text: remove_stopwords(text))

In [33]:
text_data['text'].head(10)

0                               Dysphagia Epiglottitis
1                                     Anxiety Dyspnoea
2               discomfort Dysphagia Visual impairment
3                                    Mobility decrease
4                             erythema pruritus warmth
5                                           Pharyngeal
6                             Abdominal Sleep disorder
7                           Diarrhoea Nasal congestion
8    Vaccination erythema Vaccination pruritus Vacc...
9                                            Urticaria
Name: text, dtype: object

Converting Chat Words

**** Defining the Dictionary

In [34]:
chat_words_str = """
AFAIK=As Far As I Know
AFK=Away From Keyboard
ASAP=As Soon As Possible
ATK=At The Keyboard
ATM=At The Moment
A3=Anytime, Anywhere, Anyplace
BAK=Back At Keyboard
BBL=Be Back Later
BBS=Be Back Soon
BFN=Bye For Now
B4N=Bye For Now
BRB=Be Right Back
BRT=Be Right There
BTW=By The Way
B4=Before
B4N=Bye For Now
CU=See You
CUL8R=See You Later
CYA=See You
FAQ=Frequently Asked Questions
FC=Fingers Crossed
FWIW=For What It's Worth
FYI=For Your Information
GAL=Get A Life
GG=Good Game
GN=Good Night
GMTA=Great Minds Think Alike
GR8=Great!
G9=Genius
IC=I See
ICQ=I Seek you (also a chat program)
ILU=ILU: I Love You
IMHO=In My Honest/Humble Opinion
IMO=In My Opinion
IOW=In Other Words
IRL=In Real Life
KISS=Keep It Simple, Stupid
LDR=Long Distance Relationship
LMAO=Laugh My A.. Off
LOL=Laughing Out Loud
LTNS=Long Time No See
L8R=Later
MTE=My Thoughts Exactly
M8=Mate
NRN=No Reply Necessary
OIC=Oh I See
PITA=Pain In The A..
PRT=Party
PRW=Parents Are Watching
ROFL=Rolling On The Floor Laughing
ROFLOL=Rolling On The Floor Laughing Out Loud
ROTFLMAO=Rolling On The Floor Laughing My A.. Off
SK8=Skate
STATS=Your sex and age
ASL=Age, Sex, Location
THX=Thank You
TTFN=Ta-Ta For Now!
TTYL=Talk To You Later
U=You
U2=You Too
U4E=Yours For Ever
WB=Welcome Back
WTF=What The F...
WTG=Way To Go!
WUF=Where Are You From?
W8=Wait...
7K=Sick:-D Laugher
"""

In [35]:
chat_words_map_dict = {}
chat_words_list = []

In [36]:
for line in chat_words_str.split("\n"):
    if line != "":
        cw = line.split("=")[0]
        cw_expanded = line.split("=")[1]
        chat_words_list.append(cw)
        chat_words_map_dict[cw] = cw_expanded
chat_words_list = set(chat_words_list)

In [37]:
def chat_words_conversion(text):
    new_text = []
    for w in text.split():
        if w.upper() in chat_words_list:
            new_text.append(chat_words_map_dict[w.upper()])
        else:
            new_text.append(w)
    return " ".join(new_text)

In [38]:
text_data['text'] = text_data['text'].apply(lambda text: chat_words_conversion(text))

In [39]:
text_data['text'].head(10)

0                               Dysphagia Epiglottitis
1                                     Anxiety Dyspnoea
2               discomfort Dysphagia Visual impairment
3                                    Mobility decrease
4                             erythema pruritus warmth
5                                           Pharyngeal
6                             Abdominal Sleep disorder
7                           Diarrhoea Nasal congestion
8    Vaccination erythema Vaccination pruritus Vacc...
9                                            Urticaria
Name: text, dtype: object

Some more Cleansing using RE Library

In [40]:
def clean_text(text):
    text = re.sub(r'[^a-zA-Z\']', ' ', text)
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    text = text.lower()       
    return text

In [41]:
text_data['text'] = text_data['text'].apply(lambda text: clean_text(text))

In [42]:
text_data['text'].head(10)

0                               dysphagia epiglottitis
1                                     anxiety dyspnoea
2               discomfort dysphagia visual impairment
3                                    mobility decrease
4                             erythema pruritus warmth
5                                           pharyngeal
6                             abdominal sleep disorder
7                           diarrhoea nasal congestion
8    vaccination erythema vaccination pruritus vacc...
9                                            urticaria
Name: text, dtype: object

In [43]:
num_words = text_data['text'].apply(lambda x: len(x.split()))
num_words_mean, num_words_std = np.mean(num_words), np.std(num_words)

num_sentences = text_data['text'].apply(lambda x: len(re.split( '~ ...' ,'~'.join(x.split('.')))))
num_sentences_mean = np.mean(num_sentences)

Remove extra spaces and Words less than 3 letters

In [44]:
text_data['text'] = text_data['text'].str.replace(r'[ ]{2, }',r' ')
text_data['text'] = text_data['text'].str.replace(r'\b\w{1,3}\b', r'')

In [45]:
text_data.head(10)

Unnamed: 0,VAX_MANU,text
0,MODERNA,dysphagia epiglottitis
1,MODERNA,anxiety dyspnoea
2,PFIZER\BIONTECH,discomfort dysphagia visual impairment
3,MODERNA,mobility decrease
4,MODERNA,erythema pruritus warmth
5,MODERNA,pharyngeal
6,MODERNA,abdominal sleep disorder
7,MODERNA,diarrhoea nasal congestion
8,MODERNA,vaccination erythema vaccination pruritus vacc...
9,MODERNA,urticaria


In [46]:
text_data['text_length'] = [len(text.split(' ')) for text in text_data.text]
print(text_data.shape)

(660415, 3)


Dropping texts with length <3 and drop duplicates

In [47]:
text_data = text_data[text_data['text_length']>2]
text_data = text_data.drop_duplicates(subset=['text'])

print(text_data.shape)

(211327, 3)


In [48]:
text_data.head(10)

Unnamed: 0,VAX_MANU,text,text_length
2,PFIZER\BIONTECH,discomfort dysphagia visual impairment,4
4,MODERNA,erythema pruritus warmth,3
6,MODERNA,abdominal sleep disorder,3
7,MODERNA,diarrhoea nasal congestion,3
8,MODERNA,vaccination erythema vaccination pruritus vacc...,5
10,MODERNA,pressure decrease confusional state decreased ...,6
11,MODERNA,dyspnoea head discomfort,3
12,MODERNA,heart rate decrease heart rate hypertension mu...,8
16,MODERNA,abdominal upper dysgeusia,3
17,MODERNA,pressure discomfort heart rate,4


In [49]:
text_data.to_csv('Cleaned_Text.csv')