## Natural Language Processing with Disaster Tweets

### Step 1: Import necessary libraries

In [53]:
import pandas as pd
import re

In [54]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer


### Step 2: Load and mport the Dataset

In [55]:
data = pd.read_csv('./data/train.csv')
data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


### Step 3: Dataset Overview

In [56]:
data.shape

(7613, 5)

In [57]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [58]:
data.isnull().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

### Step 4 : Preprocessing

**Data Cleaning Steps**

In [59]:
text = data['text'].to_string()

a. Remove newlines and Tabs

In [60]:
# remove new lines \n and replace with space
clean_data = text.replace("\\n", " ") 

# remove tabs
clean_data = clean_data.replace("\\t", " ") 

# remove white spaces
clean_data = re.sub(re.compile(r'\s+'), " ", clean_data) 

b. Remove Punctuation/ Unicode characters/ Special Characters

In [61]:
# encode text to ascii to remove emoji
clean_data = clean_data.encode("ascii", "ignore") 
clean_data = clean_data.decode()

In [62]:
# remove HTML tags
clean_data = re.sub(r'https?://[a-zA-Z0-9\.\/\-_?=;&]*', '', clean_data)
clean_data = re.sub(r'<[^>]+>', '', clean_data)

In [63]:
unwanted_digit = ['0','1','2','3','4','5','6','7','8','9']

for digit in unwanted_digit:
    clean_data = clean_data.replace(digit, "")

In [64]:
unwanted_punc = ['"',"'",'=','@','&','%','.',',',':','\\','$','^','<','>','!','?','{','}',';','\n','\t','(',')','[',']','/','*','+','#','\u200c','\ufeff','-','_','|']

for punc in unwanted_punc:
    clean_data = clean_data.replace(punc, "")

c. Tokenization and remove stopwords

In [65]:
# nltk.download("stopwords")

In [66]:
# step1: tokenization
tokens = word_tokenize(clean_data)

# step2: remove capitalization tokens
normal_tokens = []
for token in tokens:
    normal_tokens.append(token.lower())

# step3: remove stopwords
clean_stop_words_tokens = []
for token in normal_tokens:
    if token not in stopwords.words("english"): 
        clean_stop_words_tokens.append(token)
# print(clean_stop_words_tokens, len(clean_stop_words_tokens))

d. Repeated characters reduction, for example: Hellllo → Hello

In [67]:
# nltk.download("wordnet")

In [68]:
class RepeatReplacer():
    def __init__(self):
        # The beginning and the end char of word can be anything, but there must be a repeated character in the middle
        self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)') 
        self.repl = r'\1\2\3' # 1: start chars + a char, 2: middle char, 3: end chars

    def replace(self, word):
        if(wordnet.synsets(word)): 
            return word # if the word was in wordnet returns that word.
            
        repl_word = self.repeat_regexp.sub(self.repl, word) # Delete a repeated character
        if(repl_word != word): 
            return self.replace(repl_word) # Recursive function
        else: 
            return repl_word # The word is not in word net

In [69]:
replacer = RepeatReplacer()

test_token = "Hellllo"
replacer.replace(word=test_token)

'Hello'

In [70]:
replacer = RepeatReplacer()

clean_repeat_tokens = []
for token in clean_stop_words_tokens:
    clean = replacer.replace(word=token)
    clean_repeat_tokens.append(clean)
    if clean != token: print(token, "=> clean: ", clean)

goooooooaaaaaal => clean:  goal
looooool => clean:  lol
cooool => clean:  cool
bbcmtd => clean:  bcmtd
offi => clean:  ofi
soooo => clean:  so
alexissanchez => clean:  alexisanchez
voortrekker => clean:  vortreker
greetingsr => clean:  gretingsr
falle => clean:  fale
acci => clean:  aci
nashvilletraffic => clean:  nashviletrafic
caraccidentlawyer => clean:  caracidentlawyer
mooresville => clean:  moresvile
iredell => clean:  iredel
sleepjunkies => clean:  slepjunkies
cabrillo => clean:  cabrilo
hwymagellan => clean:  hwymagelan
mooresville => clean:  moresvile
acciden => clean:  aciden
donnie => clean:  donie
ashville => clean:  ashvile
naayf => clean:  nayf
aashiqui => clean:  ashiqui
aggarwal => clean:  agarwal
suffield => clean:  sufield
southaccident => clean:  southacident
icemoon => clean:  icemon
icemoon => clean:  icemon
icemoon => clean:  icemon
icemoon => clean:  icemon
icemoon => clean:  icemon
icemoon => clean:  icemon
icemoon => clean:  icemon
alexalltimelow => clean:  ale

In [71]:
# print(clean_repeat_tokens, len(clean_repeat_tokens))

e. Remove Whitespaces, for instance, He llo → Hello

In [72]:
# remove white spaces and tabs
clean_data = clean_data.replace("\\t", " ")
clean_data = re.sub(re.compile(r'\s+'), " ", clean_data)

f. Typo Correction/ Misspelled words: big “dada” → big “data”

In [73]:
# ! pip install autocorrect

In [74]:
from autocorrect import Speller

speller = Speller()

print(speller("big dada"))

big data


In [75]:
from autocorrect import Speller

speller = Speller()

clean_spell_tokens = []
for token in normal_tokens:
    clean = speller(token)
    if clean != token: print(token, "=> clean spell: ", clean)
    clean_spell_tokens.append(clean)

ronge => clean spell:  range
sask => clean spell:  task
hah => clean spell:  has
bago => clean spell:  ago
bago => clean spell:  ago
wayi => clean spell:  way
cooool => clean spell:  cool
mufc => clean spell:  ufc
theyve => clean spell:  theyre
abia => clean spell:  asia
aliver => clean spell:  alive
offi => clean spell:  off
soooo => clean spell:  soon
preachi => clean spell:  preach
arsonist => clean spell:  agonist
noches => clean spell:  niches
trampling => clean spell:  traveling
turkmen => clean spell:  turkey
revel => clean spell:  level
wmv => clean spell:  wm
greetingsr => clean spell:  greetings
falle => clean spell:  false
navista => clean spell:  vista
somet => clean spell:  some
magn => clean spell:  main
cuz => clean spell:  cup
acci => clean spell:  acc
cabrillo => clean spell:  castillo
acciden => clean spell:  accident
donnie => clean spell:  donne
overturns => clean spell:  overturn
ashville => clean spell:  nashville
motorcyclist => clean spell:  motorcycles
damagenh

In [None]:
#print(clean_spell_tokens, len(clean_spell_tokens))

g. lemmatization

In [None]:
lemmatizer = WordNetLemmatizer() # by default Part of speach is Noun or Name
print(lemmatizer.lemmatize("believes"))

belief


In [None]:
lemmatizer = WordNetLemmatizer() # by default Part of speach is Noun or Name

lem_tokens = []
for token in clean_spell_tokens:
    clean = lemmatizer.lemmatize(token)
    if clean != token: print(token, "=>", clean)
    lem_tokens.append(clean)

deeds => deed
residents => resident
wildfires => wildfire
as => a
causes => cause
theres => there
has => ha
fruits => fruit
was => wa
markets => market
crying => cry
was => wa
cars => car
followers => follower
was => wa
thousands => thousand
retainers => retainer
its => it
deputies => deputy
years => year
niches => niche
kurds => kurd
hearts => heart
was => wa
was => wa
was => wa
thousands => thousand
videos => video
means => mean
greetings => greeting
fires => fire
markets => market
does => doe
kids => kid
pills => pill
was => wa
was => wa
as => a
as => a
was => wa
us => u
motorcycles => motorcycle
dies => dy
damages => damage
years => year
was => wa
has => ha
was => wa
was => wa
leaders => leader
ps => p
wants => want
was => wa
issues => issue
habits => habit
difficulties => difficulty
stands => stand
tells => tell
was => wa
gets => get
experts => expert
experts => expert
members => member
goes => go
wings => wing
wings => wing
experts => expert
experts => expert
nodes => node
expert

In [None]:
print(lem_tokens, len(lem_tokens))



### Step 5: Detemine the Features & Target Variable

### Step 6: Split the DataSet to Train & Test

### Step 7: Train the Model using the X_train and y_train

### Step 8: Predicting Test Data

### Step 9: Evaluating the Model

### Model fitting with K-cross Validation and GridSearchCV