1) Import Libraries

In [1]:
import pandas as pd
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

2) Load Data

In [2]:
data = pd.read_csv("train.csv")
data

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [3]:
df = data[['text', 'target']]
df['text'] = df['text'].str.lower()

df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text'] = df['text'].str.lower()


Unnamed: 0,text,target
0,our deeds are the reason of this #earthquake m...,1
1,forest fire near la ronge sask. canada,1
2,all residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,just got sent this photo from ruby #alaska as ...,1
...,...,...
7608,two giant cranes holding a bridge collapse int...,1
7609,@aria_ahrary @thetawniest the out of control w...,1
7610,m1.94 [01:04 utc]?5km s of volcano hawaii. htt...,1
7611,police investigating after an e-bike collided ...,1


3) Clean Data

In [4]:
url = "https://www.somesite.com This is othe text."

pattern = re.compile(r'https?://\S*')
pattern.findall(url)

['https://www.somesite.com']

In [5]:
pattern.sub("$$$", url)

'$$$ This is othe text.'

In [6]:
df['text'] = df['text'].apply(lambda x: pattern.sub("", x))
df.head(50)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text'] = df['text'].apply(lambda x: pattern.sub("", x))


Unnamed: 0,text,target
0,our deeds are the reason of this #earthquake m...,1
1,forest fire near la ronge sask. canada,1
2,all residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,just got sent this photo from ruby #alaska as ...,1
5,#rockyfire update => california hwy. 20 closed...,1
6,#flood #disaster heavy rain causes flash flood...,1
7,i'm on top of the hill and i can see a fire in...,1
8,there's an emergency evacuation happening now ...,1
9,i'm afraid that the tornado is coming to our a...,1


In [7]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [8]:
text = "Hello $# world ?% find *() this <>"
pattern2 = re.compile(r'[^\w\s]')
pattern2.findall(text)

['$', '#', '?', '%', '*', '(', ')', '<', '>']

In [9]:
df['text'] = df['text'].apply(lambda x: pattern2.sub("", x))
df.head(50)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text'] = df['text'].apply(lambda x: pattern2.sub("", x))


Unnamed: 0,text,target
0,our deeds are the reason of this earthquake ma...,1
1,forest fire near la ronge sask canada,1
2,all residents asked to shelter in place are be...,1
3,13000 people receive wildfires evacuation orde...,1
4,just got sent this photo from ruby alaska as s...,1
5,rockyfire update california hwy 20 closed in ...,1
6,flood disaster heavy rain causes flash floodin...,1
7,im on top of the hill and i can see a fire in ...,1
8,theres an emergency evacuation happening now i...,1
9,im afraid that the tornado is coming to our area,1


4) Stemming of the vocabulary

In [10]:
[_ for _ in text.split()]

['Hello', '$#', 'world', '?%', 'find', '*()', 'this', '<>']

In [11]:
df['text'] = df['text'].apply(lambda x: [_ for _ in x.split()])
df['text'].head(20)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text'] = df['text'].apply(lambda x: [_ for _ in x.split()])


0     [our, deeds, are, the, reason, of, this, earth...
1         [forest, fire, near, la, ronge, sask, canada]
2     [all, residents, asked, to, shelter, in, place...
3     [13000, people, receive, wildfires, evacuation...
4     [just, got, sent, this, photo, from, ruby, ala...
5     [rockyfire, update, california, hwy, 20, close...
6     [flood, disaster, heavy, rain, causes, flash, ...
7     [im, on, top, of, the, hill, and, i, can, see,...
8     [theres, an, emergency, evacuation, happening,...
9     [im, afraid, that, the, tornado, is, coming, t...
10    [three, people, died, from, the, heat, wave, s...
11    [haha, south, tampa, is, getting, flooded, hah...
12    [raining, flooding, florida, tampabay, tampa, ...
13        [flood, in, bago, myanmar, we, arrived, bago]
14    [damage, to, school, bus, on, 80, in, multi, c...
15                                     [whats, up, man]
16                                    [i, love, fruits]
17                                 [summer, is, 

In [12]:
# nltk.download('stopwords')
eng_stop = stopwords.words('English')
eng_stop

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [13]:
[_ for _ in text.split() if _ not in eng_stop]

['Hello', '$#', 'world', '?%', 'find', '*()', '<>']

In [14]:
df['text'] = df['text'].apply(lambda x: [_ for _ in x if _ not in eng_stop])
df['text'].head(20)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text'] = df['text'].apply(lambda x: [_ for _ in x if _ not in eng_stop])


0     [deeds, reason, earthquake, may, allah, forgiv...
1         [forest, fire, near, la, ronge, sask, canada]
2     [residents, asked, shelter, place, notified, o...
3     [13000, people, receive, wildfires, evacuation...
4     [got, sent, photo, ruby, alaska, smoke, wildfi...
5     [rockyfire, update, california, hwy, 20, close...
6     [flood, disaster, heavy, rain, causes, flash, ...
7                     [im, top, hill, see, fire, woods]
8     [theres, emergency, evacuation, happening, bui...
9                   [im, afraid, tornado, coming, area]
10               [three, people, died, heat, wave, far]
11    [haha, south, tampa, getting, flooded, hah, wa...
12    [raining, flooding, florida, tampabay, tampa, ...
13                [flood, bago, myanmar, arrived, bago]
14    [damage, school, bus, 80, multi, car, crash, b...
15                                         [whats, man]
16                                       [love, fruits]
17                                     [summer, 

In [15]:
st = PorterStemmer()

In [16]:
st.stem("running")

'run'

In [17]:
[st.stem(_) for _ in text.split()]

['hello', '$#', 'world', '?%', 'find', '*()', 'thi', '<>']

In [18]:
df['text'] = df['text'].apply(lambda x: [st.stem(_) for _ in x])
df['text'].head(20)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text'] = df['text'].apply(lambda x: [st.stem(_) for _ in x])


0     [deed, reason, earthquak, may, allah, forgiv, us]
1          [forest, fire, near, la, rong, sask, canada]
2     [resid, ask, shelter, place, notifi, offic, ev...
3     [13000, peopl, receiv, wildfir, evacu, order, ...
4     [got, sent, photo, rubi, alaska, smoke, wildfi...
5     [rockyfir, updat, california, hwi, 20, close, ...
6     [flood, disast, heavi, rain, caus, flash, floo...
7                      [im, top, hill, see, fire, wood]
8     [there, emerg, evacu, happen, build, across, s...
9                     [im, afraid, tornado, come, area]
10                 [three, peopl, die, heat, wave, far]
11    [haha, south, tampa, get, flood, hah, wait, se...
12    [rain, flood, florida, tampabay, tampa, 18, 19...
13                  [flood, bago, myanmar, arriv, bago]
14    [damag, school, bu, 80, multi, car, crash, break]
15                                          [what, man]
16                                        [love, fruit]
17                                       [summer

5) Create Vocabulary Dictionary

In [19]:
dictionary = []
for rows in df['text']:
    for word in rows:
        dictionary.append(word)

dictionary = list(set(dictionary))
dictionary

['goblu',
 'high',
 'pope',
 'mug',
 'morevoic',
 'whirlwind',
 'parter',
 'small',
 'deadgirltalk',
 'woe',
 'kasiakosek',
 'poconorecord',
 'nypd',
 'mogacola',
 'feelslikefob',
 'contrast',
 'nathan',
 'ju',
 'unsaf',
 'first',
 'baseman',
 'rabbit',
 'everybodi',
 'ward',
 'michaelwestbiz',
 'counselor',
 'honeybunzgem',
 'pic',
 'p45perez',
 'sfgiant',
 'b5',
 'stonewal',
 'slimebeast',
 'ådaniel',
 'afghetcleft',
 'ironmanå',
 'extermin',
 'manag',
 'motogp',
 'robsimss',
 'quiz',
 'jeannamibian',
 'gold',
 'grudg',
 'inj',
 '900',
 'loughe',
 'strait',
 'segment',
 'topic',
 'ir',
 'pixelcanuck',
 'steamship',
 'djicemoon',
 'demolish',
 'hairbut',
 'juic',
 'end',
 'badirand',
 'peal',
 'blais',
 'akilah',
 'behindashield',
 'sl',
 'reid',
 'nitishkumar',
 'blitz',
 'bloopandablast',
 'kcrw',
 'windmi',
 'imsort',
 'basket',
 'sarah',
 'biggangvh1',
 'retir',
 'rise',
 '1rockstar62',
 'gray',
 'downgrad',
 'udhampuragain',
 'litani',
 'societi',
 'push2left',
 'philippi',
 'va'

In [20]:
len(dictionary)

14969

6) Label Encode words in dictionary.

In [21]:
word_set = {}
for i, w in enumerate(dictionary):
    word_set[w] = i

In [22]:
word_set

{'goblu': 0,
 'high': 1,
 'pope': 2,
 'mug': 3,
 'morevoic': 4,
 'whirlwind': 5,
 'parter': 6,
 'small': 7,
 'deadgirltalk': 8,
 'woe': 9,
 'kasiakosek': 10,
 'poconorecord': 11,
 'nypd': 12,
 'mogacola': 13,
 'feelslikefob': 14,
 'contrast': 15,
 'nathan': 16,
 'ju': 17,
 'unsaf': 18,
 'first': 19,
 'baseman': 20,
 'rabbit': 21,
 'everybodi': 22,
 'ward': 23,
 'michaelwestbiz': 24,
 'counselor': 25,
 'honeybunzgem': 26,
 'pic': 27,
 'p45perez': 28,
 'sfgiant': 29,
 'b5': 30,
 'stonewal': 31,
 'slimebeast': 32,
 'ådaniel': 33,
 'afghetcleft': 34,
 'ironmanå': 35,
 'extermin': 36,
 'manag': 37,
 'motogp': 38,
 'robsimss': 39,
 'quiz': 40,
 'jeannamibian': 41,
 'gold': 42,
 'grudg': 43,
 'inj': 44,
 '900': 45,
 'loughe': 46,
 'strait': 47,
 'segment': 48,
 'topic': 49,
 'ir': 50,
 'pixelcanuck': 51,
 'steamship': 52,
 'djicemoon': 53,
 'demolish': 54,
 'hairbut': 55,
 'juic': 56,
 'end': 57,
 'badirand': 58,
 'peal': 59,
 'blais': 60,
 'akilah': 61,
 'behindashield': 62,
 'sl': 63,
 'rei