In [5]:
import numpy as np
import pandas as pd
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer

# Cap rendered DataFrame/Series output at 20 rows so displayed frames stay short.
pd.options.display.max_rows = 20


In [6]:
import os
import sys

# Put the parent of the working directory on sys.path so that the
# `lib` package imported right after this block can be resolved
# (presumably `lib` lives in that parent directory — confirm).
current_path = os.getcwd()
print(f"current path {current_path}")
relative_path = os.path.dirname(current_path)
print(f"root path {relative_path}")
# Guard the append: re-running this cell must not stack duplicate entries.
if relative_path not in sys.path:
    sys.path.append(relative_path)


from lib.preprocessing import *


current path /home/mbarbaric/dev/python/kaggle
root path /home/mbarbaric/dev/python


## Preprocessing Methods

In [7]:
import re

def preprocess_test(x: str) -> str:
    """Return ``x`` lower-cased with space characters trimmed from both ends.

    Only the literal space character is trimmed; tabs, newlines and any
    punctuation inside the string are left exactly as they are.
    """
    return x.lower().strip(' ')


## Train Data Analysis

In [8]:
# Load the training set; the path is relative to the kernel's working directory.
train_data = pd.read_csv(filepath_or_buffer='train.csv')
train_data

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [9]:
# Number of missing values in each column (sum taken down the rows).
train_data.isna().sum(axis=0)

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [10]:
# Frequency of each keyword, most common first; rows without a keyword are not counted.
train_data['keyword'].value_counts(dropna=True)

keyword
fatalities               45
deluge                   42
armageddon               42
sinking                  41
damage                   41
                         ..
forest%20fire            19
epicentre                12
threat                   11
inundation               10
radiation%20emergency     9
Name: count, Length: 221, dtype: int64

### Text Column Analysis

In [11]:
# Clean every tweet. preprocess_text is not defined in the visible cells —
# presumably it is supplied by the `lib.preprocessing` star import.
text = train_data['text'].apply(preprocess_text)
text

0       our deeds are the reason of this earthquake ma...
1                   forest fire near la ronge sask canada
2       all residents asked to shelter in place are be...
3       13000 people receive wildfires evacuation orde...
4       just got sent this photo from ruby alaska as s...
                              ...                        
7608    two giant cranes holding a bridge collapse int...
7609    ariaahrary thetawniest the out of control wild...
7610    m194 0104 utc5km s of volcano hawaii httptcozd...
7611    police investigating after an ebike collided w...
7612    the latest more homes razed by northern califo...
Name: text, Length: 7613, dtype: object

In [12]:
# Flatten the cleaned tweets into [sentence_id, token] pairs, where
# sentence_id is the tweet's position in text_list.
text_list = text.to_list()
words = [
    [sentence_id, token]
    for sentence_id, sentence in enumerate(text_list)
    for token in sentence.split(' ')
    if token  # consecutive spaces make split(' ') emit empty strings; skip them
]
# NOTE(review): this displays the entire list; consider slicing it to keep the output short.
words

[[0, 'our'],
 [0, 'deeds'],
 [0, 'are'],
 [0, 'the'],
 [0, 'reason'],
 [0, 'of'],
 [0, 'this'],
 [0, 'earthquake'],
 [0, 'may'],
 [0, 'allah'],
 [0, 'forgive'],
 [0, 'us'],
 [0, 'all'],
 [1, 'forest'],
 [1, 'fire'],
 [1, 'near'],
 [1, 'la'],
 [1, 'ronge'],
 [1, 'sask'],
 [1, 'canada'],
 [2, 'all'],
 [2, 'residents'],
 [2, 'asked'],
 [2, 'to'],
 [2, 'shelter'],
 [2, 'in'],
 [2, 'place'],
 [2, 'are'],
 [2, 'being'],
 [2, 'notified'],
 [2, 'by'],
 [2, 'officers'],
 [2, 'no'],
 [2, 'other'],
 [2, 'evacuation'],
 [2, 'or'],
 [2, 'shelter'],
 [2, 'in'],
 [2, 'place'],
 [2, 'orders'],
 [2, 'are'],
 [2, 'expected'],
 [3, '13000'],
 [3, 'people'],
 [3, 'receive'],
 [3, 'wildfires'],
 [3, 'evacuation'],
 [3, 'orders'],
 [3, 'in'],
 [3, 'california'],
 [4, 'just'],
 [4, 'got'],
 [4, 'sent'],
 [4, 'this'],
 [4, 'photo'],
 [4, 'from'],
 [4, 'ruby'],
 [4, 'alaska'],
 [4, 'as'],
 [4, 'smoke'],
 [4, 'from'],
 [4, 'wildfires'],
 [4, 'pours'],
 [4, 'into'],
 [4, 'a'],
 [4, 'school'],
 [5, 'rockyfire'],


In [13]:
# One row per token, keyed by the sentence it came from.
word_frame = pd.DataFrame(
    data=words,
    columns=['sentence_id', 'token'],
)
word_frame

Unnamed: 0,sentence_id,token
0,0,our
1,0,deeds
2,0,are
3,0,the
4,0,reason
...,...,...
110749,7612,california
110750,7612,wildfire
110751,7612,abc
110752,7612,news


In [14]:
# Count each distinct (sentence_id, token) pair, i.e. how often a token
# repeats inside a single sentence.
# NOTE(review): if corpus-wide token frequency was intended, count the
# 'token' column alone instead — confirm.
word_frame.value_counts(subset=['sentence_id', 'token'])

sentence_id  token      
7472         wreck          13
3742         on              6
             fire            6
6490         his             5
1064         the             5
                            ..
2592         meats           1
             italian         1
             incinerator     1
             food            1
7612         wildfire        1
Name: count, Length: 105459, dtype: int64

In [15]:
# Keep only the tokens that contain the substring 'http'.
none_words = word_frame[word_frame['token'].str.contains('http')]
none_words

Unnamed: 0,sentence_id,token
254,31,httptcolhyxeohy6c
264,32,httptcoyao1e0xngw
273,33,httptco2nndbgwyei
293,35,httptcoqqsmshaj3n
321,37,httptco3imaomknna
...,...,...
110682,7607,httptco3sicroaanz
110683,7607,httptcoi27oa0hisp
110694,7608,httptcostfmbbzfb5
110722,7610,httptcozdtoyd8ebj


In [16]:
# Raw text of the tweets whose target label is 1, selected with a single
# .loc[rows, column] lookup.
is_real_disaster = train_data.loc[train_data['target'] == 1, 'text']
is_real_disaster

0       Our Deeds are the Reason of this #earthquake M...
1                  Forest fire near La Ronge Sask. Canada
2       All residents asked to 'shelter in place' are ...
3       13,000 people receive #wildfires evacuation or...
4       Just got sent this photo from Ruby #Alaska as ...
                              ...                        
7608    Two giant cranes holding a bridge collapse int...
7609    @aria_ahrary @TheTawniest The out of control w...
7610    M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...
7611    Police investigating after an e-bike collided ...
7612    The Latest: More Homes Razed by Northern Calif...
Name: text, Length: 3271, dtype: object

In [17]:
# Raw text of the tweets whose target label is 0, selected with a single
# .loc[rows, column] lookup.
fake_disaster = train_data.loc[train_data['target'] == 0, 'text']
fake_disaster

15                                         What's up man?
16                                          I love fruits
17                                       Summer is lovely
18                                      My car is so fast
19                           What a goooooooaaaaaal!!!!!!
                              ...                        
7581    @engineshed Great atmosphere at the British Li...
7582    Cramer: Iger's 3 words that wrecked Disney's s...
7584    These boxes are ready to explode! Exploding Ki...
7587                                   Sirens everywhere!
7593    I just heard a really loud bang and everyone i...
Name: text, Length: 4342, dtype: object