In [49]:
import numpy as np
import pandas as pd
import tensorflow as tf

tf.random.set_seed(42)
pd.set_option("display.max_rows", 20)

In [50]:
import os
import sys

current_path = os.getcwd()
print(f"current path {current_path}")
relative_path=os.path.dirname(current_path)
print(f"root path {relative_path}")
sys.path.append(f"{relative_path}")

from lib.preprocessing import *

current path /home/mbarbaric/dev/python/kaggle
root path /home/mbarbaric/dev/python


In [51]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

OOV_TOKEN="<OOV_TOK>"
VOCAB_SIZE=1000
MAX_LEN=30
TRUNC_TYPE='post'
PADDING_TYPE='post'


def get_tokenizer():
    tokenizer = Tokenizer(oov_token="<OOV_TOK>", num_words=VOCAB_SIZE)
    return tokenizer


def get_padded_sequences(tokenizer : Tokenizer, tokens : list[str])->list[str]:
    tokenizer.fit_on_texts(tokens)
    sequences=tokenizer.texts_to_sequences(tokens)
    padded_sequences=pad_sequences(
        sequences, padding=PADDING_TYPE, truncating=TRUNC_TYPE, maxlen=MAX_LEN)
    return padded_sequences

## Train Data Analysis

In [52]:
train_data = pd.read_csv('train.csv')
#test_data = pd.read_csv('test.csv')
#train_data = pd.concat([train_data, test_data])
train_data

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [53]:
train_data.isna().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [54]:
train_data['keyword'].value_counts()

keyword
fatalities               45
deluge                   42
armageddon               42
sinking                  41
damage                   41
                         ..
forest%20fire            19
epicentre                12
threat                   11
inundation               10
radiation%20emergency     9
Name: count, Length: 221, dtype: int64

### Text Column Analysis

In [55]:
text=train_data['text'].apply(lambda x : preprocess_text(x))
text

0       our deeds are the reason of this earthquake ma...
1                   forest fire near la ronge sask canada
2       all residents asked to shelter in place are be...
3       13000 people receive wildfires evacuation orde...
4       just got sent this photo from ruby alaska as s...
                              ...                        
7608    two giant cranes holding a bridge collapse int...
7609    the out of control wild fires in california ev...
7610                 m194 0104 utc5km s of volcano hawaii
7611    police investigating after an ebike collided w...
7612    the latest more homes razed by northern califo...
Name: text, Length: 7613, dtype: object

In [56]:
text_list=text.to_list()
text_list
words = [[i,w] for i,sentence in enumerate(text_list) for w in sentence.split(' ') if w]
words

[[0, 'our'],
 [0, 'deeds'],
 [0, 'are'],
 [0, 'the'],
 [0, 'reason'],
 [0, 'of'],
 [0, 'this'],
 [0, 'earthquake'],
 [0, 'may'],
 [0, 'allah'],
 [0, 'forgive'],
 [0, 'us'],
 [0, 'all'],
 [1, 'forest'],
 [1, 'fire'],
 [1, 'near'],
 [1, 'la'],
 [1, 'ronge'],
 [1, 'sask'],
 [1, 'canada'],
 [2, 'all'],
 [2, 'residents'],
 [2, 'asked'],
 [2, 'to'],
 [2, 'shelter'],
 [2, 'in'],
 [2, 'place'],
 [2, 'are'],
 [2, 'being'],
 [2, 'notified'],
 [2, 'by'],
 [2, 'officers'],
 [2, 'no'],
 [2, 'other'],
 [2, 'evacuation'],
 [2, 'or'],
 [2, 'shelter'],
 [2, 'in'],
 [2, 'place'],
 [2, 'orders'],
 [2, 'are'],
 [2, 'expected'],
 [3, '13000'],
 [3, 'people'],
 [3, 'receive'],
 [3, 'wildfires'],
 [3, 'evacuation'],
 [3, 'orders'],
 [3, 'in'],
 [3, 'california'],
 [4, 'just'],
 [4, 'got'],
 [4, 'sent'],
 [4, 'this'],
 [4, 'photo'],
 [4, 'from'],
 [4, 'ruby'],
 [4, 'alaska'],
 [4, 'as'],
 [4, 'smoke'],
 [4, 'from'],
 [4, 'wildfires'],
 [4, 'pours'],
 [4, 'into'],
 [4, 'a'],
 [4, 'school'],
 [5, 'rockyfire'],


In [57]:
word_frame=pd.DataFrame(words, columns=['sentence_id', 'token'])
word_frame

Unnamed: 0,sentence_id,token
0,0,our
1,0,deeds
2,0,are
3,0,the
4,0,reason
...,...,...
103460,7612,northern
103461,7612,california
103462,7612,wildfire
103463,7612,abc


In [58]:
word_frame.value_counts()

sentence_id  token      
7472         wreck          13
3742         on              6
             fire            6
1090         the             5
781          the             5
                            ..
2595         in              1
             hot             1
             hitchhiking     1
             garbagebot      1
7612         wildfire        1
Name: count, Length: 98185, dtype: int64

In [59]:
none_words=word_frame.loc[word_frame['token'].str.contains('http')]
none_words

Unnamed: 0,sentence_id,token
1433,121,http


In [60]:
is_real_disaster=train_data.loc[train_data['target'] == 1]['text']
is_real_disaster

0       Our Deeds are the Reason of this #earthquake M...
1                  Forest fire near La Ronge Sask. Canada
2       All residents asked to 'shelter in place' are ...
3       13,000 people receive #wildfires evacuation or...
4       Just got sent this photo from Ruby #Alaska as ...
                              ...                        
7608    Two giant cranes holding a bridge collapse int...
7609    @aria_ahrary @TheTawniest The out of control w...
7610    M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...
7611    Police investigating after an e-bike collided ...
7612    The Latest: More Homes Razed by Northern Calif...
Name: text, Length: 3271, dtype: object

In [61]:
fake_disaster=train_data.loc[train_data['target'] == 0]['text']
fake_disaster

15                                         What's up man?
16                                          I love fruits
17                                       Summer is lovely
18                                      My car is so fast
19                           What a goooooooaaaaaal!!!!!!
                              ...                        
7581    @engineshed Great atmosphere at the British Li...
7582    Cramer: Iger's 3 words that wrecked Disney's s...
7584    These boxes are ready to explode! Exploding Ki...
7587                                   Sirens everywhere!
7593    I just heard a really loud bang and everyone i...
Name: text, Length: 4342, dtype: object

## Padded Sequences and Train Split

In [62]:
train_data['text']=train_data['text'].apply(lambda x : preprocess_text(x))

In [63]:
x = train_data['text']
x

0       our deeds are the reason of this earthquake ma...
1                   forest fire near la ronge sask canada
2       all residents asked to shelter in place are be...
3       13000 people receive wildfires evacuation orde...
4       just got sent this photo from ruby alaska as s...
                              ...                        
7608    two giant cranes holding a bridge collapse int...
7609    the out of control wild fires in california ev...
7610                 m194 0104 utc5km s of volcano hawaii
7611    police investigating after an ebike collided w...
7612    the latest more homes razed by northern califo...
Name: text, Length: 7613, dtype: object

In [64]:
tokenizer = get_tokenizer()
padded_sequences = get_padded_sequences(tokenizer=tokenizer, tokens=x)
padded_sequences

array([[108,   1,  22, ...,   0,   0,   0],
       [177,  44, 216, ...,   0,   0,   0],
       [ 40,   1,   1, ...,   0,   0,   0],
       ...,
       [  1,   1,   1, ...,   0,   0,   0],
       [ 76,   1,  38, ...,   0,   0,   0],
       [  2, 199,  52, ...,   0,   0,   0]], dtype=int32)

In [65]:
y = train_data['target']
y

0       1
1       1
2       1
3       1
4       1
       ..
7608    1
7609    1
7610    1
7611    1
7612    1
Name: target, Length: 7613, dtype: int64

In [68]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(padded_sequences, y, test_size=0.2)

In [69]:
x_test

array([[ 56,   9,   2, ...,   0,   0,   0],
       [  1,   1, 181, ...,   0,   0,   0],
       [ 19,   1,   1, ...,   0,   0,   0],
       ...,
       [  2,   1,  79, ...,   0,   0,   0],
       [  1,   1,   1, ...,   0,   0,   0],
       [  1,   1, 247, ...,   0,   0,   0]], dtype=int32)

## Model Definition and Training

In [71]:
from keras.models import Sequential
from keras.layers import Dense, Input, Conv1D,MaxPool1D, Flatten

model = Sequential([
    tf.keras.layers.Embedding(input_dim=VOCAB_SIZE, output_dim=50, input_length=MAX_LEN),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dropout(0.1),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 30, 50)            50000     
                                                                 
 global_average_pooling1d_1  (None, 50)                0         
  (GlobalAveragePooling1D)                                       
                                                                 
 dropout_1 (Dropout)         (None, 50)                0         
                                                                 
 dense_2 (Dense)             (None, 24)                1224      
                                                                 
 dense_3 (Dense)             (None, 1)                 25        
                                                                 
Total params: 51249 (200.19 KB)
Trainable params: 51249 (200.19 KB)
Non-trainable params: 0 (0.00 Byte)
________________

In [72]:
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])


In [74]:
EPOCHS=7

model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=EPOCHS, batch_size=10)


Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<keras.src.callbacks.History at 0x7f498fad27a0>

## Validation

In [75]:
test_data = pd.read_csv('test.csv')
test_data

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,,,Storm in RI worse than last hurricane. My city...
3260,10868,,,Green Line derailment in Chicago http://t.co/U...
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...


In [77]:
test_data['text'] = test_data['text'].apply(lambda x : preprocess_text(x))
test_data['text']

0                      just happened a terrible car crash
1       heard about earthquake is different cities sta...
2       there is a forest fire at spot pond geese are ...
3                   apocalypse lighting spokane wildfires
4           typhoon soudelor kills 28 in china and taiwan
                              ...                        
3258    earthquake safety los angeles  safety fastener...
3259    storm in ri worse than last hurricane my citya...
3260                     green line derailment in chicago
3261             meg issues hazardous weather outlook hwo
3262    cityofcalgary has activated its municipal emer...
Name: text, Length: 3263, dtype: object

In [78]:
padded_test_data = tokenizer.texts_to_sequences(test_data['text'])
padded_test_data

[[28, 915, 3, 1, 122, 87],
 [432, 53, 240, 9, 1, 1, 612, 1, 226],
 [75, 9, 3, 177, 44, 17, 777, 1, 1, 22, 1, 821, 2, 696, 8, 1, 338, 90, 40],
 [511, 1, 1, 1],
 [495, 779, 437, 1, 4, 899, 7, 1],
 [68, 1, 43, 240],
 [1, 748, 81, 384, 52, 133, 77, 1, 147, 1, 1, 1],
 [737, 62, 22, 12],
 [56, 3, 1, 781],
 [352, 92],
 [42, 8, 54, 25, 1],
 [1, 54, 67, 16],
 [42, 54, 677, 30, 16],
 [56, 46],
 [989],
 [1, 1, 514, 9, 632, 1, 64, 44, 1, 37, 17, 1, 1, 514],
 [39, 12, 1, 1, 10, 1, 632],
 [1, 1, 1, 1, 766, 1, 1, 463, 632],
 [310, 249, 37, 1],
 [1, 32, 1, 13, 1, 1, 1, 1, 1],
 [1, 105, 632, 1, 1, 33, 1],
 [84, 97, 632, 18, 1, 1, 50],
 [34,
  3,
  1,
  541,
  57,
  39,
  166,
  185,
  434,
  7,
  311,
  14,
  15,
  1,
  36,
  43,
  932,
  1,
  776,
  58,
  9,
  53,
  71,
  254,
  2,
  433,
  632],
 [1,
  1,
  1,
  26,
  188,
  1,
  1,
  632,
  3,
  1,
  371,
  598,
  6,
  1,
  289,
  26,
  188,
  254,
  1,
  632],
 [1, 749, 632],
 [2,
  1,
  4,
  910,
  6,
  2,
  1,
  18,
  13,
  294,
  1,
  254,
  632

In [84]:
padded_test_sequences=pad_sequences(
    padded_test_data, padding=PADDING_TYPE, truncating=TRUNC_TYPE, maxlen=MAX_LEN)
padded_test_sequences

array([[ 28, 915,   3, ...,   0,   0,   0],
       [432,  53, 240, ...,   0,   0,   0],
       [ 75,   9,   3, ...,   0,   0,   0],
       ...,
       [926, 688, 343, ...,   0,   0,   0],
       [  1, 554, 453, ...,   0,   0,   0],
       [  1,  45,   1, ...,   0,   0,   0]], dtype=int32)

In [89]:
preds = model.predict(padded_test_sequences)
preds

  1/102 [..............................] - ETA: 3s



array([[0.82777303],
       [0.8129981 ],
       [0.8089577 ],
       ...,
       [0.959005  ],
       [0.8733133 ],
       [0.3589493 ]], dtype=float32)

In [91]:
ans = [[i,1] if p > 0.5 else [i,0] for i,p in enumerate(preds)]
ans

[[0, 1],
 [1, 1],
 [2, 1],
 [3, 0],
 [4, 1],
 [5, 1],
 [6, 0],
 [7, 0],
 [8, 0],
 [9, 0],
 [10, 0],
 [11, 0],
 [12, 0],
 [13, 0],
 [14, 0],
 [15, 0],
 [16, 0],
 [17, 1],
 [18, 0],
 [19, 0],
 [20, 0],
 [21, 0],
 [22, 0],
 [23, 1],
 [24, 0],
 [25, 1],
 [26, 0],
 [27, 1],
 [28, 0],
 [29, 1],
 [30, 0],
 [31, 1],
 [32, 1],
 [33, 0],
 [34, 1],
 [35, 0],
 [36, 1],
 [37, 0],
 [38, 0],
 [39, 1],
 [40, 0],
 [41, 1],
 [42, 1],
 [43, 1],
 [44, 0],
 [45, 0],
 [46, 0],
 [47, 0],
 [48, 0],
 [49, 0],
 [50, 0],
 [51, 0],
 [52, 1],
 [53, 0],
 [54, 0],
 [55, 0],
 [56, 0],
 [57, 0],
 [58, 0],
 [59, 0],
 [60, 1],
 [61, 1],
 [62, 1],
 [63, 1],
 [64, 1],
 [65, 1],
 [66, 1],
 [67, 1],
 [68, 1],
 [69, 1],
 [70, 0],
 [71, 1],
 [72, 1],
 [73, 1],
 [74, 1],
 [75, 1],
 [76, 0],
 [77, 0],
 [78, 0],
 [79, 1],
 [80, 0],
 [81, 0],
 [82, 0],
 [83, 0],
 [84, 1],
 [85, 1],
 [86, 0],
 [87, 0],
 [88, 1],
 [89, 1],
 [90, 0],
 [91, 0],
 [92, 0],
 [93, 0],
 [94, 0],
 [95, 0],
 [96, 0],
 [97, 0],
 [98, 1],
 [99, 0],
 [100, 0],

In [96]:
full_ans = pd.DataFrame(ans, columns=['id','target'])
full_ans

Unnamed: 0,id,target
0,0,1
1,1,1
2,2,1
3,3,0
4,4,1
...,...,...
3258,3258,1
3259,3259,1
3260,3260,1
3261,3261,1


In [97]:
full_ans.columns

Index(['id', 'target'], dtype='object')

In [98]:
full_ans.to_csv('output.csv',index=False)