In [None]:
!pip3 install -q tensorflow_decision_forests

[K     |████████████████████████████████| 13.4 MB 5.2 MB/s 
[K     |████████████████████████████████| 462 kB 41.3 MB/s 
[?25h

In [None]:
! pip install kaggle
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json



In [None]:
# Download the "nlp-getting-started" competition archive with the Kaggle CLI.
!kaggle competitions download -c nlp-getting-started

Downloading nlp-getting-started.zip to /content
  0% 0.00/593k [00:00<?, ?B/s]
100% 593k/593k [00:00<00:00, 103MB/s]


In [None]:
import zipfile

# The Kaggle CLI saves the archive in the current working directory and the
# CSVs are read back with relative paths, so extract relative to the working
# directory as well instead of hard-coding Colab's /content.
with zipfile.ZipFile("nlp-getting-started.zip", "r") as zip_ref:
    zip_ref.extractall(".")

In [None]:
import pandas as pd
import numpy as np
import re

import tensorflow as tf
import tensorflow_decision_forests as tfdf
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from tensorflow.keras.regularizers import l2, l1_l2

In [None]:
# Load the train and test CSV files as pandas DataFrames.
train_df, test_df = (
    pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv")
)
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [None]:
# (rows, columns) of the raw training frame.
train_df.shape

(7613, 5)

In [None]:
# Shuffle the rows with a fixed seed and keep only the text and target columns.
train_df_shuffled = (
    train_df
    .sample(frac=1, random_state=42)
    .drop(columns=["id", "keyword", "location"])
)
train_df_shuffled.head()

Unnamed: 0,text,target
2644,So you have a new weapon that can cause un-ima...,1
2227,The f$&amp;@ing things I do for #GISHWHES Just...,0
5448,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1
132,Aftershock back to school kick off was great. ...,0
6845,in response to trauma Children of Addicts deve...,0


In [None]:
# Column dtypes and non-null counts of the shuffled, column-reduced frame.
train_df_shuffled.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7613 entries, 2644 to 7270
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    7613 non-null   object
 1   target  7613 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 178.4+ KB


In [None]:
# Number of examples per target value (class balance).
train_df_shuffled.target.value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [None]:
# Report the size of the training set, the test set, and both combined.
print(
    f"Total training samples: {len(train_df)}",
    f"Total test samples: {len(test_df)}",
    f"Total samples: {len(train_df) + len(test_df)}",
    sep="\n",
)

Total training samples: 7613
Total test samples: 3263
Total samples: 10876


In [None]:
import random

# Print five randomly chosen training examples together with their labels.
for i in range(5):
    # randrange excludes the upper bound. randint(0, len(...)) is inclusive and
    # could return len(...) itself, which makes .iloc raise IndexError.
    random_index = random.randrange(len(train_df_shuffled))
    # Select the columns by name: integer keys on a label-indexed Series only
    # work through a deprecated positional fallback.
    target = train_df_shuffled.iloc[random_index]["target"]
    text = train_df_shuffled.iloc[random_index]["text"]

    if target == 1:
        print(f'Target: {target} (real disaster)')
    else:
        print(f'Target: {target} (not real disaster)')
    print(f'Text: {text}\n')
    print('-'*10)

Target: 0 (not real disaster)
Text: My ear started bleeding again...

----------
Target: 0 (not real disaster)
Text: @eileenmfl are you serious?

----------
Target: 0 (not real disaster)
Text: Ngata on injury list at start of practice for Lions http://t.co/Z16DtoQHhG

----------
Target: 0 (not real disaster)
Text: @TurnedonFetaboo @HSjb215 Check out this #rockin preview of @ClaytonBryant Danger Zone Coming soon! https://t.co/E1wrVyZFKV #ArtistsUnited

----------
Target: 0 (not real disaster)
Text: @awadgolf @GOP a capitalist would win biggest landslide in history people who haven't voted in years even OLD SCHOOL DEMS would elect him.

----------


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the shuffled examples as a validation set; the fixed
# random_state makes the split reproducible.
train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    train_df_shuffled["text"].to_numpy(),
    train_df_shuffled["target"].to_numpy(),
    test_size=0.1,
    random_state=42,
)

In [None]:
# Wrap each (sentences, labels) pair as a tf.data dataset of single examples.
train_dataset, valid_dataset = (
    tf.data.Dataset.from_tensor_slices(pair)
    for pair in ((train_sentences, train_labels), (val_sentences, val_labels))
)

train_dataset

<TensorSliceDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))>

In [None]:
# Group examples into batches of 32 and prefetch upcoming batches.
# NOTE: the names are rebound to their own batched versions, so re-running
# this cell on its own batches the already-batched datasets a second time;
# re-run the dataset-creation cell first.
train_dataset = train_dataset.batch(batch_size=32).prefetch(buffer_size=tf.data.AUTOTUNE)
valid_dataset = valid_dataset.batch(batch_size=32).prefetch(buffer_size=tf.data.AUTOTUNE)

In [None]:
# Vocabulary size cap and the fixed number of tokens kept per sentence.
max_vocab_length = 10000
max_length = 15

# Lower-case, strip punctuation, split on whitespace and emit integer token
# ids, with every output sequence fixed to `max_length` entries.
text_vectorizer = tf.keras.layers.TextVectorization(
    max_tokens=max_vocab_length,
    standardize="lower_and_strip_punctuation",
    split="whitespace",
    output_mode="int",
    output_sequence_length=max_length,
)

# Build the vocabulary from the training sentences only.
text_vectorizer.adapt(train_sentences)

In [None]:
# Embedding layer: maps each token id (vocabulary of `max_vocab_length`) to a
# 128-dimensional vector.
embedding = tf.keras.layers.Embedding(
    input_dim=max_vocab_length,
    output_dim=128,
    input_length=max_length,
)
embedding

<keras.layers.embeddings.Embedding at 0x7fae51292d90>

In [None]:
# Neural network: raw string -> token ids -> embeddings -> stacked LSTMs -> dense layers.
inputs = tf.keras.layers.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)
x = embedding(x)
# LSTM layers: the first returns the whole sequence so the second can consume it.
x = tf.keras.layers.LSTM(64, return_sequences=True)(x)
x = tf.keras.layers.LSTM(64)(x)

x = tf.keras.layers.Dense(64, activation=tf.nn.relu6)(x)
# Last hidden layer: everything up to and including this layer is reused as
# the preprocessing (feature extractor) of the random forest.
last_layer = tf.keras.layers.Dense(8, activation=tf.nn.relu6, name="last")(x)
# Output head of the neural network. It must be built on `last_layer`, not on
# `x`: otherwise "last" is not part of `nn_model`, is never updated by
# `nn_model.fit`, and the forest receives an untrained random projection.
classification_output = tf.keras.layers.Dense(1, activation="sigmoid")(last_layer)
nn_model = tf.keras.models.Model(inputs, classification_output)

In [None]:
# Print the layer-by-layer structure and parameter counts of the network.
nn_model.summary()

Model: "model_15"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_10 (InputLayer)       [(None, 1)]               0         
                                                                 
 text_vectorization (TextVec  (None, 15)               0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 lstm_18 (LSTM)              (None, 15, 64)            49408     
                                                                 
 lstm_19 (LSTM)              (None, 64)                33024     
                                                                 
 dense_17 (Dense)            (None, 64)                4160      
                                                          

In [None]:
# Sub-model sharing the network's layers from the input up to `last_layer`;
# the random forest uses it as its preprocessing step.
nn_without_head = tf.keras.models.Model(
    inputs=nn_model.inputs,
    outputs=last_layer,
)
df_and_nn_model = tfdf.keras.RandomForestModel(
    preprocessing=nn_without_head,
    num_trees=1000,
)

Use /tmp/tmpva4_k3t3 as temporary training directory


In [None]:
# Train the neural network first: Adam optimizer, binary cross-entropy loss,
# accuracy tracked on both the training and validation data.
nn_model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=["accuracy"],
)

nn_model.fit(
    x=train_dataset,
    validation_data=valid_dataset,
    epochs=5,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fae1ae29810>

In [None]:
# Train the random forest on the same training dataset; its inputs pass
# through the `nn_without_head` preprocessing model before reaching the trees.
df_and_nn_model.compile(metrics=["accuracy"])
df_and_nn_model.fit(x=train_dataset)

Starting reading the dataset
Dataset read in 0:00:02.307906
Training model
Model trained in 0:00:03.328757
Compiling model


<keras.callbacks.History at 0x7fadbb47fb90>

In [None]:
# Evaluate the forest-on-network model on the validation set. The printed list
# is what evaluate() returns: the loss followed by the compiled accuracy metric.
print("Evaluation:", df_and_nn_model.evaluate(valid_dataset))

Evaluation: [0.0, 0.748031497001648]


In [None]:
# Evaluate the plain neural network on the validation set for comparison. The
# printed list is the loss followed by the compiled accuracy metric.
print("Evaluation :", nn_model.evaluate(valid_dataset))

Evaluation : [1.791905164718628, 0.7427821755409241]


In [None]:
def calculate_results(y_true, y_pred):
    """Summarise classification quality for a set of predictions.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned element-wise with ``y_true``.

    Returns:
        dict with the keys "accuracy" (accuracy score multiplied by 100),
        "precision", "recall" and "f1" (computed with ``average="weighted"``).
    """
    precision, recall, f1, _support = precision_recall_fscore_support(
        y_true, y_pred, average="weighted"
    )
    return {
        "accuracy": accuracy_score(y_true, y_pred) * 100,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

In [None]:
# Run the hybrid model on the validation sentences, round its outputs to hard
# 0/1 labels and squeeze away size-1 dimensions.
preds = tf.squeeze(tf.round(df_and_nn_model.predict(val_sentences)))
preds.shape, preds[:10]

(TensorShape([762]),
 <tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 1., 1., 0., 0., 1., 1., 1., 1., 1.], dtype=float32)>)

In [None]:
# Score the rounded predictions against the validation labels.
result = calculate_results(y_true=val_labels, y_pred=preds)
result

{'accuracy': 74.01574803149606,
 'f1': 0.7389614398401666,
 'precision': 0.7397911693447822,
 'recall': 0.7401574803149606}