# Downloading data from kaggle

First you should create a token in kaggle and upload it in Files

In [1]:
! pip install -q kaggle
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle competitions download -c nlp-getting-started

mkdir: cannot create directory ‘/root/.kaggle’: File exists
Downloading nlp-getting-started.zip to /content
  0% 0.00/593k [00:00<?, ?B/s]
100% 593k/593k [00:00<00:00, 82.8MB/s]


In [2]:
! unzip nlp-getting-started.zip

Archive:  nlp-getting-started.zip
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


# Importing required libraries

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
plt.style.use('seaborn')

# Importing data

In [2]:
df = pd.read_csv("train.csv")
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [3]:
df.loc[df.text.isna(),:]

Unnamed: 0,id,keyword,location,text,target


First we create a simple model just by using the text and target column

In [4]:
data = df.iloc[:,[0,3,4]]

In [5]:
data.id = data.index.values + 1
data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


Unnamed: 0,id,text,target
0,1,Our Deeds are the Reason of this #earthquake M...,1
1,2,Forest fire near La Ronge Sask. Canada,1
2,3,All residents asked to 'shelter in place' are ...,1
3,4,"13,000 people receive #wildfires evacuation or...",1
4,5,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...
7608,7609,Two giant cranes holding a bridge collapse int...,1
7609,7610,@aria_ahrary @TheTawniest The out of control w...,1
7610,7611,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,7612,Police investigating after an e-bike collided ...,1


# Model

## Train/Test Split

In [6]:
from sklearn.model_selection import train_test_split
X = np.array(data.iloc[:,1])
y = np.array(data.iloc[:,-1])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=222)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=666)

##First layer

We use a pre-trained text embedding model as the first layer

In [7]:
import tensorflow_hub as hub
model = "https://tfhub.dev/google/nnlm-en-dim128-with-normalization/2"
hub_layer = hub.KerasLayer(model, input_shape=[], dtype=tf.string, trainable=True)
hub_layer(X_train[:3])

<tf.Tensor: shape=(3, 128), dtype=float32, numpy=
array([[ 4.65274483e-01,  6.03381395e-02, -2.25947276e-02,
         1.27515376e-01,  1.41367197e-01, -6.01468608e-02,
         5.61336055e-02, -8.17772746e-02, -1.25141844e-01,
         2.24831462e-01,  9.35670957e-02, -4.20897692e-01,
        -4.09677513e-02, -1.45227745e-01,  7.20147043e-02,
         1.95798203e-02, -4.88073304e-02,  2.30129459e-04,
        -6.65237084e-02,  7.47236833e-02, -1.82472244e-01,
         9.60581750e-02,  5.96922873e-05, -8.05068463e-02,
        -2.24518385e-02, -2.17670530e-01,  6.77260682e-02,
        -1.09929599e-01, -1.43464863e-01,  8.11961442e-02,
         8.42682868e-02,  1.66210532e-01,  7.72339106e-02,
         5.83183914e-02,  9.52747986e-02, -1.34607181e-01,
        -1.38968810e-01, -1.78631172e-01,  9.18664634e-02,
         2.02663898e-01, -9.43761542e-02,  1.13399848e-02,
        -4.26216424e-03, -3.19792554e-02,  5.60061038e-02,
         9.58899558e-02,  7.41870850e-02, -1.89713873e-02,
      

In [8]:
model = tf.keras.Sequential()
model.add(hub_layer)
model.add(tf.keras.layers.Dense(16, activation='relu'))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Dense(1,activation='sigmoid'))

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 keras_layer (KerasLayer)    (None, 128)               124642688 
                                                                 
 dense (Dense)               (None, 16)                2064      
                                                                 
 dropout (Dropout)           (None, 16)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 17        
                                                                 
Total params: 124,644,769
Trainable params: 124,644,769
Non-trainable params: 0
_________________________________________________________________


In [9]:
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['acc'])

In [10]:
history = model.fit(X_train,
                    y_train,
                    epochs=40,
                    batch_size=512,
                    validation_data=(X_val, y_val),
                    verbose=1)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


In [11]:
results = model.evaluate(X_test, y_test)

print(results)

[0.9673160314559937, 0.7524622678756714]


In [12]:
model = tf.keras.Sequential()
model.add(hub_layer)
model.add(tf.keras.layers.Dense(16, activation='relu'))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Dense(1,activation='sigmoid'))

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 keras_layer (KerasLayer)    (None, 128)               124642688 
                                                                 
 dense_2 (Dense)             (None, 16)                2064      
                                                                 
 dropout_1 (Dropout)         (None, 16)                0         
                                                                 
 dense_3 (Dense)             (None, 1)                 17        
                                                                 
Total params: 124,644,769
Trainable params: 124,644,769
Non-trainable params: 0
_________________________________________________________________


In [13]:
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['acc'])

In [14]:
history = model.fit(X,
                    y,
                    epochs=10,
                    batch_size=512,
                    verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Setting the test set

In [15]:
test = pd.read_csv('test.csv')
X_pred = np.array(test.iloc[:,-1])

In [18]:
y_pred = model.predict(X_pred)

In [20]:
y_pred[y_pred>=0.5] = 1
y_pred[y_pred<0.5] = 0

In [25]:
output = pd.DataFrame()
output['id'] = test['id']
output['target'] = y_pred
output['target'] = output['target'].astype(int)
output

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
...,...,...
3258,10861,0
3259,10865,1
3260,10868,1
3261,10874,1


In [26]:
output.to_csv('output.csv',index=False)

In [27]:
! kaggle competitions submit -c nlp-getting-started -f output.csv -m "first try"

100% 22.2k/22.2k [00:01<00:00, 16.1kB/s]
Successfully submitted to Natural Language Processing with Disaster Tweets