In [1]:
import numpy as np
import pandas as pd

In [2]:
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, GRU, Embedding,CuDNNGRU
from tensorflow.keras.optimizers import Adam
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

In [3]:
train_dataset=pd.read_csv('train.csv')
test_dataset=pd.read_csv('test.csv')

In [4]:
#set index
train_dataset.set_index('id',inplace=True)
test_dataset.set_index('id',inplace=True)

In [5]:
#to list
target=train_dataset['label'].values.tolist()
data=train_dataset['tweet'].values.tolist()

#splitting data into train and test
cutoff=int(len(data)*0.8)
x_train, x_test=data[:cutoff],data[cutoff:]
y_train, y_test=target[:cutoff],target[cutoff:]

y_train=np.array(y_train)
y_test=np.array(y_test)

In [6]:
#creating a threshold for number of words 
num_words=10000 

#creating a tokenizer
tokenizer=Tokenizer(num_words=num_words)

#converting words to numbers
tokenizer.fit_on_texts(data) 

#creating vectors
x_train_tokens=tokenizer.texts_to_sequences(x_train)
x_test_tokens=tokenizer.texts_to_sequences(x_test)

In [7]:
print(x_train[800])
print(x_train_tokens[800])

if you want creative workers, give them enough time to play.   #success #quote  
[72, 6, 74, 1340, 1872, 335, 152, 456, 39, 3, 311, 393, 292]


In [8]:
num_tokens=[len(tokens) for tokens in x_train_tokens+x_test_tokens]
num_tokens=np.array(num_tokens)
max_tokens=np.mean(num_tokens)+2*np.std(num_tokens)
max_tokens=int(max_tokens)

In [9]:
np.sum(num_tokens<max_tokens)/len(num_tokens)

0.9485639196545899

In [10]:
#padding operation
x_train_pad=pad_sequences(x_train_tokens,
                              maxlen=max_tokens)
x_test_pad=pad_sequences(x_test_tokens,
                         maxlen=max_tokens)

In [11]:
#creating model
model=Sequential()
embedding_size=50  # we will create a 50 size vector for each word.

model.add(Embedding(input_dim=num_words,
                    output_dim=embedding_size,
                    input_length=max_tokens,
                    name='embedding_layer')
) # this Embedding layer will take a text as an input, convert it to a vector as an output

model.add(GRU(units=16, 
              return_sequences=True) 
)
model.add(GRU(units=8, return_sequences=True))
model.add(GRU(units=4))
model.add(Dense(1,activation='sigmoid'))
optimizer=Adam(lr=1e-3)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [12]:
model.compile(loss='binary_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [13]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_layer (Embedding)  (None, 22, 50)            500000    
_________________________________________________________________
gru (GRU)                    (None, 22, 16)            3216      
_________________________________________________________________
gru_1 (GRU)                  (None, 22, 8)             600       
_________________________________________________________________
gru_2 (GRU)                  (None, 4)                 156       
_________________________________________________________________
dense (Dense)                (None, 1)                 5         
Total params: 503,977
Trainable params: 503,977
Non-trainable params: 0
_________________________________________________________________


In [14]:
model.fit(x_train_pad,
          y_train,
          epochs=5,
          batch_size=256)

Train on 25569 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x214f66c74c8>

In [15]:
result=model.evaluate(x_test_pad,
                      y_test)



In [16]:
#we will go over the results and false predictions and try to understand this errors.
y_pred=model.predict(x=x_test_pad[0:1000])
y_pred=y_pred.T[0]

cls_pred=np.array([1.0 if p>0.5 else 0.0 for p in y_pred])

cls_true=np.array(y_test[0:1000])

incorrect=np.where(cls_pred!=cls_true) 
incorrect=incorrect[0] 

print(len(incorrect)) 

37


In [17]:
idx=incorrect[2] # the first false prediction index
text=x_test[idx]
print(text)
print(y_pred[idx]) # the probability
print(cls_true[idx])

@user  now comes with a #taxwriteoff as four #whitenationalist groups are given #nonprofitstatus @user @user 
0.378024
1


In [18]:
#scoring test dataset
test_data=test_dataset['tweet'].values.tolist()

test_data_tokens=tokenizer.texts_to_sequences(test_data)

test_data_tokens_pad=pad_sequences(test_data_tokens,
                         maxlen=max_tokens)

test_data_predictions=model.predict(test_data_tokens_pad)
test_data_predictions=[1 if prob>0.5 else 0 for prob in test_data_predictions]
test_dataset['label']=test_data_predictions

In [19]:
test_dataset.to_csv('prediction.csv',index=False)