In [1]:
# Optional one-time setup: uncomment the lines below to install the dependencies.
# !pip install numpy==1.16.1
# !pip install requests
# !pip install pandas
# !pip install tensorflow

In [2]:
import json
import os
import urllib.request
from zipfile import ZipFile

import numpy as np
import pandas as pd
import requests
import tensorflow as tf

In [3]:
# Download the dataset (only when the archive is not already on disk), unpack it
# and load it into a DataFrame with shuffled rows.
dataset_url = "https://github.com/JumpThanawut/dataset/blob/master/twitter_sentiment_analysis/tweet_1600000.csv.zip?raw=true"
archive_path = "tweet_1600000.csv.zip"
if not os.path.exists(archive_path):
    # Download to a temporary name first so an interrupted transfer never leaves
    # a truncated archive that a later run would mistake for a complete one.
    partial_path = archive_path + ".part"
    urllib.request.urlretrieve(dataset_url, partial_path)
    os.replace(partial_path, archive_path)
with ZipFile(archive_path, "r") as zipObj:
    zipObj.extractall()
# The CSV has no header row, so columns are addressed by position (0, 1, ...).
df = pd.read_csv("tweet_1600000.csv", header=None, encoding='latin-1')
# Shuffle the rows; the fixed seed makes the later train/test split reproducible.
df = df.sample(frac=1, random_state=42)

In [4]:
# Select sentiment and text columns.
# Column 0 is the raw sentiment value: 0 stays 0, every other value becomes 1.
# Built as a NumPy array (vectorized, no Python-level loop over every row) so it
# can be sliced and handed to Keras as labels directly; some tf.keras versions
# do not accept a plain Python list of ints as `y` -- TODO confirm for the
# installed version.
sentiment_list = np.where(df[0] == 0, 0, 1)
# Column 5 is the tweet text.
tweet_list = df[5]

In [5]:
# Build a lower-casing word tokenizer and fit its vocabulary on the tweets.
# `num_words` caps which words are kept when texts are later converted to
# index sequences (per the Keras docs, only the most frequent ones survive).
num_words = 200
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    lower=True,
    num_words=num_words,
)
tokenizer.fit_on_texts(tweet_list)

In [6]:
# Apply tokenizer on tweets: turn every tweet into a sequence of word indices,
# then pad each sequence to a fixed length of 30 (zeros are added at the front,
# as the sample printed below shows).
tweet_frame = tokenizer.texts_to_sequences(tweet_list)
padded_tweet_frame = tf.keras.preprocessing.sequence.pad_sequences(tweet_frame, maxlen=30)
# Report the number of distinct words seen during fitting under its own name.
# `num_words` must keep the tokenizer's cap: the sequences only contain indices
# below that cap, and the Embedding layer in the training cell is sized with
# `num_words`. Overwriting it with the full vocabulary size would allocate an
# embedding row for every word even though only the capped indices ever occur.
vocab_size = len(tokenizer.word_counts)
print("Word: {:d}".format(vocab_size))
print("Sample of Padded Tokenized Tweet:")
print(padded_tweet_frame[0])

Word: 690960
Sample of Padded Tokenized Tweet:
[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0  15  90  11   3  81 148  94 172  39  58]


In [7]:
# Hold out the last 20% of the (already shuffled) tweets as the test set and
# train on the first 80%.
num_tweets = len(sentiment_list)
print("Tweets: {:d}".format(num_tweets))
split_index = int(num_tweets * 0.8)
x_train, x_test = padded_tweet_frame[:split_index], padded_tweet_frame[split_index:]
y_train, y_test = sentiment_list[:split_index], sentiment_list[split_index:]

Tweets: 1600000


In [8]:
# Build the classifier: word embedding -> LSTM -> single sigmoid unit that
# outputs the probability of the positive class.
model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(num_words, 32),
    tf.keras.layers.LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    tf.keras.layers.Dense(1, activation="sigmoid"),
])

# Binary cross-entropy matches the 0/1 labels; accuracy is tracked for reporting.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

# Train for 10 epochs, evaluating on the held-out split after every epoch.
model.fit(
    x_train,
    y_train,
    epochs=10,
    batch_size=2048,
    validation_data=(x_test, y_test),
)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Train on 1280000 samples, validate on 320000 samples
Instructions for updating:
Use tf.cast instead.
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f40c3cfe5c0>