In [1]:
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Dense, Dropout, SpatialDropout1D
from tensorflow.keras.layers import Embedding
import matplotlib.pyplot as plt

In [2]:
# Define the column names since the file doesn't have a header
column_names = ["target", "id", "date", "flag", "user", "text"]

# Read input data
tweets = pd.read_csv("training.1600000.processed.noemoticon.csv", encoding='latin-1', names=column_names, header=None)

# Drop 'flag' as it won't be of use
tweets = tweets.drop(['flag'], axis=1)

# Remove the timezone abbreviation (PDT) from the 'date' column as all are PDT
tweets['date'] = tweets['date'].str.replace('PDT', '')

# Convert the 'date' column to datetime format
tweets['date'] = pd.to_datetime(tweets['date'], format='%a %b %d %H:%M:%S %Y')

tweets

Unnamed: 0,target,id,date,user,text
0,0,1467810369,2009-04-06 22:19:45,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,2009-04-06 22:19:49,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,2009-04-06 22:19:53,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,2009-04-06 22:19:57,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,2009-04-06 22:19:57,Karoli,"@nationwideclass no, it's not behaving at all...."
...,...,...,...,...,...
1599995,4,2193601966,2009-06-16 08:40:49,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599996,4,2193601969,2009-06-16 08:40:49,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,2193601991,2009-06-16 08:40:49,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,2193602064,2009-06-16 08:40:49,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...


In [3]:
# Tokenize the text into smaller parts

tweet = tweets.text.values

tokenizer = Tokenizer(num_words=5000)

# Association between words and assigned number

tokenizer.fit_on_texts(tweet) 

In [4]:
# Replace the words with their assigned numbers

encoded_docs = tokenizer.texts_to_sequences(tweet)

# Pad sentences to have equal length

padded_sequence = pad_sequences(encoded_docs, maxlen=200)

In [5]:
# Using Long Short Term Memory Networks to work with words in large texts
# This uses dropout to drop some neurons and avoid overfitting

vocab_size = len(tokenizer.word_index) + 1
embedding_vector_length = 32
model = Sequential()
model.add(Embedding(vocab_size, embedding_vector_length, input_length=200))
model.add(SpatialDropout1D(0.25))
model.add(LSTM(50, dropout=0.5, recurrent_dropout=0.5))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam', metrics=['accuracy'])

print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 200, 32)           22110752  
                                                                 
 spatial_dropout1d (Spatial  (None, 200, 32)           0         
 Dropout1D)                                                      
                                                                 
 lstm (LSTM)                 (None, 50)                16600     
                                                                 
 dropout (Dropout)           (None, 50)                0         
                                                                 
 dense (Dense)               (None, 1)                 51        
                                                                 
Total params: 22127403 (84.41 MB)
Trainable params: 22127403 (84.41 MB)
Non-trainable params: 0 (0.00 Byte)
______________