# Twitter Words Project

By: Gareth Williams

In [1]:
# Storage and analysis
import numpy as np
import pandas as pd

# Tokenizers
import nltk 
nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Misc. 
import os
from __future__ import absolute_import, division, print_function, unicode_literals

# SpellChecker
!pip install autocorrect
from autocorrect import Speller
!pip install pyspellchecker
from spellchecker import SpellChecker

# TensorFlow 
try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
!pip install tensorflow-hub
!pip install tfds-nightly
import tensorflow_hub as hub
import tensorflow_datasets as tfds
from keras.preprocessing.sequence import pad_sequences

# Building the Model
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers


# Report the runtime environment so results can be tied to library versions.
print("Version: ", tf.__version__)
print("Eager mode: ", tf.executing_eagerly())
print("Hub version: ", hub.__version__)
# An empty device list is falsy, which selects the "NOT AVAILABLE" message.
gpu_devices = tf.config.experimental.list_physical_devices("GPU")
gpu_status = "available" if gpu_devices else "NOT AVAILABLE"
print("GPU is", gpu_status)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Version:  2.2.0-rc1
Eager mode:  True
Hub version:  0.7.0
GPU is NOT AVAILABLE


Using TensorFlow backend.


# Functions

In [0]:
def spellcheck(input_words):
  """Spell-check every word of ``input_words`` and return the corrected text.

  The text is split into words with ``SpellChecker.split_words`` and each word
  is passed through ``SpellChecker.correction``; the results are re-joined
  with single spaces. Per the original author's note, this also lower-cases
  the text and collapses excess spaces as a side effect of the splitting.

  Args:
    input_words: The sentence (string) to correct.

  Returns:
    A single string of corrected words separated by one space each.
  """
  speller = SpellChecker()  # from the pyspellchecker library
  words = speller.split_words(input_words)
  # Newer pyspellchecker releases return None from `correction` when no
  # candidate exists; fall back to the original word so that the join below
  # cannot raise TypeError on a None entry.
  corrected = [speller.correction(word) or word for word in words]
  return " ".join(corrected)

def remove_stopwords(sent):
  """Strip English stop words from ``sent`` and return what is left as a string.

  Stop words are very common words such as "the", "a", "an", "in" that are
  usually dropped before modelling. ``sent`` is first passed through
  ``convert``, so it may be either a string (split on whitespace) or an
  already-split sequence of words. Matching is exact; no case folding is
  applied here.

  Returns:
    The remaining words joined by single spaces.
  """
  english_stops = frozenset(stopwords.words('english'))  # NLTK's English list
  kept = []
  for token in convert(sent):
    if token in english_stops:
      continue  # drop stop words
    kept.append(token)
  return " ".join(kept)

def convert(lst):
  """Split ``lst`` on whitespace when it is string-like; otherwise pass it through.

  Anything without a usable ``split`` method (for example a list that is
  already tokenised) is returned unchanged.
  """
  try:
    pieces = lst.split()
  except AttributeError:
    # Not splittable: hand the input back untouched.
    return lst
  return pieces

def remove_hashtag_at(word_tokens):
    """Remove every '@' / '#' token together with the token that follows it.

    The list is modified in place and the same list object is returned, so
    callers may use either the return value or the original reference.

    Unlike the earlier implementation, this does not rely on a bare
    ``except`` for flow control, and a marker that is the very last token
    (with nothing after it) is removed as well instead of being left behind.

    Args:
        word_tokens: Mutable list of string tokens (e.g. from a tokenizer).

    Returns:
        ``word_tokens`` itself, with the markers and their attached names removed.
    """
    markers = ('@', '#')
    idx = 0
    while idx < len(word_tokens):
        if word_tokens[idx] in markers:
            # Slice deletion removes the marker and its following token and is
            # safe at the end of the list (it simply removes what exists).
            # `idx` is not advanced: the next token has shifted into this slot.
            del word_tokens[idx:idx + 2]
        else:
            idx += 1
    return word_tokens

def Cleaning(X):
  """Clean a collection of raw tweets and return them as a list of strings.

  For each tweet, in order:
    1. tokenise it with ``word_tokenize``;
    2. drop '@' / '#' markers and the token attached to them
       (``remove_hashtag_at``);
    3. cut the tweet at the first 'http' or 'https' token, discarding the URL
       and everything after it;
    4. delete every substring listed in ``pop_list``;
    5. remove English stop words.

  Args:
    X: Iterable of tweet strings.

  Returns:
    A list with one cleaned string per input tweet, in the same order.
  """
  # Substrings stripped from every tweet (symbols, mis-encoded bytes, digits).
  # NOTE(review): replacements run in list order, so the entries starting with
  # '\x89' that come AFTER the bare '\x89' entry can never match in full (the
  # '\x89' byte is already gone by then). Order kept as-is to preserve results.
  pop_list = ['#', '@', '*', '\x89Û_', '\x89Ûªt', '\x89', '\x89ÛÏ', '\x89Ûªm', '\x89ÛÓ', '\x89Ûª', '\x89Û÷',
            '\x89ÛªS', 'RAZEDåÊ', '\x89ÛÒ', '...', '?', '|', '_', '-', 'Ûªs', '', '[', ']', '`', '(', ')',
            'Û', 'ª','^', '>','0','1','2','3','4','5','6','7','8','9'] # list of things to remove
  # Built once for the whole batch instead of once per tweet.
  stop_words = set(stopwords.words('english'))
  url_markers = ('http', 'https')
  clean_tweets = []

  for tweet in X:
    ws = remove_hashtag_at(word_tokenize(tweet))  # tokenise, then drop #/@ and attached names
    # Cut at the earliest URL token of either scheme (previously an 'http'
    # token was preferred even when an 'https' token appeared before it).
    cut = next((pos for pos, tok in enumerate(ws) if tok in url_markers), len(ws))
    text = ' '.join(ws[:cut])
    for pop in pop_list:
      text = text.replace(pop, '')
    # text = spellcheck(text) # Corrects tweet spelling (Warning: very long run times; results were better without it)
    # Same filtering as remove_stopwords(text), using the set built above.
    text = ' '.join(w for w in text.split() if w not in stop_words)
    clean_tweets.append(text)

  return clean_tweets

# Data Cleaning

In [0]:
# Load in tweet data file
data_temp = pd.read_csv('/content/drive/My Drive/DataSets/Twitter Data/Ugly_Words_FULL.csv')
data = data_temp.to_numpy() # Convert to NumPy for positional row/column slicing
data = data[:1500, 0:7] # Keep the first 1500 (labeled) rows and the first 7 columns
data_temp = [] # Drop the reference to the DataFrame (a separate temp name is used in case overwriting in place corrupted anything)

# Shuffle the rows. The seed is fixed so that the train/validation/test split
# (and therefore the reported metrics) is the same on every run; change
# SHUFFLE_SEED to draw a different split.
SHUFFLE_SEED = 42
np.random.seed(SHUFFLE_SEED)
np.random.shuffle(data) # In-place shuffle along the first axis (rows)

# Cleaning
X_Full = data[:1500,6].astype(str) # Column 6 holds the tweets; force everything to strings
X_temp = Cleaning(X_Full) # Sends to the Cleaning function
X_Full = X_temp # Just in case a copying error happens


In [0]:
# Train, validation, and test sets: rows 0-999, 1000-1249 and 1250-1499 of the shuffled data.
X_train, X_valid, X_test = X_Full[:1000], X_Full[1000:1250], X_Full[1250:1500]
# Labels come from column 5; cast to int in case they were read as floats.
y_train, y_valid, y_test = (
    data[rows, 5].astype(int)
    for rows in (slice(0, 1000), slice(1000, 1250), slice(1250, 1500))
)


# Tokenizing, Padding, and Embedding

In [0]:
token_len = 50  # Every tokenized row is forced to this length (tensors must be rectangular)
tokenizer = tf.keras.preprocessing.text.Tokenizer()  # Vocabulary is built by the fit calls below

# Build the vocabulary from the training and validation text only.
# The test set is deliberately left out (the author reported better results
# without it: # tokenizer.fit_on_texts(X_test)).
tokenizer.fit_on_texts(X_train)
tokenizer.fit_on_texts(X_valid)


def _texts_to_padded(texts):
  """Turn texts into integer sequences, then pad/truncate them to ``token_len``."""
  sequences = tokenizer.texts_to_sequences(texts)  # words -> integer ids
  return tf.keras.preprocessing.sequence.pad_sequences(
      sequences, maxlen=token_len, padding='pre', truncating='post')


X_train = _texts_to_padded(X_train)
X_valid = _texts_to_padded(X_valid)
X_test = _texts_to_padded(X_test)

# Converting Numpy into Tensor friendly arrays

In [32]:
def _labelled_batches(features, labels):
  """Pair features with labels (features first), then shuffle and batch by 128."""
  return tf.data.Dataset.from_tensor_slices((features, labels)).shuffle(128).batch(128)


# Convert each split to int64 tensors, then wrap it in a batched dataset.
X_train, y_train = tf.constant(X_train, dtype=tf.int64), tf.constant(y_train, dtype=tf.int64)
train_data = _labelled_batches(X_train, y_train)

X_valid, y_valid = tf.constant(X_valid, dtype=tf.int64), tf.constant(y_valid, dtype=tf.int64)
valid_data = _labelled_batches(X_valid, y_valid)

X_test, y_test = tf.constant(X_test, dtype=tf.int64), tf.constant(y_test, dtype=tf.int64)
test_data = _labelled_batches(X_test, y_test)


<BatchDataset shapes: ((None, 50), (None,)), types: (tf.int64, tf.int64)>
<BatchDataset shapes: ((None, 50), (None,)), types: (tf.int64, tf.int64)>
<BatchDataset shapes: ((None, 50), (None,)), types: (tf.int64, tf.int64)>


# Model and Layers

In [33]:
# Embedding -> LSTM -> single sigmoid unit for binary classification.
model = tf.keras.Sequential([
    # Vocabulary size is the tokenizer's word count plus a small margin; each
    # word id is mapped to a 100-dimensional vector. A smaller embedding size
    # squeezes the data and gave poorer results; a larger one can help at a
    # performance cost.
    tf.keras.layers.Embedding(len(tokenizer.word_index)+5, 100),
    # Only the final LSTM output is returned (return_sequences=False).
    # recurrent_dropout is the fraction of units dropped for the linear
    # transformation of the recurrent state.
    tf.keras.layers.LSTM(100, activation ='relu', return_sequences= False, recurrent_dropout = 0.1),
    # One output node is enough: the sigmoid squashes it into the range 0..1.
    tf.keras.layers.Dense(1, activation='sigmoid'),
])
# summary() prints the table itself and returns None, so it must not be wrapped
# in print() (doing so emitted a stray "None" after the table).
# Note: the larger the parameter count, the longer training takes.
model.summary()

4924
Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, None, 100)         492900    
_________________________________________________________________
lstm_5 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 101       
Total params: 573,401
Trainable params: 573,401
Non-trainable params: 0
_________________________________________________________________
None


In [34]:
# Compile the model. 'adam' is a solid general-purpose optimizer, and binary
# cross-entropy suits this project because the labels are binary.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train the model. The validation data is evaluated alongside training so that
# overfitting can be spotted; be careful with the number of epochs (find a
# sweet spot).
history = model.fit(
    train_data,
    validation_data=valid_data,
    epochs=25,
    verbose=1,
)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [36]:
# Evaluate the trained model on the held-out test tensors (shows a progress bar).
results = model.evaluate(
    X_test,
    y_test,
    batch_size=128,
)

# Repeat the metrics as plain "name: value" lines, without the progress bar.
for metric_name, metric_value in zip(model.metrics_names, results):
  print("%s: %.3f" % (metric_name, metric_value))

loss: 0.503
accuracy: 0.860


In [0]:
# Save weights
# model.save_weights('Good_weights.h5', overwrite=True) # Uncomment if the accuracy is above 86% (loss: 0.503)
                                                        # The training data needs to be balanced (and maybe oversampled) in order to break 90% accuracy