## Cleaning Process

In [None]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from nltk.tokenize import word_tokenize
import re

import nltk

from tqdm import tqdm
tqdm.pandas()

In [None]:
# Encoding of the `sentiment` column: 0 = negative, 2 = neutral, 4 = positive

# header=None: the file is read as having no header row, so the six column
# names are assigned explicitly right after loading.
twitter_df = pd.read_csv("Resources/twitter_data.csv", header=None)
twitter_df.columns= ['sentiment','user_id','date','flag','user','text']
# Preview the first ten raw rows.
twitter_df.head(10)

In [None]:
# NLTK resources needed by the cleaning step:
#   stopwords -> stopwords.words('english')
#   wordnet   -> WordNetLemmatizer
#   punkt     -> tokenizer models used by word_tokenize
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
# Recent NLTK releases load the tokenizer models from 'punkt_tab' instead of
# 'punkt'; requesting both keeps the notebook runnable across versions.
nltk.download('punkt_tab')

# Shared lemmatizer instance used by process_text.
lemmatizer = WordNetLemmatizer()

In [None]:
# Compiled once and reused for every tweet. Alternatives, tried in this order:
#   1. @mentions -- \w also covers '_' so handles such as @joy_wolf are removed
#      as a whole instead of leaking their tail into the text
#   2. any character that is not an ASCII letter, digit, space or tab
#   3. URLs of the form scheme://...
_NOISE_RE = re.compile(r'(@\w+)|([^0-9A-Za-z \t])|(\w+://\S+)')

# Built once here rather than on every call; requires the NLTK 'stopwords'
# corpus to be downloaded before this cell runs.
_STOPWORDS = frozenset(stopwords.words('english'))


def process_text(text):
    """Clean one tweet and return its lemmatized, lower-cased tokens.

    Steps: replace mentions, URLs and non-alphanumeric characters with a
    space, tokenize with ``word_tokenize``, lower-case, drop English stop
    words, and lemmatize what remains with the shared ``lemmatizer``.

    Parameters
    ----------
    text : str
        Raw tweet text. Anything that is not a string (for example a NaN
        read from a CSV) is treated as an empty tweet.

    Returns
    -------
    list of str
        The cleaned tokens, possibly empty.
    """
    if not isinstance(text, str):
        return []

    re_clean = _NOISE_RE.sub(" ", text)
    # Lower-case once per token; '@' can no longer occur after the
    # substitution above, so no extra mention filter is needed.
    tokens = (token.lower() for token in word_tokenize(re_clean))
    output = [lemmatizer.lemmatize(token) for token in tokens if token not in _STOPWORDS]

    return output

In [None]:
# Registers tqdm's pandas integration (needed for Series.progress_apply).
# NOTE: this repeats the import and registration already done in the first
# import cell of the notebook.
from tqdm import tqdm
tqdm.pandas()

In [None]:
# Clean every tweet, showing a progress bar.
# NOTE: process_text returns a list of tokens, so the 'text' column holds lists
# from here on and the CSV below stores each list's string representation.
twitter_df['text'] = twitter_df['text'].progress_apply(process_text)

# NOTE(review): the modelling section reads Resources/final_clean_twitter.csv,
# not this file -- confirm how that file is derived from this one.
twitter_df.to_csv("Resources/twitter_cleaned.csv")

# Preview goes last: a notebook only renders the final expression of a cell.
twitter_df.head(10)

## After cleaning the dataset

In [1]:
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer



In [12]:
# Load the cleaned tweets used for modelling.
# NOTE(review): the cleaning section writes Resources/twitter_cleaned.csv, not
# this file -- confirm where final_clean_twitter.csv comes from.
twitter_df = pd.read_csv("Resources/final_clean_twitter.csv")
twitter_df.head(10)

Unnamed: 0,sentiment,user_id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,awww bummer shoulda got david carr third day
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,upset update facebook texting might cry result...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,dived many time ball managed save 50 rest go b...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,whole body feel itchy like fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,behaving mad see
5,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,whole crew
6,0,1467811592,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,mybirch,need hug
7,0,1467811594,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,coZZ,hey long time see yes rain bit bit lol fine th...
8,0,1467811795,Mon Apr 06 22:20:05 PDT 2009,NO_QUERY,2Hood4Hollywood,k nope
9,0,1467812025,Mon Apr 06 22:20:09 PDT 2009,NO_QUERY,mimismo,que muera


In [13]:
def remap(y):
    """Map the label 4 to 1 and return every other value unchanged."""
    return 1 if y == 4 else y

# Rewrite the label column in place: 4 becomes 1, other values are kept.
# Re-running this cell is harmless because remap leaves 0 and 1 untouched.
twitter_df['sentiment'] = twitter_df['sentiment'].apply(remap)
# Bare expression: renders the (truncated) frame as the cell output.
twitter_df

Unnamed: 0,sentiment,user_id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,awww bummer shoulda got david carr third day
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,upset update facebook texting might cry result...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,dived many time ball managed save 50 rest go b...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,whole body feel itchy like fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,behaving mad see
...,...,...,...,...,...,...
1599995,1,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,woke school best feeling ever
1599996,1,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,thewdb com cool hear old walt interview
1599997,1,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,ready mojo makeover ask detail
1599998,1,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,happy 38th birthday boo alll time tupac amaru ...


In [40]:
# Features: the cleaned tweet text. Missing values are replaced with an empty
# string first, because astype(str) would otherwise turn them into the literal
# word "nan", which the tokenizer would then treat as a real token.
# (Presumably tweets whose cleaned text is empty come back from the CSV as NaN.)
X = twitter_df['text'].fillna('').astype(str)
# Target: the sentiment label.
y = twitter_df['sentiment']

# 70/30 split with a fixed seed so the partition is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [41]:
# Build the word -> integer index vocabulary.
# NOTE(review): the tokenizer is fitted on all of X (train and test rows), so
# the vocabulary also contains words that only occur in the test split.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)

# Mapping word -> index, used below to build the embedding matrix.
word_indexer = tokenizer.word_index

In [None]:
import pickle

# Persist the fitted tokenizer so the same word -> index mapping can be reused
# later.
# NOTE(review): saved under Resources/, while the inference cell at the end of
# the notebook reads the tokenizer from a Google Drive path -- confirm the file
# is copied there.
with open('Resources/tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [43]:
# Parse the pretrained embedding file into {word: float32 vector}.
# Each line is split on whitespace: the first field is taken as the word and
# the remaining fields as the vector components.
embedding_dict = {}
# Stays 0 if the file yields no lines; otherwise ends up as the length of the
# last vector read (assumes every line has the same dimensionality -- TODO confirm).
embedding_vector_size = 0
with open("Resources/glove_twitter_embedding.txt", 'r', encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        embedding_dict[word] = vector
        embedding_vector_size= vector.shape[0]


In [44]:
# Sequence length used for the Embedding layer (input_length) and for
# pad_sequences (maxlen). The inference cell at the end of the notebook
# redefines MAX_LEN and must use the same value.
# NOTE(review): 500 looks much longer than a cleaned tweet is likely to be --
# confirm against the actual token-length distribution.
MAX_LEN = 500

In [45]:
# One row per tokenizer index (plus one extra row, since Keras Tokenizer
# indices start at 1 and index 0 is left for padding). Rows start as zeros.
emb_matrix = np.zeros((len(word_indexer) + 1, embedding_vector_size))

for word, index in word_indexer.items():
    embedding_vector = embedding_dict.get(word)
    if embedding_vector is None:
        # No pretrained vector for this word: its row stays all zeros.
        continue
    emb_matrix[index] = embedding_vector

In [46]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Flatten

In [47]:
# Frozen pretrained embedding -> flatten -> two ReLU hidden layers -> one
# sigmoid unit producing a probability.
model = Sequential([
    Embedding(
        input_dim=len(word_indexer) + 1,
        output_dim=embedding_vector_size,
        input_length=MAX_LEN,
        weights=[emb_matrix],
        trainable=False,  # keep the pretrained vectors fixed during training
    ),
    Flatten(),
    Dense(embedding_vector_size, activation='relu'),
    Dense(embedding_vector_size, activation='relu'),
    Dense(1, activation="sigmoid"),
])

In [48]:
# Binary cross-entropy loss with the Adam optimizer; accuracy is reported
# during training and evaluation.
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

In [49]:
# Print each layer's output shape and parameter count.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 500, 50)           13819400  
                                                                 
 flatten (Flatten)           (None, 25000)             0         
                                                                 
 dense (Dense)               (None, 50)                1250050   
                                                                 
 dense_1 (Dense)             (None, 50)                2550      
                                                                 
 dense_2 (Dense)             (None, 1)                 51        
                                                                 
Total params: 15,072,051
Trainable params: 1,252,651
Non-trainable params: 13,819,400
_________________________________________________________________


In [50]:
from keras_preprocessing.sequence import pad_sequences

# Convert each training text to its sequence of word indices ...
X_train_indices = tokenizer.texts_to_sequences(X_train)
# ... then pad at the end ('post') so every row has exactly MAX_LEN entries.
X_train_indices = pad_sequences(X_train_indices, maxlen=MAX_LEN, padding='post')

## (Ran on Google Colab to speed up the process)
### https://colab.research.google.com/drive/1HVufo9CvhyQOo8oV2a3b1xhSeIYrGb3m?usp=sharing

In [51]:
# Train for 10 epochs in mini-batches of 100 samples; 20% of the training
# data is held out for validation via validation_split.
batch_size = 100
model.fit(
    X_train_indices,
    Y_train,
    epochs=10,
    batch_size=batch_size,
    validation_split=0.20
)

Epoch 1/10

KeyboardInterrupt: 

In [None]:
# Save the model to disk.
# NOTE(review): absolute Google Drive path -- presumably only valid in a Colab
# session with Drive mounted; the inference cell defines the same path as MODEL_FP.
model.save('/content/drive/MyDrive/netflix_final_proj_resources/model')

In [None]:
from keras_preprocessing.sequence import pad_sequences

In [None]:
# Reuse the tokenizer fitted earlier in the notebook instead of fitting a new
# one: the test texts must be encoded with exactly the word -> index mapping
# the model was trained with, and re-fitting on the whole corpus only repeats
# expensive work while replacing the tokenizer instance that was pickled.
X_test_indices = tokenizer.texts_to_sequences(X_test)
# Pad at the end ('post') so every row has exactly MAX_LEN entries.
X_test_indices = pad_sequences(X_test_indices, maxlen=MAX_LEN, padding='post')

In [None]:
# NOTE: these two statements repeat the end of the previous cell verbatim and
# recompute the same X_test_indices; this cell can be deleted.
X_test_indices = tokenizer.texts_to_sequences(X_test)
X_test_indices = pad_sequences(X_test_indices, maxlen=MAX_LEN, padding='post')

## Testing Model

In [None]:
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Model outputs for the encoded test tweets; the network ends in a single
# sigmoid unit, so each output is a value between 0 and 1.
predictions = model.predict(X_test_indices)

In [None]:
def round_prediction(a):
  """Return whether `a` is strictly greater than the 0.5 decision threshold.

  Works element-wise when `a` is a NumPy array.
  """
  threshold = 0.5
  return a > threshold

In [None]:
# Threshold all probabilities in one vectorised comparison instead of mapping
# a Python function over every row; shape and integer dtype are unchanged
# (True -> 1, False -> 0).
binary_predictions = round_prediction(predictions).astype(int)

# Preview the first ten hard labels.
binary_predictions[:10]

In [None]:
actual = Y_test

# scikit-learn convention: rows are the true labels, columns the predicted ones.
confusion_matrix = metrics.confusion_matrix(actual, binary_predictions)

ax = plt.subplot()
# fmt='g' prints the raw counts without scientific notation.
conf_mat = sns.heatmap(confusion_matrix, annot=True, fmt='g', cmap="Blues", ax=ax)
# Class ticks and their axis label go on top of the matrix.
ax.xaxis.tick_top()
ax.xaxis.set_label_position('top')
# Label both axes so the figure can be read on its own.
ax.set_xlabel('Predicted label')
ax.set_ylabel('Actual label')

In [None]:
# Precision and recall of the thresholded predictions against the true
# test labels.
precision = metrics.precision_score(actual, binary_predictions)
recall =  metrics.recall_score(actual, binary_predictions)

print(f'precision: {precision}, recall {recall}')

In [None]:
# Fetch the figure that holds the heatmap and write it out as a PNG.
# NOTE(review): absolute Google Drive path -- presumably only valid in Colab.
figure = conf_mat.get_figure()    
figure.savefig('/content/drive/MyDrive/netflix_final_proj_resources/conf_mat.png')

## Loading model for new predictions

In [None]:
import tensorflow as tf
# Reload the model from the same path it was saved to with model.save.
model = tf.keras.models.load_model('/content/drive/MyDrive/netflix_final_proj_resources/model')

In [None]:
import tensorflow as tf
from keras_preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
import pickle
import numpy as np

# Padded sequence length; has to equal the MAX_LEN used when the model was built.
MAX_LEN = 500
# Location of the pickled tokenizer.
# NOTE(review): the training section pickles it to Resources/tokenizer.pickle --
# confirm the file is copied to this Drive path.
TOKENIZER_FP = '/content/drive/MyDrive/netflix_final_proj_resources/tokenizer.pickle'
# Location of the saved model (same path as used by model.save).
MODEL_FP = '/content/drive/MyDrive/netflix_final_proj_resources/model'



def round_prediction(a):
  """Tell whether `a` exceeds 0.5 (element-wise for NumPy arrays)."""
  is_above_half = a > 0.5
  return is_above_half

# Filled on the first get_sentiment call and reused afterwards, so the
# tokenizer and the model are read from disk once instead of on every call.
_SENTIMENT_ARTIFACTS = {}


def _load_sentiment_artifacts():
  """Return the (tokenizer, model) pair, loading both from disk on first use."""
  if not _SENTIMENT_ARTIFACTS:
    # SECURITY: pickle.load can execute arbitrary code -- only point
    # TOKENIZER_FP at a file produced by this project.
    with open(TOKENIZER_FP, 'rb') as f:
      tokenizer = pickle.load(f)
    model = tf.keras.models.load_model(MODEL_FP)
    # Store both only after both loads succeeded, so a failed load is retried.
    _SENTIMENT_ARTIFACTS.update(tokenizer=tokenizer, model=model)
  return _SENTIMENT_ARTIFACTS['tokenizer'], _SENTIMENT_ARTIFACTS['model']


def get_sentiment(tweet: str):
  """Predict the sentiment class of a single tweet.

  Args:
    tweet: Raw tweet text. It is lower-cased, converted to word indices with
      the saved tokenizer and padded at the end to MAX_LEN.

  Returns:
    Integer NumPy array with one entry per model output row: 1 where the
    model output is greater than 0.5, otherwise 0.

  NOTE(review): only lower-casing is applied here, whereas the cleaning
  section of the notebook also strips mentions/URLs, removes stop words and
  lemmatizes -- confirm whether the same cleaning should be applied here.
  """
  tokenizer, model = _load_sentiment_artifacts()

  tweet_vector = tokenizer.texts_to_sequences([tweet.lower()])
  tweet_vector = pad_sequences(tweet_vector, maxlen=MAX_LEN, padding='post')

  prediction = model.predict(tweet_vector)

  # Vectorised threshold; same shape and dtype as mapping round_prediction
  # over the rows.
  return round_prediction(prediction).astype(int)


# Example call: classify one short sentence and display the returned array.
get_sentiment("I am glad")





  


