# 1. Sentiment analysis

Using the [Large Movie Review Dataset](http://ai.stanford.edu/~amaas/data/sentiment/), we want to do a regression model that predict the ratings are on a 1-10 scale. You have an example train and test set in the `dataset` folder.

### 1.1 Regression Model

Use a feedforward neural network and NLP techniques we've seen up to now to train the best model you can on this dataset

### 1.2 RNN model

Train a RNN to do the sentiment analysis regression. The RNN should consist simply of an embedding layer (to make word IDs into word vectors) a recurrent blocks (GRU or LSTM) feeding into an output layer.

In [44]:
from os import listdir
from os.path import isfile, join
import pandas as pd
import numpy as np  
import re 

# Reading the files, making a dataframe with text and reviews for each rows

path_negative='dataset\\aclImdb\\train\\neg'
path_positive='dataset\\aclImdb\\train\\pos'

onlyfiles_n = [path_negative+'\\'+f for f in listdir(path_negative) if isfile(join(path_negative, f))]
onlyfiles_p = [path_positive+'\\'+f for f in listdir(path_positive) if isfile(join(path_positive, f))]
onlyfiles_n=onlyfiles_n[1:]
onlyfiles_p=onlyfiles_p[1:]
onlyfiles_p.extend(onlyfiles_n)
onlyfiles=onlyfiles_p


df=pd.DataFrame()

from tqdm import tqdm

L=[]
rating=[]
for File in tqdm(onlyfiles):

    #only keeping the review
    rating.append(re.sub("[^0-9 ]", "", File[-6:]))

    f = open(F"{File}", encoding="utf8")
    a=f.read()
    f.close()
    #Only keeping letters and numbers and spaces
    a=re.sub("[^a-zA-Z0-9 ]", "", a)
    L.append(a)
df=pd.DataFrame([L,rating]).T
df.columns=['review','rating']

#Shuffle DataFrame rows
df=df.sample(frac=0.05).reset_index(drop=True)
df.head(5)

df.rating=df.rating.astype(int)



100%|██████████| 24998/24998 [00:05<00:00, 4538.46it/s]


In [46]:
from nltk.corpus import stopwords
from  nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer 
import sklearn.feature_extraction.text as text


# Cleaning the data  for the reviews + TF-IDF 

TEXT_CLEANING_RE = "@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

# to remove the stop words
stop_words = stopwords.words("english")


stemmer = SnowballStemmer("english")
lemmatizer = WordNetLemmatizer() 

#preprocess fontion 
def preprocess(text, stem=False,lem=True):
    # Remove link and special characters
    text = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()
    tokens = []
    for token in text.split():
        if token not in stop_words:
            if stem:
                tokens.append(stemmer.stem(token))
            if lem:
                tokens.append(lemmatizer.lemmatize(token))
            else:
                tokens.append(token)
    return " ".join(tokens)

# cleaning the data for every rows
df.review= df.review.apply(lambda x: preprocess(x))


tfidf = text.TfidfVectorizer()  
tfidf.fit(df.review)
t = tfidf.transform(df.review)
t=(t.todense())

In [48]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense, Flatten, Dropout, Conv2D, Concatenate
from keras.models import Model
model = keras.Sequential()

model.add(keras.Input(shape=(t.shape[1])))

model.add(Dense(512,activation='relu'))
model.add(Dense(512,activation='relu'))

model.add(Dense(10,activation='softmax'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(t, df.rating, epochs=5, batch_size=125)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x223ebcdbc10>

## A feedforward neural network does not gice any good results...

## RNN model

In [50]:
df

Unnamed: 0,review,rating
0,moron girlfriend conduct ritual resurrect dead...,1
1,pathetic worse bad madefortv movie cant believ...,2
2,go see film honestly jim carrey past made hila...,4
3,wesley snipe james dial assassin hire agent ci...,4
4,good rent buy original watch someone gun head ...,4
...,...,...
1245,movie sit easily even particularly like movie ...,1
1246,agree veinbreaker wrote regard ahhhh feeling g...,10
1247,series ups occasional latter case agreeable am...,3
1248,saw movie advance screening found excellentbr ...,10


In [52]:
#Get the word tokens and tags into a readable list format
df['Tokenized_Sentence'] = df['review'].apply(lambda sent: sent.split(" "))

df[:10]

Unnamed: 0,review,rating,Tokenized_Sentence
0,moron girlfriend conduct ritual resurrect dead...,1,"[moron, girlfriend, conduct, ritual, resurrect..."
1,pathetic worse bad madefortv movie cant believ...,2,"[pathetic, worse, bad, madefortv, movie, cant,..."
2,go see film honestly jim carrey past made hila...,4,"[go, see, film, honestly, jim, carrey, past, m..."
3,wesley snipe james dial assassin hire agent ci...,4,"[wesley, snipe, james, dial, assassin, hire, a..."
4,good rent buy original watch someone gun head ...,4,"[good, rent, buy, original, watch, someone, gu..."
5,must accompanied special rating warning recomm...,1,"[must, accompanied, special, rating, warning, ..."
6,remarkable disturbing film truelife senseless ...,10,"[remarkable, disturbing, film, truelife, sense..."
7,made film love film somebody wacky sense humor...,8,"[made, film, love, film, somebody, wacky, sens..."
8,inexhaustible hunger basic training movie surp...,8,"[inexhaustible, hunger, basic, training, movie..."
9,christopher guest need worry supreme hold mock...,2,"[christopher, guest, need, worry, supreme, hol..."


In [54]:
def make_lexicon(token_seqs, min_freq=1):
    '''Create a lexicon for the words in the sentences as well as the tags'''
    # First, count how often each word appears in the text.
    token_counts = {}
    for seq in token_seqs:
        for token in seq:
            if token in token_counts:
                token_counts[token] += 1
            else:
                token_counts[token] = 1

    # Then, assign each word to a numerical index. Filter words that occur less than min_freq times.
    lexicon = [token for token, count in token_counts.items() if count >= min_freq]
    # Indices start at 1. 0 is reserved for padding, and 1 is reserved for unknown words.
    lexicon = {token:idx + 2 for idx,token in enumerate(lexicon)}
    lexicon[u'<UNK>'] = 1 # Unknown words are those that occur fewer than min_freq times
    lexicon_size = len(lexicon)

    print("LEXICON SAMPLE ({} total items):".format(len(lexicon)))
    print(dict(list(lexicon.items())[:20]))
    
    return lexicon

print("WORDS:")
words_lexicon = make_lexicon(df['Tokenized_Sentence'])


WORDS:
LEXICON SAMPLE (20902 total items):
{'moron': 2, 'girlfriend': 3, 'conduct': 4, 'ritual': 5, 'resurrect': 6, 'dead': 7, 'attempt': 8, 'prove': 9, 'brought': 10, 'back': 11, 'life': 12, 'surprisingly': 13, 'soul': 14, 'commences': 15, 'chopping': 16, 'axe': 17, 'next': 18, 'day': 19, 'college': 20, 'aged': 21}


In [56]:
'''Make a dictionary where the string representation of a lexicon item can be retrieved from its numerical index'''

def get_lexicon_lookup(lexicon):
    '''Make a dictionary where the string representation 
        of a lexicon item can be retrieved 
        from its numerical index
    '''
    lexicon_lookup = {idx: lexicon_item for lexicon_item, idx in lexicon.items()}
    print("LEXICON LOOKUP SAMPLE:")
    print(dict(list(lexicon_lookup.items())[:20]))
    return lexicon_lookup

def tokens_to_idxs(token_seqs, lexicon):
    idx_seqs = [[lexicon[token] if token in lexicon else lexicon['<UNK>'] for token in token_seq]  
                                                                     for token_seq in token_seqs]
    return idx_seqs

df['Sentence_Idxs'] = tokens_to_idxs(df['Tokenized_Sentence'], words_lexicon)
df[['Tokenized_Sentence', 'Sentence_Idxs']][:10]



Unnamed: 0,Tokenized_Sentence,Sentence_Idxs
0,"[moron, girlfriend, conduct, ritual, resurrect...","[2, 3, 4, 5, 6, 7, 8, 9, 7, 10, 11, 12, 13, 6,..."
1,"[pathetic, worse, bad, madefortv, movie, cant,...","[130, 131, 132, 133, 51, 134, 135, 136, 137, 1..."
2,"[go, see, film, honestly, jim, carrey, past, m...","[177, 35, 151, 178, 179, 180, 181, 168, 182, 5..."
3,"[wesley, snipe, james, dial, assassin, hire, a...","[310, 311, 312, 313, 314, 315, 316, 317, 318, ..."
4,"[good, rent, buy, original, watch, someone, gu...","[126, 401, 402, 403, 404, 405, 70, 406, 407, 4..."
5,"[must, accompanied, special, rating, warning, ...","[412, 413, 320, 414, 415, 416, 417, 418, 47, 4..."
6,"[remarkable, disturbing, film, truelife, sense...","[453, 454, 151, 455, 456, 457, 458, 459, 460, ..."
7,"[made, film, love, film, somebody, wacky, sens...","[168, 151, 381, 151, 638, 639, 451, 640, 47, 6..."
8,"[inexhaustible, hunger, basic, training, movie...","[703, 704, 705, 706, 51, 707, 69, 217, 49, 708..."
9,"[christopher, guest, need, worry, supreme, hol...","[735, 736, 389, 737, 738, 739, 740, 741, 742, ..."


In [58]:
from keras.preprocessing.sequence import pad_sequences

def pad_idx_seqs(idx_seqs, max_seq_len):
    # Keras provides a convenient padding function; 
    padded_idxs = pad_sequences(sequences=idx_seqs, maxlen=max_seq_len)
    return padded_idxs

max_seq_len = max([len(idx_seq) for idx_seq in df['Sentence_Idxs']]) # Get length of longest sequence
train_padded_words = pad_idx_seqs(df['Sentence_Idxs'], 
                                  max_seq_len + 1) #Add one to max length for offsetting sequence by 1

print("WORDS:\n", train_padded_words)
print("SHAPE:", train_padded_words.shape, "\n")



WORDS:
 [[    0     0     0 ...   128    47   129]
 [    0     0     0 ...   175   109   176]
 [    0     0     0 ...   308   174   309]
 ...
 [    0     0     0 ...  2729  1676  5065]
 [    0     0     0 ...  1990  2028   252]
 [    0     0     0 ...  5961 18896   126]]
SHAPE: (1250, 577) 



In [60]:
train_padded_words.shape

(1250, 577)

In [71]:
'''Create the model'''

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Concatenate, TimeDistributed, Dense
from tensorflow.keras.layers import Embedding, GRU

def create_model(seq_input_len, n_word_input_nodes, n_word_embedding_nodes,
                  n_hidden_nodes, stateful=False, batch_size=20):
    
    #Layers 1
    word_input = Input(shape=(None,))

    #Layers 2
    word_embeddings = Embedding(input_dim=n_word_input_nodes,
                                output_dim=n_word_embedding_nodes, 
                                mask_zero=True)(word_input) #mask_zero will ignore 0 padding

    #Layer 4
    hidden_layer = GRU(units=n_hidden_nodes)(word_embeddings)
    #Output shape = (batch_size, seq_input_len, n_hidden_nodes)

    hidden_layer=Dense(512,activation= 'relu')(hidden_layer)
    #Layer 5
    output_layer = Dense(units=10,activation='softmax')(hidden_layer)
    # Output shape = (batch_size, seq_input_len, n_tag_input_nodes)
    
    #Specify which layers are input and output, compile model with loss and optimization functions
    model = Model(inputs=word_input, outputs=output_layer)
    model.compile(loss="mean_squared_error",
                  optimizer='adam',metrics=['accuracy'])
    
    return model
    #sparse_categorical_crossentropy

In [72]:
model = create_model(seq_input_len=train_padded_words.shape[-1] - 1, #substract 1 from matrix length because of offset
                     n_word_input_nodes=len(words_lexicon) + 1, #Add one for 0 padding
                     n_word_embedding_nodes=300,
                     n_hidden_nodes=500)

In [73]:
'''Train the model'''

# output matrix (y) has extra 3rd dimension added because sparse cross-entropy function requires one label per row
model.fit(x=train_padded_words[:,1:],y=df.rating, batch_size=20, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x22298c2a610>

# 2. (evil) XOR Problem

Train an LSTM to solve the XOR problem: that is, given a sequence of bits, determine its parity. The LSTM should consume the sequence, one bit at a time, and then output the correct answer at the sequence’s end. Test the two approaches below:

### 2.1 

Generate a dataset of random <=100,000 binary strings of equal length <= 50. Train the LSTM; what is the maximum length you can train up to with precisison?
    

### 2.2

Generate a dataset of random <=200,000 binary strings, where the length of each string is independently and randomly chosen between 1 and 50. Train the LSTM. Does it succeed? What explains the difference?


In [75]:
#I got the code from https://github.com/mitchellvitez/lstm-xor/blob/master/lstm_xor.py

In [76]:
#trying something for 2.2....

arr2=np.random.randint(0,2,200000)


i2=np.random.randint(0,51)
i1=0
L=[]
y2=[]
while i2<=len(arr2):
    L.append([arr2[i1:i2].astype(str).tolist()])
    y2.append(arr2[i1:i2].sum()%2)
    i1=i2
    i2+=np.random.randint(0,51)


L.append([arr2[i1:i2].astype(str).tolist()])
y2.append(arr2[i1:i2].sum()%2)






In [77]:
# I got the code from 
#https://github.com/mitchellvitez/lstm-xor/blob/master/lstm_xor.py



from tensorflow.keras import optimizers
from tensorflow.keras.layers import Dense, Input, LSTM, Activation
from tensorflow.keras.models import Sequential
import numpy as np
import random

SEQ_LEN = 50
COUNT = 100000

bin_pair = lambda x: [x, not(x)]
training = np.array([[bin_pair(random.choice([0, 1])) for _ in range(SEQ_LEN)] for _ in range(COUNT)]).astype(float)
target = np.array([[bin_pair(x) for x in np.cumsum(example[:,0]) % 2] for example in training]).astype(float)

print('shape check:', training.shape, '=', target.shape)

model = Sequential()
model.add(Input(shape=(SEQ_LEN, 2)))
model.add(LSTM(1, return_sequences=True))
model.add(Dense(2, activation='softmax'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(training, target, epochs=10, batch_size=128)
model.summary()

predictions = model.predict(training)
i = random.randint(0, COUNT)
chance = predictions[i,-1,0]
print('randomly selected sequence:', training[i,:,0])
print('prediction:', int(chance > 0.5))
print('confidence: {:0.2f}%'.format((chance if chance > 0.5 else 1 - chance) * 100))
print('actual:', np.sum(training[i,:,0]) % 2)

shape check: (100000, 50, 2) = (100000, 50, 2)
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 50, 1)             16        
_________________________________________________________________
dense_21 (Dense)             (None, 50, 2)             4         
Total params: 20
Trainable params: 20
Non-trainable params: 0
_________________________________________________________________
randomly selected sequence: [0. 1. 1. 0. 1. 1. 1. 0. 1. 0. 1. 1. 1. 1. 0. 1. 1. 1. 0. 1. 0. 1. 1. 0.
 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 1. 0. 1. 1. 0. 1. 1. 1. 1. 1. 0. 1.
 1. 1.]
prediction: 0
confidence: 93.78%
actual: 0.0
