In [1]:
from mnist import MNIST
import pandas as pd
import numpy as np
import tensorflow as tf
import gensim.downloader
from gensim.models import Word2Vec

# Task 1

For the entirety of Task 1, I'll be using the Keras model in tensorflow. This is because Keras has frameworks for RNN, LSTM, and GRU! If I create a 2 layer sequential keras model where it goes **| Input -> RNN -> Dense -> Output |** I can easily replace the simple RNN layer with a GRU or LSTM layer to help with Part 2 of task 1. 

Side note: The dataset I'm using is Delhi's daily climate over the span of a couple years. The dataset only has 4 variables and I'll be attempting to predict the weather of every 7th day, so the input dimension is 4x6.

The output is 4 variables: Temp, Humidity, Wind Speed, Pressure.

I'll measure performance by predicting the 7th day of a 6 day sequence, compare it to the actual 7th day, then use cosine similarity. I use cos similarity because all 4 variables predicted are all regression based predictions.

In [2]:
# Grab data
dfTrain = pd.read_csv("DailyDelhiClimateTrain.csv")
dfTest = pd.read_csv("DailyDelhiClimateTest.csv")
dfTrain.head()

Unnamed: 0,date,meantemp,humidity,wind_speed,meanpressure
0,2013-01-01,10.0,84.5,0.0,1015.666667
1,2013-01-02,7.4,92.0,2.98,1017.8
2,2013-01-03,7.166667,87.0,4.633333,1018.666667
3,2013-01-04,8.666667,71.333333,1.233333,1017.166667
4,2013-01-05,6.0,86.833333,3.7,1016.5


In [3]:
# Drop date
dfTrain = dfTrain.drop(columns=['date'])
dfTest = dfTest.drop(columns=['date'])

# Reformat into np array
Train = dfTrain.to_numpy()
Test = dfTest.to_numpy()

# Size
trainRows, trainCols = Train.shape
testRows, testCols = Test.shape

# Drop extra rows so Train mod 7 = 0
Train = np.delete(Train, slice(trainRows - (trainRows % 7),trainRows), 0)
Test = np.delete(Test, slice(testRows - (testRows % 7),testRows), 0)

# Reshape into 3D nx7x4 shape
Train = np.reshape(Train, (trainRows//7, 7, 4))
Test = np.reshape(Test, (testRows//7, 7, 4))

print(Train[0:4])

[[[1.00000000e+01 8.45000000e+01 0.00000000e+00 1.01566667e+03]
  [7.40000000e+00 9.20000000e+01 2.98000000e+00 1.01780000e+03]
  [7.16666667e+00 8.70000000e+01 4.63333333e+00 1.01866667e+03]
  [8.66666667e+00 7.13333333e+01 1.23333333e+00 1.01716667e+03]
  [6.00000000e+00 8.68333333e+01 3.70000000e+00 1.01650000e+03]
  [7.00000000e+00 8.28000000e+01 1.48000000e+00 1.01800000e+03]
  [7.00000000e+00 7.86000000e+01 6.30000000e+00 1.02000000e+03]]

 [[8.85714286e+00 6.37142857e+01 7.14285714e+00 1.01871429e+03]
  [1.40000000e+01 5.12500000e+01 1.25000000e+01 1.01700000e+03]
  [1.10000000e+01 6.20000000e+01 7.40000000e+00 1.01566667e+03]
  [1.57142857e+01 5.12857143e+01 1.05714286e+01 1.01614286e+03]
  [1.40000000e+01 7.40000000e+01 1.32285714e+01 1.01557143e+03]
  [1.58333333e+01 7.51666667e+01 4.63333333e+00 1.01333333e+03]
  [1.28333333e+01 8.81666667e+01 6.16666667e-01 1.01516667e+03]]

 [[1.47142857e+01 7.18571429e+01 5.28571429e-01 1.01585714e+03]
  [1.38333333e+01 8.66666667e+01 0.0

In [4]:
# Split into nx6x4 shape X and nx1x4 shape Y
a, b, c, yTrain = np.array_split(Train, 4, 1) 
xTrain = np.concatenate((a,b,c), 1) 
yTrain = yTrain.reshape(trainRows//7,4)

a, b, c, yTest = np.array_split(Test, 4, 1) 
xTest = np.concatenate((a,b,c), 1) 
yTest = yTest.reshape(testRows//7,4)

print(xTrain[0:2])
print(xTrain.shape)
print(yTrain[0:2])
print(yTrain.shape)

[[[  10.           84.5           0.         1015.66666667]
  [   7.4          92.            2.98       1017.8       ]
  [   7.16666667   87.            4.63333333 1018.66666667]
  [   8.66666667   71.33333333    1.23333333 1017.16666667]
  [   6.           86.83333333    3.7        1016.5       ]
  [   7.           82.8           1.48       1018.        ]]

 [[   8.85714286   63.71428571    7.14285714 1018.71428571]
  [  14.           51.25         12.5        1017.        ]
  [  11.           62.            7.4        1015.66666667]
  [  15.71428571   51.28571429   10.57142857 1016.14285714]
  [  14.           74.           13.22857143 1015.57142857]
  [  15.83333333   75.16666667    4.63333333 1013.33333333]]]
(208, 6, 4)
[[7.00000000e+00 7.86000000e+01 6.30000000e+00 1.02000000e+03]
 [1.28333333e+01 8.81666667e+01 6.16666667e-01 1.01516667e+03]]
(208, 4)


Next let's configure the model

In [5]:
# Input, RNN, Dense layer, Output
model = tf.keras.Sequential(
    [
        tf.keras.Input((6, 4)),
        tf.keras.layers.SimpleRNN(6, activation='tanh'), # 6 hidden units (one for each time state 1 thru 6)
        tf.keras.layers.Dense(4, activation='softmax')
    ]
)
model.summary()

# Use ADAM optimization. No regularization
model.compile(optimizer=tf.keras.optimizers.Adam(), loss='mean_squared_error', metrics=['cosine_similarity'])

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 simple_rnn (SimpleRNN)      (None, 6)                 66        
                                                                 
 dense (Dense)               (None, 4)                 28        
                                                                 
Total params: 94
Trainable params: 94
Non-trainable params: 0
_________________________________________________________________


In [6]:
model.fit(x=xTrain,y=yTrain,batch_size=1,epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x2a488b12230>

In [7]:
model.evaluate(x=xTest,y=yTest)



[256567.96875, 0.998266339302063]

Well I got an impressive output for cosine similarity. 99.88%

## Part 2

Let's see if I can do better with LSTM or GRU! 

In [8]:
# Input, RNN, Dense layer, Output
model = tf.keras.Sequential(
    [
        tf.keras.Input((6, 4)),
        tf.keras.layers.LSTM(6, activation='tanh'), # 6 cells (one for each time state 1 thru 6)
        tf.keras.layers.Dense(4, activation='softmax')
    ]
)
model.summary()

# Use ADAM optimization. No regularization
model.compile(optimizer=tf.keras.optimizers.Adam(), loss='mean_squared_error', metrics=['cosine_similarity'])

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 6)                 264       
                                                                 
 dense_1 (Dense)             (None, 4)                 28        
                                                                 
Total params: 292
Trainable params: 292
Non-trainable params: 0
_________________________________________________________________


In [9]:
model.fit(x=xTrain,y=yTrain,batch_size=1,epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x2a4910d7b50>

In [10]:
model.evaluate(x=xTest,y=yTest)



[256642.1875, 0.9966260194778442]

For LSTM: 98.53%

Not much of a difference (not that a significant improvement can be made anyway) Now to GRU:

In [11]:
# Input, RNN, Dense layer, Output
model = tf.keras.Sequential(
    [
        tf.keras.Input((6, 4)),
        tf.keras.layers.GRU(6, activation='tanh'), # 6 cells (one for each time state 1 thru 6)
        tf.keras.layers.Dense(4, activation='softmax')
    ]
)
model.summary()

# Use ADAM optimization. No regularization
model.compile(optimizer=tf.keras.optimizers.Adam(), loss='mean_squared_error', metrics=['cosine_similarity'])

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 gru (GRU)                   (None, 6)                 216       
                                                                 
 dense_2 (Dense)             (None, 4)                 28        
                                                                 
Total params: 244
Trainable params: 244
Non-trainable params: 0
_________________________________________________________________


In [12]:
model.fit(x=xTrain,y=yTrain,batch_size=1,epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x2a4934932e0>

In [13]:
model.evaluate(x=xTest,y=yTest)



[256688.796875, 0.9804650545120239]

For GRU: 99.90%!

All in all, each option did incredibly good (98+ percent) and while GRU technically did the best, there isn't much difference. I believe this might just be an issue of not having an incredibly complex dataset to thouroughly display how one option is better than the others.

## Part 3 

You COULD use a traditional feed-forward network on my input. The input would be a 6 x 4 2d array and the output would be a 1x4 array where the error is output vs the 7th day 1x4 array.

The thing is that in a feed-forward, time becomes another dimension you add to the input. Meanwhile in an RNN, time is an aspect of the network as a whole and is more emphasized.

I believe a feed-forward network will have more of an issue understanding that the later days (day 5 and 6) have more of an impact on day 7's weather than days 1 and 2. Meanwhile, an RNN tends to emphasize the importance of the later time segments than the earlier ones.

# Task 2

I'm gonna use gensim's model to create a function to compare 2 words' cosine similarity.

In [16]:
corpus = gensim.downloader.load('text8')
w2v = Word2Vec(corpus)
print(w2v.wv.most_similar('tree'))

[('trees', 0.6832190155982971), ('leaf', 0.6587675213813782), ('bark', 0.6577526330947876), ('avl', 0.6302106380462646), ('bird', 0.6102473139762878), ('fruit', 0.5899716019630432), ('flower', 0.5892435908317566), ('skeleton', 0.5835850238800049), ('beetle', 0.5697873830795288), ('garden', 0.5644292831420898)]


In [30]:
def simDisim(word1, word2):
    sim = w2v.wv.similarity(word1, word2)
    bestW1 = w2v.wv.most_similar(word1)
    bestW2 = w2v.wv.most_similar(word2)
    dissim = 0
    for neighbor in bestW1:
        dissim += w2v.wv.similarity(word2, neighbor[0])
    for neighbor in bestW2:
        dissim += w2v.wv.similarity(word1, neighbor[0])
    dissim /= 20
    dissim = 1 - dissim
    print(word1, " and ", word2, " similarity: ", sim)
    print(word1, " and ", word2, " dissimilarity: ", dissim)

In [31]:
simDisim("pizza","party")
simDisim("tree","bird")
simDisim("fruit","vegetable")
simDisim("car","park")

pizza  and  party  similarity:  -0.004141184
pizza  and  party  dissimilarity:  1.0474904676550068
tree  and  bird  similarity:  0.61024725
tree  and  bird  dissimilarity:  0.4651725277304649
fruit  and  vegetable  similarity:  0.8231189
fruit  and  vegetable  dissimilarity:  0.16694877743721004
car  and  park  similarity:  0.3389139
car  and  park  dissimilarity:  0.725123417750001


For similarity, I can just call the cosine similarity function that gensim has. For dissimilarity, I wanted to do something special. It's the average similarity of word1 vs word2's neighbors and word2 vs word1's neighbors. 

For words used commonly together, they will have high similarity. This does not imply that the words are always used in the same contexts. For example, car park is a regular usage of car and park together. Although, park is usually used in a different context, talking about a garden or playground.

For this, dissimilarity compares if the contexts word1 and word2 usually exist within are similar! If dissimilarity is high, their contexts are usually different. If dissimilarity is low, they tend to remain together between contexts.