In [1]:
import numpy as np
import pandas as pd
import spacy

In [2]:
#get data

df = pd.read_csv('train.csv', header=None)
df_test = pd.read_csv('test.csv', header=None)

In [3]:
df.head()

Unnamed: 0,0,1,2
0,3,more like funchuck,Gave this to my dad for a gag gift after direc...
1,5,Inspiring,I hope a lot of people hear this cd. We need m...
2,5,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...
3,4,Chrono Cross OST,The music of Yasunori Misuda is without questi...
4,5,Too good to be true,Probably the greatest soundtrack in history! U...


In [4]:
df.rename({0:"star", 1:"rating1", 2:"rating2"}, axis=1, inplace=True)

In [5]:
df.head()

Unnamed: 0,star,rating1,rating2
0,3,more like funchuck,Gave this to my dad for a gag gift after direc...
1,5,Inspiring,I hope a lot of people hear this cd. We need m...
2,5,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...
3,4,Chrono Cross OST,The music of Yasunori Misuda is without questi...
4,5,Too good to be true,Probably the greatest soundtrack in history! U...


In [6]:
df["review"] = df["rating1"] + " " +  df["rating2"]

In [7]:
df.head()

Unnamed: 0,star,rating1,rating2,review
0,3,more like funchuck,Gave this to my dad for a gag gift after direc...,more like funchuck Gave this to my dad for a g...
1,5,Inspiring,I hope a lot of people hear this cd. We need m...,Inspiring I hope a lot of people hear this cd....
2,5,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...,The best soundtrack ever to anything. I'm read...
3,4,Chrono Cross OST,The music of Yasunori Misuda is without questi...,Chrono Cross OST The music of Yasunori Misuda ...
4,5,Too good to be true,Probably the greatest soundtrack in history! U...,Too good to be true Probably the greatest soun...


In [8]:
df.drop(columns=["rating1", "rating2"], inplace=True)

In [9]:
df.head()

Unnamed: 0,star,review
0,3,more like funchuck Gave this to my dad for a g...
1,5,Inspiring I hope a lot of people hear this cd....
2,5,The best soundtrack ever to anything. I'm read...
3,4,Chrono Cross OST The music of Yasunori Misuda ...
4,5,Too good to be true Probably the greatest soun...


In [10]:
df.star.unique()

array([3, 5, 4, 1, 2], dtype=int64)

In [11]:
df.star = df.star.apply(lambda x: int(x) -1)

In [12]:
nlp = spacy.load("en_core_web_sm")

In [13]:
def preprocessing(sentence):
    """
    params sentence: a str containing the sentence we want to preprocess
    return the tokens list
    """
    doc = nlp(sentence)
    tokens = [token.lemma_ for token in doc if not token.is_punct and not token.is_stop]
    return tokens

In [14]:
import torch
from collections import Counter
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm, tqdm_notebook

In [16]:
df.shape

(3000000, 2)

In [23]:
train_df, test_df = df.iloc[:10000], df.iloc[-2000:]

In [18]:
from torchtext.vocab import FastText

In [19]:
fasttext = FastText("simple")
fasttext.vectors.shape

.vector_cache\wiki.simple.vec: 293MB [13:30, 362kB/s]                                
  0%|          | 0/111051 [00:00<?, ?it/s]Skipping token b'111051' with 1-dimensional vector [b'300']; likely a header
100%|██████████| 111051/111051 [00:36<00:00, 3045.79it/s]


torch.Size([111051, 300])

In [25]:
def token_encoder(token, vec):
    if token == "<pad>":
        return 1
    else:
        try:
            return vec.stoi[token]
        except:
            return 0

In [26]:
def encoder(tokens, vec):
    return [token_encoder(token, vec) for token in tokens]

In [27]:
def padding(list_of_indexes, max_seq_len, padding_index=1):
    output = list_of_indexes + (max_seq_len - len(list_of_indexes))*[padding_index]
    return output[:max_seq_len]

In [28]:
class TrainData(Dataset):
    def __init__(self, df, max_seq_len=32): # df is the input df, max_seq_len is the max lenght allowed to a sentence before cutting or padding
        self.max_seq_len = max_seq_len
        
        counter = Counter()
        train_iter = iter(df.review.values)
        self.vec = FastText("simple")
        self.vec.vectors[1] = -torch.ones(self.vec.vectors[1].shape[0]) # replacing the vector associated with 1 (padded value) to become a vector of -1.
        self.vec.vectors[0] = torch.zeros(self.vec.vectors[0].shape[0]) # replacing the vector associated with 0 (unknown) to become zeros
        self.vectorizer = lambda x: self.vec.vectors[x]
        self.labels = df.star
        sequences = [padding(encoder(preprocessing(sequence), self.vec), max_seq_len) for sequence in df.review.tolist()]
        self.sequences = sequences
    
    def __len__(self):
        return len(self.sequences)
    
    def __getitem__(self, i):
        assert len(self.sequences[i]) == self.max_seq_len
        return self.sequences[i], self.labels[i]

In [29]:
dataset = TrainData(train_df, max_seq_len=32)

In [30]:
def collate(batch, vectorizer=dataset.vectorizer):
    inputs = torch.stack([torch.stack([vectorizer(token) for token in sentence[0]]) for sentence in batch])
    target = torch.LongTensor([item[1] for item in batch]) # Use long tensor to avoid unwanted rounding
    return inputs, target

In [31]:
batch_size = 16
train_loader = DataLoader(dataset, batch_size=batch_size, collate_fn=collate)

In [32]:
next(iter(train_loader))[0].shape

torch.Size([16, 32, 300])

In [33]:
from torch import nn
import torch.nn.functional as F
emb_dim = 300
class Classifier(nn.Module):
    def __init__(self, max_seq_len, emb_dim, hidden1=16, hidden2=16):
        super(Classifier, self).__init__()
        self.fc1 = nn.Linear(max_seq_len*emb_dim, hidden1)
        self.fc2 = nn.Linear(hidden1, hidden2)
        self.fc3 = nn.Linear(hidden2, 5)
        self.out = nn.LogSoftmax(dim=1)
    
    
    def forward(self, inputs):
        x = F.relu(self.fc1(inputs.squeeze(1).float()))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return self.out(x)

In [34]:
MAX_SEQ_LEN = 32
model = Classifier(MAX_SEQ_LEN, 300, 16, 16)
model

Classifier(
  (fc1): Linear(in_features=9600, out_features=16, bias=True)
  (fc2): Linear(in_features=16, out_features=16, bias=True)
  (fc3): Linear(in_features=16, out_features=5, bias=True)
  (out): LogSoftmax(dim=1)
)

In [35]:
from torch import optim
criterion = nn.NLLLoss()

# Only train the classifier parameters, feature parameters are frozen
optimizer = optim.Adam(model.parameters(), lr=0.003)

In [36]:
dataiter = iter(train_loader)
sentences, labels = dataiter.next()

In [37]:
# Forward pass through the network
sentence_idx = 0
sentences.resize_(16, 1, MAX_SEQ_LEN*emb_dim).shape
log_ps = model.forward(sentences[sentence_idx,:])

sentence = sentences[sentence_idx]
torch.exp(log_ps)

tensor([[0.2082, 0.2531, 0.1976, 0.1783, 0.1627]], grad_fn=<ExpBackward0>)

In [38]:
epochs = 3
print_every = 40

for e in range(epochs):
    running_loss = 0
    print(f"Epoch: {e+1}/{epochs}")

    for i, (sentences, labels) in enumerate(iter(train_loader)):

        sentences.resize_(sentences.size()[0], 32* emb_dim)
        
        optimizer.zero_grad()
        
        output = model.forward(sentences)   # 1) Forward pass
        loss = criterion(output, labels) # 2) Compute loss
        loss.backward()                  # 3) Backward pass
        optimizer.step()                 # 4) Update model
        
        running_loss += loss.item()
        
        if i % print_every == 0:
            print(f"\tIteration: {i}\t Loss: {running_loss/print_every:.4f}")
            running_loss = 0

Epoch: 1/3
	Iteration: 0	 Loss: 0.0416
	Iteration: 40	 Loss: 1.6649
	Iteration: 80	 Loss: 1.6284
	Iteration: 120	 Loss: 1.6059
	Iteration: 160	 Loss: 1.6116
	Iteration: 200	 Loss: 1.6029
	Iteration: 240	 Loss: 1.5927
	Iteration: 280	 Loss: 1.6013
	Iteration: 320	 Loss: 1.6068
	Iteration: 360	 Loss: 1.6015
	Iteration: 400	 Loss: 1.6057
	Iteration: 440	 Loss: 1.6039
	Iteration: 480	 Loss: 1.5999
	Iteration: 520	 Loss: 1.6034
	Iteration: 560	 Loss: 1.5810
	Iteration: 600	 Loss: 1.5699
Epoch: 2/3
	Iteration: 0	 Loss: 0.0378
	Iteration: 40	 Loss: 1.6065
	Iteration: 80	 Loss: 1.5888
	Iteration: 120	 Loss: 1.5697
	Iteration: 160	 Loss: 1.5284
	Iteration: 200	 Loss: 1.5370
	Iteration: 240	 Loss: 1.4851
	Iteration: 280	 Loss: 1.4920
	Iteration: 320	 Loss: 1.4742
	Iteration: 360	 Loss: 1.5236
	Iteration: 400	 Loss: 1.5283
	Iteration: 440	 Loss: 1.5551
	Iteration: 480	 Loss: 1.5440
	Iteration: 520	 Loss: 1.5579
	Iteration: 560	 Loss: 1.5257
	Iteration: 600	 Loss: 1.5157
Epoch: 3/3
	Iteration: 0	 