# ***Applying Transfer learning and using the embedded vectors from Doc2Vec to train an MLP :***

In [74]:
import numpy as np
import pandas as pd
import re
from tqdm import tqdm
tqdm.pandas()
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import torch


Now that we got our embedded vectors from all the reviews using doc2vec and saved them in a csv file, we first start by importing the embeddings and then we will split these vectors into train set , validation set and test set and then train different MLP models and pick up the one that performs the best on the validation set 

# ***Getting the arrays of the reviews and the target sentiments :***

In [75]:
data_new=pd.read_csv('./data/doc2vec_dataset.csv',sep='\t')

In [76]:
def str_to_float(row):
    rev=row['embedded review']
    rev=re.sub('[\n]', '', rev).strip('[]').split()
    return(np.array(rev,dtype=str).astype(np.float))

In [77]:
data_new['array']=data_new.progress_apply(str_to_float,axis=1)

100%|██████████| 50000/50000 [00:06<00:00, 7832.05it/s]


In [78]:
X=np.array(data_new['array'].to_list(),dtype='float64')
Y=data_new['sentiment'].to_numpy()

# ***Splitting the data and Standardizing using X_train :***

In [79]:
X_train,X_val,Y_train,Y_val=train_test_split(X,Y,test_size=0.33,random_state=40)
X_test,X_val,Y_test,Y_val=train_test_split(X_val,Y_val,test_size=0.5,random_state=40)


In [80]:
std=StandardScaler().fit(X=X_train)
std

StandardScaler()

In [81]:
X_train_std,X_val_std,X_test_std=std.transform(X_train),std.transform(X_val),std.transform(X_test)
print(f'mean of training data after standardizing : {X_train_std.mean():.5f}')
print('-'*50)
print(f'mean of testing data after standardizing : {X_test_std.mean():.5f}')

mean of training data after standardizing : -0.00000
--------------------------------------------------
mean of testing data after standardizing : -0.00007


# ***Training MLP on classifying the embeddings of the reviews :***

In [82]:
class Feedforward(torch.nn.Module):
        def __init__(self, input_size, hidden_size):
            super(Feedforward, self).__init__()
            self.input_size = input_size
            self.hidden_size  = hidden_size
            self.fc1 = torch.nn.Linear(self.input_size, self.hidden_size)
            self.relu = torch.nn.ReLU()
            self.drop=torch.nn.Dropout(p=0.2)
            self.fc2 = torch.nn.Linear(self.hidden_size, 2)
            self.sigmoid = torch.nn.Sigmoid()        
        def forward(self, x):
            hidden = self.fc1(x)
            relu = self.relu(hidden)
            drop=self.drop(relu)
            output = self.fc2(drop)
            output = self.sigmoid(output)
            return output

In [83]:
X_train_std,Y_train=torch.FloatTensor(X_train_std).cuda(),torch.FloatTensor(Y_train).long().cuda()
X_val_std,Y_val=torch.FloatTensor(X_val_std).cuda(),torch.FloatTensor(Y_val).long().cuda()
X_test_std,Y_test=torch.FloatTensor(X_test_std).cuda(),torch.FloatTensor(Y_test).long().cuda()

In [84]:
model = Feedforward(300, 10)
criterion = torch.nn.CrossEntropyLoss().cuda()
optimizer = torch.optim.SGD(model.parameters(), lr = 0.01)

In [85]:
model.cuda()

Feedforward(
  (fc1): Linear(in_features=300, out_features=10, bias=True)
  (relu): ReLU()
  (drop): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=10, out_features=2, bias=True)
  (sigmoid): Sigmoid()
)

In [86]:
model.eval()
Y_pred = model(X_test_std)
before_train = criterion(Y_pred, Y_test.long())
print(f'Test loss before training   :   {before_train.item():.3f}')

Test loss before training   :   0.695


In [87]:
model.train()
epoch = 10000
for epoch in range(epoch):    
    optimizer.zero_grad()    # Forward pass
    y_pred = model(X_train_std)    # Compute Loss
    loss = criterion(y_pred, Y_train)
   
    if epoch%1000==0:
        print('-'*50)
        print('Epoch {}: train loss: {:.3f}\n'.format(epoch, loss.item()))    # Backward pass
    loss.backward()
    optimizer.step()

--------------------------------------------------
Epoch 0: train loss: 0.696

--------------------------------------------------
Epoch 1000: train loss: 0.671

--------------------------------------------------
Epoch 2000: train loss: 0.631

--------------------------------------------------
Epoch 3000: train loss: 0.592

--------------------------------------------------
Epoch 4000: train loss: 0.564

--------------------------------------------------
Epoch 5000: train loss: 0.547

--------------------------------------------------
Epoch 6000: train loss: 0.537

--------------------------------------------------
Epoch 7000: train loss: 0.531

--------------------------------------------------
Epoch 8000: train loss: 0.525

--------------------------------------------------
Epoch 9000: train loss: 0.523



In [88]:
model.eval()
Y_pred = model(X_test_std)
before_train = criterion(Y_pred, Y_test)
print(f'Test loss after training   :   {before_train.item():.3f}')

Test loss after training   :   0.520


In [91]:
print(Y_pred[3])
print(Y_test[3])

tensor([0.0524, 0.9568], device='cuda:0', grad_fn=<SelectBackward>)
tensor(1, device='cuda:0')
