<a href="https://colab.research.google.com/github/JohnnyPeng123/NLP-USYD/blob/master/Lab05%20-%20Johnny's%20Answer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Lab 05

# Exercise
In this exercise, you are to preprocess the train and test data, and apply different pre-trained embeddings.

**Note**: We won't mark your exercise based on the test set performance, we will only check whether the preprocessing part and embedding part are correct.

**Important**: This exercise is very important to your assignment1 since you can use most of the codes here in your assignment1.


In [0]:
import torch
#You can enable GPU here (cuda); or just CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Download Dataset

In [0]:
# Code to download file into Colaboratory:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

id = '1gNfBqguzBu8cHKMPc8C44GbvD443dNC5'
downloaded = drive.CreateFile({'id':id}) 
downloaded.GetContentFile('twitter.csv')  

import pandas as pd
df = pd.read_csv("twitter.csv")
df_pick = df.sample(40,random_state=24)

raw_text = df_pick["Text"].tolist()
raw_label = df_pick["Label"].tolist()

from sklearn.model_selection import train_test_split
text_train,text_test,label_train,label_test = train_test_split(raw_text,raw_label,test_size=0.25,random_state=42)

## Preprocessing [Complete this section]

**Case Folding**

In [0]:
text_train = [s.lower() for s in text_train]
text_test = [s.lower() for s in text_test]

**Remove punctuations [Please complete this section]**

In [0]:
import re
def remove_punctuation_re(x):
    x = re.sub(r'[^\w\s]','',x)
    return x
    
text_train = [remove_punctuation_re(s) for s in text_train]
text_test = [remove_punctuation_re(s) for s in text_test]

**Tokenization [Please complete this section]**

In [35]:
import nltk
nltk.download('punkt')

from nltk.tokenize import word_tokenize

#Please complete this
text_train = [word_tokenize(s) for s in text_train]
text_test = [word_tokenize(s) for s in text_test]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


**Remove stopwords [Please complete this section]**

In [36]:
nltk.download('stopwords')
from nltk.corpus import stopwords as sw
stop_words = sw.words()

text_train_ns=[]
for tokens in text_train:
    filtered_sentence = [w for w in tokens if not w in stop_words]
    text_train_ns.append(filtered_sentence)

text_test_ns=[]
for tokens in text_test:
    filtered_sentence = [w for w in tokens if not w in stop_words]
    text_test_ns.append(filtered_sentence)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


**Lemmatisation [Please complete this section]**

In [37]:
nltk.download('wordnet')

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

text_train_le = []
for tokens in text_train_ns:
    lemma_sentence = [lemmatizer.lemmatize(w) for w in tokens ]
    text_train_le.append(lemma_sentence)

text_test_le = []
for tokens in text_test_ns:
    lemma_sentence = [lemmatizer.lemmatize(w) for w in tokens ]
    text_test_le.append(lemma_sentence)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


**Label Encoding**

In [38]:
from sklearn.preprocessing import LabelEncoder

labels = np.unique(label_train)

lEnc = LabelEncoder()
lEnc.fit(labels)
label_train_n = lEnc.transform(label_train)
label_test_n = lEnc.transform(label_test)
numClass = len(labels)

print(labels)
print(lEnc.transform(labels))

['none' 'racism' 'sexism']
[0 1 2]


## Embeddings [Complete this section]

**Padding**

In [0]:
len_list = [len(s) for s in text_train_ns]
seq_length = max(len_list)

def add_padding(corpus, seq_length):
    output = []
    for sentence in corpus:
        if len(sentence)>seq_length:
            output.append(sentence[:seq_length])
        else:
            for j in range(seq_length-len(sentence)):
                sentence.append("<PAD>")
            output.append(sentence)
    return output

text_train_pad = add_padding(text_train_le,seq_length )
text_test_pad = add_padding(text_test_le,seq_length )

**Download Embeddings [Please try other embeddings]**

You can find the details from https://github.com/RaRe-Technologies/gensim-data

In [40]:
import gensim.downloader as api
word_emb_model = api.load("glove-twitter-25") #this is only example



  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


**Get embeddings**

In [41]:
def get_embeddings(corpus,word_emb_model):
    emb_dim = word_emb_model.vector_size
    out = []
    for sentence in corpus:
        out_temp = []
        for word in sentence:
            try:
                out_temp.append(word_emb_model.wv[word])
            except:
                out_temp.append([0]*emb_dim)
    
        out.append(out_temp)
    return np.array(out)

train_emb = get_embeddings(text_train_pad,word_emb_model)
test_emb = get_embeddings(text_test_pad,word_emb_model)

  


## Model

In [0]:
n_input = train_emb.shape[2]
n_hidden = 50
n_class = len(labels)
total_epoch = 100
learning_rate = 0.01

In [43]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.metrics import accuracy_score

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.lstm = nn.LSTM(n_input, n_hidden, num_layers=2, batch_first =True, dropout=0.2)
        self.linear = nn.Linear(n_hidden,n_class)

    def forward(self, x):
        x,_ = self.lstm(x)
        x = self.linear(x[:,-1,:])
        x = F.log_softmax(x, dim=1)
        return x


net = Net().to(device)
criterion = nn.NLLLoss()
optimizer = optim.Adam(net.parameters(), lr=learning_rate)

input_batch_torch = torch.from_numpy(np.array(train_emb)).float().to(device)
target_batch_torch = torch.from_numpy(np.array(label_train_n)).view(-1).to(device)


for epoch in range(total_epoch):  
    
    net.train()
    outputs = net(input_batch_torch) 
    loss = criterion(outputs, target_batch_torch)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    net.eval()
    outputs = net(input_batch_torch) 
    
    if epoch%10 == 9:
        loss = criterion(outputs, target_batch_torch)
        _, predicted = torch.max(outputs, 1)
        acc= accuracy_score(predicted.cpu().numpy(),target_batch_torch.cpu().numpy())

        print('Epoch: %d, loss: %.5f, train_acc: %.2f' %(epoch + 1, loss.item(), acc))

print('Finished Training')


ERROR! Session/line number was not unique in database. History logging moved to new session 59
Epoch: 10, loss: 0.53801, train_acc: 0.73
Epoch: 20, loss: 0.25101, train_acc: 0.87
Epoch: 30, loss: 0.00583, train_acc: 1.00
Epoch: 40, loss: 0.00143, train_acc: 1.00
Epoch: 50, loss: 0.39852, train_acc: 0.93
Epoch: 60, loss: 0.42728, train_acc: 0.87
Epoch: 70, loss: 0.19730, train_acc: 0.93
Epoch: 80, loss: 0.12919, train_acc: 0.97
Epoch: 90, loss: 0.10252, train_acc: 0.97
Epoch: 100, loss: 0.05393, train_acc: 0.97
Finished Training


## Save and Load the model [Complete this section]

**Save the model [Complete this part]**

In [44]:
torch.save(net, 'lab5.pt')

  "type " + obj.__name__ + ". It won't be checked "


**Load the model**

In [45]:
model2 = torch.load('lab5.pt')
model2.eval()

Net(
  (lstm): LSTM(25, 50, num_layers=2, batch_first=True, dropout=0.2)
  (linear): Linear(in_features=50, out_features=3, bias=True)
)

## Testing

In [46]:
input_batch_torch = torch.from_numpy(np.array(test_emb)).float().to(device)

outputs = model2(input_batch_torch) 
_, predicted = torch.max(outputs, 1)

from sklearn.metrics import classification_report
print(classification_report(label_test_n,predicted.cpu().numpy()))

              precision    recall  f1-score   support

           0       0.86      0.86      0.86         7
           1       1.00      1.00      1.00         2
           2       0.00      0.00      0.00         1

    accuracy                           0.80        10
   macro avg       0.62      0.62      0.62        10
weighted avg       0.80      0.80      0.80        10

