<a href="https://colab.research.google.com/github/Joongeun/Internship/blob/main/LSTM_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
import torch.nn as nn
import torch.nn.functional as F
!pip install nltk
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from collections import Counter
import string
import re
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from keras.callbacks import EarlyStopping
from matplotlib import pyplot
import time



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
is_cuda = torch.cuda.is_available()
if is_cuda:
    device = torch.device("cuda")
    print("GPU is available")
else:
    device = torch.device("cpu")
    print("GPU not available, CPU used")

GPU is available


# Load Dataset

In [3]:
df = pd.read_csv("/content/Labeled Posts - preprocessed_csv.csv")
# check class distribution
df['labels'].value_counts(normalize = True)

1.0    0.522571
4.0    0.345714
2.0    0.052286
3.0    0.043429
0.0    0.028286
5.0    0.007714
Name: labels, dtype: float64

In [30]:
X,y = df['selftext'][0:3500].values,df['labels'][0:3500].values
x_train,x_test,y_train,y_test = train_test_split(X,y,stratify=y,test_size=0.1)

# Preprocess/Tokenization
(Convert text into numbers)



In [33]:
def preprocess_string(s):
    # replace digits with no space
    s = re.sub(r"\d", '', s)
    split = re.findall(r"[\w']+|[!?]", s)
    i = 0
    while i < len(split)-1:
      if split[i] == split[i+1]:
        del split[i+1]
        i-=1
      i+= 1
    # Remove all instances of \n where only the n remains
    split = [i for i in split if i != "n"]
    return " ".join(split)

def tokenize(x_train,y_train,x_val,y_val):
    word_list = []
    stop_words = set(stopwords.words('english'))
    for sent in x_train:
        for word in sent.lower().split():
            word = preprocess_string(word)
            if word not in stop_words and word != '':
                word_list.append(word)
    corpus = Counter(word_list)
    # sorting on the basis of most common words
    corpus_ = sorted(corpus,key=corpus.get,reverse=True)[:1000]                #Only using top 1000 words to train model
    # creating a dict
    onehot_dict = {w:i+1 for i,w in enumerate(corpus_)}
    # tokenize
    final_list_train,final_list_test = [],[]
    for sent in x_train:
            final_list_train.append([onehot_dict[preprocess_string(word)]  for word in sent.lower().split()
                                     if preprocess_string(word) in onehot_dict.keys()])
    for sent in x_val:
            final_list_test.append([onehot_dict[preprocess_string(word)] for word in sent.lower().split()
                                    if preprocess_string(word) in onehot_dict.keys()])
    encoded_train = [1 if label == 2.0 else 0 for label in y_train]            #  Changed from 'positive' to 2.0 for my dataset
    encoded_test = [1 if label == 2.0 else 0 for label in y_val]
    return np.array(final_list_train), np.array(encoded_train),np.array(final_list_test), np.array(encoded_test),onehot_dict

In [34]:
x_train,y_train,x_test,y_test,vocab = tokenize(x_train,y_train,x_test,y_test)

  return np.array(final_list_train), np.array(encoded_train),np.array(final_list_test), np.array(encoded_test),onehot_dict


# Padding (len 500)


In [35]:
def padding_(sentences, seq_len):
    features = np.zeros((len(sentences), seq_len),dtype=int)
    for ii, review in enumerate(sentences):
        if len(review) != 0:
            features[ii, -len(review):] = np.array(review)[:seq_len]
    return features
x_train_pad = padding_(x_train,500)
x_test_pad = padding_(x_test,500)

# Shuffle Train/Val_data

In [36]:
train_data = TensorDataset(torch.from_numpy(x_train_pad), torch.from_numpy(y_train)) # len 750 - each element is a list of 500 ints (after padding)
valid_data = TensorDataset(torch.from_numpy(x_test_pad), torch.from_numpy(y_test))   # len 250
batch_size = 50    #  number of posts in x_train/x_test must be divisible by batch_size - otherwise, there'll be an error when training model
# make sure to SHUFFLE your data
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)           # shuffled train_data/valid_data
valid_loader = DataLoader(valid_data, shuffle=True, batch_size=batch_size)

# obtain one batch of training data
dataiter = iter(train_loader)
sample_x, sample_y = next(dataiter)

# Create Neural Network Model

In [38]:
class SentimentRNN(nn.Module):
    def __init__(self, no_layers, vocab_size, hidden_dim, embedding_dim, drop_prob=0.5):
        super(SentimentRNN,self).__init__()
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.no_layers = no_layers
        self.vocab_size = vocab_size
        # embedding and LSTM layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        #lstm
        self.lstm = nn.LSTM(input_size=embedding_dim,hidden_size=self.hidden_dim, num_layers=no_layers, batch_first=True)

        # dropout layer
        self.dropout = nn.Dropout(0.3)
        # linear and sigmoid layer
        self.fc = nn.Linear(self.hidden_dim, output_dim)
        self.sig = nn.Sigmoid()
    def forward(self,x,hidden):
        batch_size = x.size(0)
        # embeddings and lstm_out
        embeds = self.embedding(x)  # shape: B x S x Feature   since batch = True
        #print(embeds.shape)  #[50, 500, 1000]
        lstm_out, hidden = self.lstm(embeds, hidden)
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        # dropout and fully connected layer
        out = self.dropout(lstm_out)
        out = self.fc(out)
        # sigmoid function
        sig_out = self.sig(out)
        # reshape to be batch_size first
        sig_out = sig_out.view(batch_size, -1)
        sig_out = sig_out[:, -1] # get last batch of labels
        # return last sigmoid output and hidden state
        return sig_out, hidden

    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x hidden_dim,
        # initialized to zero, for hidden state and cell state of LSTM
        h0 = torch.zeros((self.no_layers,batch_size,self.hidden_dim)).to(device)
        c0 = torch.zeros((self.no_layers,batch_size,self.hidden_dim)).to(device)
        hidden = (h0,c0)
        return hidden
no_layers = 2
vocab_size = len(vocab) + 1 #extra 1 for padding
embedding_dim = 64
output_dim = 1
hidden_dim = 256
model = SentimentRNN(no_layers,vocab_size,hidden_dim,embedding_dim,drop_prob=0.5)
#moving to gpu
model.to(device)

SentimentRNN(
  (embedding): Embedding(1001, 64)
  (lstm): LSTM(64, 256, num_layers=2, batch_first=True)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (sig): Sigmoid()
)

# Define learning rate and accuracy func

In [39]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
lr = 0.001
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

def acc(pred,label):
    pred = torch.round(pred.squeeze())
    return torch.sum(pred == label.squeeze()).item()
    # return accuracy_score(label, pred)

# Train Model

In [40]:
clip = 5
epochs = 100
valid_loss_min = np.Inf
# train for some number of epochs
epoch_tr_loss,epoch_vl_loss = [],[]
epoch_tr_acc,epoch_vl_acc = [],[]
for epoch in range(epochs):
    a = time.time()
    train_losses = []
    train_acc = 0.0
    model.train()
    # initialize hidden state
    h = model.init_hidden(batch_size)
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        # Creating new variables for the hidden state, otherwise
        # we'd backprop through the entire training history
        h = tuple([each.data for each in h])
        model.zero_grad()
        output,h = model(inputs,h)
        # calculate the loss and perform backprop
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        train_losses.append(loss.item())
        # calculating accuracy
        accuracy = acc(output,labels)
        train_acc += accuracy
        #`clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

    val_h = model.init_hidden(batch_size)
    val_losses = []
    val_acc = 0.0
    model.eval()
    for inputs, labels in valid_loader:
            val_h = tuple([each.data for each in val_h])
            inputs, labels = inputs.to(device), labels.to(device)
            output, val_h = model(inputs, val_h)
            val_loss = criterion(output.squeeze(), labels.float())
            val_losses.append(val_loss.item())
            accuracy = acc(output,labels)
            #add f1 for test predictions
            val_acc += accuracy
    epoch_train_loss = np.mean(train_losses)
    epoch_val_loss = np.mean(val_losses)
    epoch_train_acc = train_acc/len(train_loader.dataset)
    epoch_val_acc = val_acc/len(valid_loader.dataset)
    epoch_tr_loss.append(epoch_train_loss)
    epoch_vl_loss.append(epoch_val_loss)
    epoch_tr_acc.append(epoch_train_acc)
    epoch_vl_acc.append(epoch_val_acc)
    b = time.time()
    t = b-a
    print("Train acc:", epoch_train_acc)
    print("Train loss:", epoch_train_loss)
    print("Val loss:", epoch_val_acc)
    print("Val acc:", epoch_val_loss)
    print(f'Epoch {epoch+1}: {t:.3f} seconds')

Train acc: 0.9250793650793651
Train loss: 0.2549446557485868
Val loss: 0.9485714285714286
Val acc: 0.20457159940685546
Epoch 1: 4.036 seconds
Train acc: 0.9476190476190476
Train loss: 0.20860992037942486
Val loss: 0.9485714285714286
Val acc: 0.20782707631587982
Epoch 2: 3.983 seconds
Train acc: 0.9476190476190476
Train loss: 0.19824900705781248
Val loss: 0.9485714285714286
Val acc: 0.2133760846086911
Epoch 3: 3.911 seconds
Train acc: 0.9476190476190476
Train loss: 0.20558967938025793
Val loss: 0.9485714285714286
Val acc: 0.19838525567735946
Epoch 4: 3.919 seconds
Train acc: 0.9485714285714286
Train loss: 0.18521683621737692
Val loss: 0.9457142857142857
Val acc: 0.16567846706935338
Epoch 5: 3.943 seconds
Train acc: 0.9552380952380952
Train loss: 0.14712141690746186
Val loss: 0.9514285714285714
Val acc: 0.16439263735498702
Epoch 6: 3.914 seconds
Train acc: 0.9615873015873015
Train loss: 0.12000548325124241
Val loss: 0.9514285714285714
Val acc: 0.18229569920471736
Epoch 7: 3.934 seconds
T

In [None]:
def predict_text(texts):
  l = []
  for text in texts:
    word_seq = np.array([vocab[preprocess_string(word)] for word in text.split()
                      if preprocess_string(word) in vocab.keys()])
    word_seq = np.expand_dims(word_seq,axis=0)
    pad =  torch.from_numpy(padding_(word_seq,500))
    inputs = pad.to(device)
    batch_size = 1
    h = model.init_hidden(batch_size)
    h = tuple([each.data for each in h])
    output, h = model(inputs, h)
    l.append(output.item())
  return l

index = 4274
print(df['selftext'][4271])
print('='*70)
print(f'Actual sentiment is  : {df["labels"][4270:4272]}')
print('='*70)
pro = predict_text(df['selftext'][4270:index])
print(pro)
# status = "positive" if pro > 0.5 else "negative"
# pro = (1 - pro) if status == "negative" else pro
# print(f'Predicted sentiment is {status} with a probability of {pro}')

I’ve been meaning to make a post on here for a while, but I’m happy to finally say I’ve been vape free for 61 days now! I’d only been previously vaping for about a year so my case isn’t as bad as some, but I hated how attached I was to my vape and wanted to quit. I knew it wasn’t good for my health and since I’m into some singing and vocal performance, I knew I needed to stop soon. 

Family and friends were encouraging me for a few months but it didn’t help out too much. Sometimes I’d try to stop for a day or two, but I’d see my vape and get right back to it. But, one day I thought “I’m not gonna let this control me.” Instead of “I think I’m going to quit”, I said “I WILL quit vaping.” 

The first couple days it was terrible. By day 3 I got so close to going to buy one, but I thought back to myself and kept going. I will say though, once I made it to 1 week, it got a lot easier from there. I still get headaches and urges even now, but they normally go away after a few minutes. 

For an