In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
import torch.nn as nn
import torch.nn.functional as F
!pip install nltk
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from collections import Counter
import string
import re
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from matplotlib import pyplot
import time
from sklearn.model_selection import StratifiedKFold
import os
import warnings
warnings.filterwarnings('always')



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
is_cuda = torch.cuda.is_available()
if is_cuda:
    device = torch.device("cuda")
    print("GPU is available")
else:
    device = torch.device("cpu")
    print("GPU not available, CPU used")

  and should_run_async(code)


GPU is available


# Load Dataset

In [5]:
df = pd.read_csv("/content/Labeled Posts - preprocessed_csv.csv")
# check class distribution
df['labels'].value_counts(normalize = True)

  and should_run_async(code)


1.0    0.522571
4.0    0.345714
2.0    0.052286
3.0    0.043429
0.0    0.028286
5.0    0.007714
Name: labels, dtype: float64

# Preprocess/Tokenization/Padding
(Convert text into numbers)



In [6]:
def preprocess_string(s):
    # replace digits with no space
    s = re.sub(r"\d", '', s)
    split = re.findall(r"[\w']+|[!?]", s)
    i = 0
    while i < len(split)-1:
      if split[i] == split[i+1]:
        del split[i+1]
        i-=1
      i+= 1
    # Remove all instances of \n where only the n remains
    split = [i for i in split if i != "n"]
    return " ".join(split)

def tokenize(x_train, y_train):
    word_list = []
    stop_words = set(stopwords.words('english'))
    for sent in x_train:
        for word in sent.lower().split():
            word = preprocess_string(word)
            if word not in stop_words and word != '':
                word_list.append(word)
    corpus = Counter(word_list)
    # sorting on the basis of most common words
    corpus_ = sorted(corpus,key=corpus.get,reverse=True)[:1000]                #Only using top 1000 words to train model
    # creating a dict
    onehot_dict = {w:i+1 for i,w in enumerate(corpus_)}
    # tokenize
    final_list_train,final_list_test = [],[]
    for sent in x_train:
            final_list_train.append([onehot_dict[preprocess_string(word)]  for word in sent.lower().split()
                                     if preprocess_string(word) in onehot_dict.keys()])
    encoded_train = [1 if label == 2.0 else 0 for label in y_train]
    return np.array(final_list_train), np.array(encoded_train),onehot_dict
def padding_(sentences, seq_len):
    features = np.zeros((len(sentences), seq_len),dtype=int)
    for ii, review in enumerate(sentences):
        if len(review) != 0:
            features[ii, -len(review):] = np.array(review)[:seq_len]
    return features

  and should_run_async(code)


# Shuffle Train/Val_data

> Get 10 folds (split train_data into validation & train set - 1:9 ratio)



In [7]:
batch_size = 35    #  Number of posts per batch (when training); GCF of 2835, 315, 350 (train, val, test) = 35
def shuffler(x, y):
  data = TensorDataset(torch.from_numpy(x), torch.from_numpy(y))
  # make sure to SHUFFLE your data
  #train_loader is a list of len 63 - we divided 2835 posts in x_train into 81 batchs of 35 posts
  #each of the 63 lists are made of 2 sublists, both of len 35 - first is the padded texts, second is the list of labels
  loader = DataLoader(data, shuffle=True, batch_size=batch_size)           # shuffled train_data/test_data
  return loader

  and should_run_async(code)


In [8]:
X,y = df['selftext'][0:3500].values,df['labels'][0:3500].values
#Tokenize
X, y, vocab = tokenize(X, y)
#Padding
X = padding_(X, 500)

#Split xtr/ytr into train/val data
skf = StratifiedKFold(n_splits=10)
train_loader, valid_loader, test_loader = [], [], []
for i, (train_index, test_index) in enumerate(skf.split(X, y)):
  xtrain, ytrain, xtest, ytest = [], [], [], []
  for j in train_index:
    xtrain.append(X[j])
    ytrain.append(y[j])
  for j in test_index:
    xtest.append(X[j])
    ytest.append(y[j])

  xtrain, ytrain, xtest, ytest = map(np.array, [xtrain, ytrain, xtest, ytest])

  #Split into train + val
  xtrain, xval, ytrain, yval = train_test_split(xtrain, ytrain, stratify=ytrain, test_size=0.1)

  #Shuffle/make dataloader (batches)
  trainloader = shuffler(xtrain, ytrain)
  valloader = shuffler(xval, yval)
  testloader = shuffler(xtest, ytest)

  train_loader.append(trainloader)
  valid_loader.append(valloader)
  test_loader.append(testloader)

#train_loader: 10 folds with 63 batches of 45 posts, valid_loader: 10 folds with 7 batches of 45 posts, test_loader: 10 folds with 8 batches of 45 posts

#Train: 2835 posts, Validation: 315 posts, Test: 350 posts

  and should_run_async(code)
  return np.array(final_list_train), np.array(encoded_train),onehot_dict


# Create Neural Network Model

In [9]:
class SentimentRNN(nn.Module):
    def __init__(self, no_layers, vocab_size, hidden_dim, embedding_dim, drop_prob=0.5):
        super(SentimentRNN,self).__init__()
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.no_layers = no_layers
        self.vocab_size = vocab_size
        # embedding and LSTM layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        #lstm
        self.lstm = nn.LSTM(input_size=embedding_dim,hidden_size=self.hidden_dim, num_layers=no_layers, batch_first=True)

        # dropout layer
        self.dropout = nn.Dropout(0.3)
        # linear and sigmoid layer
        self.fc = nn.Linear(self.hidden_dim, output_dim)
        self.sig = nn.Sigmoid()
    def forward(self,x,hidden):
        batch_size = x.size(0)
        # embeddings and lstm_out
        embeds = self.embedding(x)  # shape: B x S x Feature   since batch = True
        #print(embeds.shape)  #[50, 500, 1000]
        lstm_out, hidden = self.lstm(embeds, hidden)
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        # dropout and fully connected layer
        out = self.dropout(lstm_out)
        out = self.fc(out)
        # sigmoid function
        sig_out = self.sig(out)
        # reshape to be batch_size first
        sig_out = sig_out.view(batch_size, -1)
        sig_out = sig_out[:, -1] # get last batch of labels
        # return last sigmoid output and hidden state
        return sig_out, hidden

    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x hidden_dim,
        # initialized to zero, for hidden state and cell state of LSTM
        h0 = torch.zeros((self.no_layers,batch_size,self.hidden_dim)).to(device)
        c0 = torch.zeros((self.no_layers,batch_size,self.hidden_dim)).to(device)
        hidden = (h0,c0)
        return hidden
no_layers = 2
vocab_size = len(vocab) + 1 #extra 1 for padding
embedding_dim = 64
output_dim = 1
hidden_dim = 256
model = SentimentRNN(no_layers,vocab_size,hidden_dim,embedding_dim,drop_prob=0.5)

  and should_run_async(code)


# Define learning rate and accuracy func

In [10]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
lr = 0.001
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

def merge_arrs(pred, label):
  new_pred = []
  for i in pred:
    new_pred.extend(list(i))
  pred = np.array(new_pred)
  new_label = []
  for i in label:
    new_label.extend(list(i))
  label = np.array(new_label)
  return pred, label

def acc(pred,label):
  # pred = torch.round(pred.squeeze())
  # pred = pred.to("cpu").detach().numpy().squeeze()
  # label = label.to("cpu").detach().numpy().squeeze()
  return accuracy_score(label, pred)

def f1(pred, label):
  # pred = torch.round(pred.squeeze())
  # pred = pred.to("cpu").detach().numpy().squeeze()
  # label = label.to("cpu").detach().numpy().squeeze()
  pred, label = merge_arrs(pred, label)
  return f1_score(label, pred, labels=np.unique(pred))

  and should_run_async(code)


# Train Model

In [11]:
def stop(acc_list):
  a = 0
  for i in range(len(test_acc), len(test_acc)-10, -1):
    if acc_list[i] > acc_list[i-1]: #check if loss is INCREASING (getting worse)
      a += 1
  return True if a > 7 else False

  and should_run_async(code)


In [12]:
def test(model, test_loader):
  test_h = model.init_hidden(batch_size)
  test_losses = []
  test_f1 = 0.0
  preds_all, labels_all = [], []
  model.eval()
  c = 0
  t5 = 0
  for ind, (inputs, labels) in enumerate(test_loader):
    test_h = tuple([each.data for each in test_h])
    inputs, labels = inputs.to(device), labels.to(device)
    output, test_h = model(inputs, test_h)
    test_loss = criterion(output.squeeze(), labels.float())
    test_losses.append(test_loss.item())

    output = torch.round(output.squeeze()).to("cpu").detach().numpy().squeeze()
    preds_all.append(output)
    labels = labels.to("cpu").detach().numpy().squeeze()
    labels_all.append(labels)
    # f1_score = f1(output,labels)
    #add f1 for test predictions
    # test_f1 += f1_score
    # #check if loss is increasing every 10 posts; save model every 5 posts
    # if ind % 5 == 0 and i != 0:
    #   save = True
    #   c += 1
    #   if c == 2:
    #     if t5 > test_loss.item():
    #       save = False
    #     if stop(test_acc):
    #       break
    #     c = 0
    #   if save:
    #     torch.save(model.state_dict(), f'/content/saved_model{ind}.pt')
    #     if ind != 5: # if ind != 5 AND save == True (meaning if we are saving a new model after 5 iterations, delete the previously saved model)
    #       os.remove(f'/content/saved_model{ind-5}.pt')
    #   t5 = test_loss.item()
  f1_score = f1(preds_all, labels_all)
  # return test_losses, test_acc/len(test_loader)
  return test_losses, f1_score

  and should_run_async(code)


In [13]:
def train(model, train_loader, valid_loader):
  clip = 5
  epochs = 100
  valid_loss_min = np.Inf
  # train for some number of epochs
  epoch_tr_loss,epoch_vl_loss = [],[]
  train_f1_scores, epoch_tr_acc,epoch_vl_acc = [],[],[]
  time = 0
  for epoch in range(epochs):
    train_losses = []
    train_acc = 0.0
    model.train()
    # initialize hidden state
    h = model.init_hidden(batch_size)
    preds_all, labels_all = [], []
    for ind, (inputs, labels) in enumerate(train_loader):
        inputs, labels = inputs.to(device), labels.to(device)
        # Creating new variables for the hidden state, otherwise
        # we'd backprop through the entire training history
        h = tuple([each.data for each in h])
        model.zero_grad()
        output,h = model(inputs,h)
        # calculate the loss and perform backprop
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        train_losses.append(loss.item())
        #store all predictions/labels in this epoch for the f1_score
        output = torch.round(output.squeeze()).to("cpu").detach().numpy().squeeze()
        preds_all.append(output)
        labels = labels.to("cpu").detach().numpy().squeeze()
        labels_all.append(labels)
        # calculating accuracy_score
        accuracy = acc(output,labels)
        train_acc += accuracy
        # clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

    #f1_score
    f1_score = f1(preds_all, labels_all)
    #Validation
    val_losses, val_acc = test(model, valid_loader)

    epoch_train_loss = np.mean(train_losses)
    epoch_val_loss = np.mean(val_losses)
    epoch_train_acc = train_acc/len(train_loader)
    epoch_val_acc = val_acc

    train_f1_scores.append(float(f'{f1_score:.3f}'))
    epoch_tr_loss.append(epoch_train_loss)
    epoch_vl_loss.append(epoch_val_loss)
    epoch_tr_acc.append(epoch_train_acc)
    epoch_vl_acc.append(epoch_val_acc)
  #return train (acc/loss), val (acc/loss)
  return train_f1_scores, max(epoch_tr_acc), min(epoch_tr_loss), max(epoch_vl_acc), epoch_vl_loss   # each list has 100 values (1 per epoch): each value is the average acc/loss after training on entire dataset

  and should_run_async(code)


In [14]:
total_t = 0
best_model = model
best_val_loss = 10000000000
#Loss/accuracy results for
best_f1, train_losses, train_accs, val_losses, val_accs, test_losses, test_f1_score = [], [], [], [], [], [], []
for i in range(10):
  a = time.time()
  #moving to gpu
  model.to(device)
  #Train
  f1_scores, epoch_tr_acc, epoch_tr_loss, epoch_vl_acc, epoch_vl_losses = train(model, train_loader[i], valid_loader[i])
  epoch_vl_loss = min(epoch_vl_losses)
  best_f1.append(max(f1_scores))
  print(f"Fold {i+1}'s 100 f1 scores (1 per epoch): {f1_scores}")
  print(f"Fold {i+1}'s 100 validation losses (1 per epoch): {epoch_vl_losses}")
  train_losses.append(float(f'{epoch_tr_loss:.5f}'))
  train_accs.append(float(f'{epoch_tr_acc:.3f}'))
  val_losses.append(float(f'{epoch_vl_loss:.5f}'))
  val_accs.append(float(f'{epoch_vl_acc:.3f}'))
  #you won't need test_losses - this is only used for validations step
  test_losses, test_f1 = test(model, test_loader[i])
  test_f1_score.append(float(f'{test_f1:.3f}'))
  if epoch_vl_loss < best_val_loss:
    best_model = model
    best_val_loss = epoch_vl_loss
  b = time.time()
  total_t += b-a
  print(f"Fold {i+1}/10 done")
  model = SentimentRNN(no_layers,vocab_size,hidden_dim,embedding_dim,drop_prob=0.5)

torch.save(best_model.state_dict(), '/content/saved_best_model.pt')
print(f"time: {total_t:.3f} seconds")
# below are lists of best accuracies/losses for each fold; AKA max accuracy out of 100 epochs
#Train results
print("Train F1s for each fold:", best_f1)
print("Train accuracies:", train_accs)
print("Train losses:", train_losses)
#Val results
print("Val accuracies:", val_accs)
print("Val losses:", val_losses)
print("Best Val loss:", best_val_loss)
#Test results
print("Test accuracies:", test_f1_score)

  and should_run_async(code)


Fold 0's 100 f1 scores (1 per epoch): [0.0, 0.026, 0.221, 0.422, 0.581, 0.803, 0.85, 0.916, 0.92, 0.976, 0.962, 0.983, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
Fold 0's 100 validation losses (1 per epoch): [0.21271194517612457, 0.1909072779946857, 0.16621250576443142, 0.16229194154342017, 0.1584204381538762, 0.1812002366512186, 0.20708582808987963, 0.2085589555402597, 0.1911847550008032, 0.2638282231572602, 0.2595466644399696, 0.2709993583395974, 0.3101692560594529, 0.327099537198794, 0.33640015231665327, 0.34041438377527, 0.35489711589697337, 0.35727751058968554, 0.36437802192469665, 0.3719286

In [18]:
print(f"time: {total_t:.3f} seconds")
# below are lists of best accuracies/losses for each fold; AKA max accuracy out of 100 epochs
#Train results
print("Train F1s for each fold:", best_f1)
print("Train accuracies:", train_accs)
print("Train losses:", train_losses)
#Val results
print("Val F1s for each fold:", val_accs)
print("Val losses:", val_losses)
print("Best Val loss:", best_val_loss)
#Test results
print("Test accuracies:", test_f1_score)
print(min(epoch_vl_losses))

time: 4706.817 seconds
Train F1s for each fold: [1.0, 0.101, 0.101, 0.026, 0.1, 0.099, 0.109, 0.103, 0.102, 0.099]
Train accuracies: [1.0, 0.06, 0.106, 0.948, 0.054, 0.053, 0.579, 0.867, 0.207, 0.053]
Train losses: [0.0, 0.71141, 0.70541, 0.67116, 0.72112, 0.72867, 0.69177, 0.68335, 0.70004, 0.72488]
Val F1s for each fold: [0.562, 0.097, 0.102, 0.0, 0.097, 0.102, 0.064, 0.174, 0.095, 0.097]
Val losses: [0.59121, 0.71168, 0.70623, 0.67193, 0.72182, 0.7282, 0.69219, 0.68399, 0.70008, 0.72522]
Best Val loss: 0.5912099112239149
Test accuracies: [0.267, 0.098, 0.098, 0.0, 0.098, 0.098, 0.076, 0.074, 0.103, 0.103]
0.725224859184689


  and should_run_async(code)


In [16]:
def predict_text(texts):
  l = []
  for text in texts:
    word_seq = np.array([vocab[preprocess_string(word)] for word in text.split()
                      if preprocess_string(word) in vocab.keys()])
    word_seq = np.expand_dims(word_seq,axis=0)
    pad =  torch.from_numpy(padding_(word_seq,500))
    inputs = pad.to(device)
    batch_size = 1
    h = model.init_hidden(batch_size)
    h = tuple([each.data for each in h])
    output, h = model(inputs, h)
    l.append(output.item())
  return l

index = 4274
print(df['selftext'][4271])
print('='*70)
print(f'Actual sentiment is  : {df["labels"][4270:4272]}')
print('='*70)
pro = predict_text(df['selftext'][4270:index])
print(pro)
# status = "positive" if pro > 0.5 else "negative"
# pro = (1 - pro) if status == "negative" else pro
# print(f'Predicted sentiment is {status} with a probability of {pro}')

I’ve been meaning to make a post on here for a while, but I’m happy to finally say I’ve been vape free for 61 days now! I’d only been previously vaping for about a year so my case isn’t as bad as some, but I hated how attached I was to my vape and wanted to quit. I knew it wasn’t good for my health and since I’m into some singing and vocal performance, I knew I needed to stop soon. 

Family and friends were encouraging me for a few months but it didn’t help out too much. Sometimes I’d try to stop for a day or two, but I’d see my vape and get right back to it. But, one day I thought “I’m not gonna let this control me.” Instead of “I think I’m going to quit”, I said “I WILL quit vaping.” 

The first couple days it was terrible. By day 3 I got so close to going to buy one, but I thought back to myself and kept going. I will say though, once I made it to 1 week, it got a lot easier from there. I still get headaches and urges even now, but they normally go away after a few minutes. 

For an

  and should_run_async(code)


RuntimeError: ignored