In [1]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud,STOPWORDS
from bs4 import BeautifulSoup
import re,string,unicodedata

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, plot_confusion_matrix, plot_roc_curve, plot_precision_recall_curve
from xgboost.sklearn import XGBClassifier

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Dense,Input, Embedding,LSTM,Dropout,Conv1D, MaxPooling1D, GlobalMaxPooling1D,Dropout,Bidirectional,Flatten,BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import plot_model
import transformers
import tokenizers

In [2]:
data=pd.read_csv('../input/depression/depression_train.csv')
data.head()

Unnamed: 0,Text_data,Label
0,Waiting for my mind to have a breakdown once t...,moderate
1,My new years resolution : I'm gonna get my ass...,moderate
2,New year : Somone else Feeling like 2020 will ...,moderate
3,"My story I guess : Hi, Im from Germany and my ...",moderate
4,Sat in the dark and cried myself going into th...,moderate


In [3]:
stop = stopwords.words('english')
wl = WordNetLemmatizer()
mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", 
           "'cause": "because", "could've": "could have", "couldn't": "could not", 
           "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", 
           "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", 
           "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", 
           "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", 
           "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", 
           "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have",
           "i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", 
           "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have",
           "it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", 
           "might've": "might have","mightn't": "might not","mightn't've": "might not have", 
           "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", 
           "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", 
           "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", 
           "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", 
           "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", 
           "she's": "she is", "should've": "should have", "shouldn't": "should not", 
           "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is",
           "that'd": "that would", "that'd've": "that would have", "that's": "that is", 
           "there'd": "there would", "there'd've": "there would have", "there's": "there is", 
           "here's": "here is","they'd": "they would", "they'd've": "they would have", 
           "they'll": "they will", "they'll've": "they will have", "they're": "they are", 
           "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", 
           "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", 
           "we're": "we are", "we've": "we have", "weren't": "were not", 
           "what'll": "what will", "what'll've": "what will have","what're": "what are",  
           "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", 
           "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", 
           "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", 
           "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", 
           "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", 
           "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have",
           "y'all're": "you all are","y'all've": "you all have","you'd": "you would", 
           "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", 
           "you're": "you are", "you've": "you have" }
#function to clean data
def clean_text(text,lemmatize = True):
    soup = BeautifulSoup(text, "html.parser") #remove html tags
    text = soup.get_text()
    text = ' '.join([mapping[t] if t in mapping else t for t in text.split(" ")]) #expanding chatwords and contracts clearing contractions
    emoji_clean= re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    text = emoji_clean.sub(r'',text)
    text = re.sub(r'\.(?=\S)', '. ',text) #add space after full stop
    text = re.sub(r'http\S+', '', text) #remove urls
    text = "".join([word.lower() for word in text if word not in string.punctuation]) #remove punctuation
    #tokens = re.split('\W+', text) #create tokens
    if lemmatize:
        text = " ".join([wl.lemmatize(word) for word in text.split() if word not in stop and word.isalpha()]) #lemmatize
    else:
        text = " ".join([word for word in text.split() if word not in stop and word.isalpha()]) 
    return text
data_copy = data.copy()
data['Text_data']=data['Text_data'].apply(clean_text,lemmatize = True)
#converting target variable to numeric labels
labeling = {'moderate':1, 'severe':2, 'not depression':0}
data['Label'] = data['Label'].apply(lambda x : labeling[x])
#data.label = [ 1 if each == "positive" 0 else if each=="not depression" else 2 for each in data.Text_data]
#after converting labels
data.head()

Unnamed: 0,Text_data,Label
0,waiting mind breakdown feeling anymore know an...,1
1,new year resolution gonna get as therapist off...,1
2,new year somone else feeling like last year ea...,1
3,story guess hi im germany english mostly self ...,1
4,sat dark cried going new year great start,1


In [4]:
#splitting into train and test
data_copy['Text_data']=data_copy['Text_data'].apply(clean_text,lemmatize = False)
#converting target variable to numerical value
#labeling = {'moderate':1, 'severe':2, 'not depression':0}
#test_data['Label'] = test_data['Label'].apply(lambda x : labeling[x])
data_copy.sentiment = [ {'moderate':1, 'severe':2, 'not depression':0} for each in data_copy.Label]
train, test= train_test_split(data_copy, test_size=0.2, random_state=42)
Xtrain, ytrain = train['Text_data'], train['Label']
Xtest, ytest = test['Text_data'], test['Label']

  


In [5]:
#set up the tokenizer
MAX_VOCAB_SIZE = 10000
tokenizer = Tokenizer(num_words = MAX_VOCAB_SIZE,oov_token="<oov>")
tokenizer.fit_on_texts(Xtrain)
word_index = tokenizer.word_index
#print(word_index)
V = len(word_index)
print("Vocabulary of the dataset is : ",V)

Vocabulary of the dataset is :  12037


In [6]:
##create sequences of reviews
seq_train = tokenizer.texts_to_sequences(Xtrain)
seq_test =  tokenizer.texts_to_sequences(Xtest)
#choice of maximum length of sequences
seq_len_list = [len(i) for i in seq_train + seq_test]

#if we take the direct maximum then
max_len=max(seq_len_list)
print('Maximum length of sequence in the list: {}'.format(max_len))

Maximum length of sequence in the list: 1385


In [7]:
# when setting the maximum length of sequence, variability around the average is used.
max_seq_len = np.mean(seq_len_list) + 2 * np.std(seq_len_list)
max_seq_len = int(max_seq_len)
print('Maximum length of the sequence when considering data only two standard deviations from average: {}'.format(max_seq_len))

Maximum length of the sequence when considering data only two standard deviations from average: 239


In [8]:
perc_covered = np.sum(np.array(seq_len_list) < max_seq_len) / len(seq_len_list)*100
print('The above calculated number coveres approximately {} % of data'.format(np.round(perc_covered,2)))

The above calculated number coveres approximately 95.87 % of data


In [9]:
stop_words = set(stopwords.words('english'))
df = pd.read_csv('../input/depression/depression_train.csv')
#Data Processing — convert to lower case, Remove punctuation etc
def data_preprocessing(text):
    text = text.lower()
    text = re.sub('<.*?>', '', text) # Remove HTML from text
    text = ''.join([c for c in text if c not in string.punctuation])# Remove punctuation
    text = [word for word in text.split() if word not in stop_words]
    text = ' '.join(text)
    return text

df['cleaned_Text_data'] = df['Text_data'].apply(data_preprocessing)
df.head()

Unnamed: 0,Text_data,Label,cleaned_Text_data
0,Waiting for my mind to have a breakdown once t...,moderate,waiting mind breakdown “new year” feeling isn’...
1,My new years resolution : I'm gonna get my ass...,moderate,new years resolution im gonna get ass therapis...
2,New year : Somone else Feeling like 2020 will ...,moderate,new year somone else feeling like 2020 last ye...
3,"My story I guess : Hi, Im from Germany and my ...",moderate,story guess hi im germany english mostly self ...
4,Sat in the dark and cried myself going into th...,moderate,sat dark cried going new year great start 2020


In [10]:
def get_corpus(text):
    words = []
    for i in text:
        for j in i.split():
            words.append(j.strip())
    return words
corpus = get_corpus(data.Text_data)
corpus[:5]

['waiting', 'mind', 'breakdown', 'feeling', 'anymore']

In [11]:
from collections import Counter
counter = Counter(corpus)
most_common = counter.most_common(10)
most_common = pd.DataFrame(most_common,columns = ['corpus','countv'])
most_common

Unnamed: 0,corpus,countv
0,like,9236
1,feel,8674
2,year,8313
3,know,6011
4,want,5684
5,life,5592
6,get,5344
7,time,5271
8,even,4393
9,friend,4227


In [12]:
def get_ngrams(review, n, g):
    vec = CountVectorizer(ngram_range=(g, g)).fit(review)
    bag_of_words = vec.transform(review) #sparse matrix of count_vectorizer
    sum_words = bag_of_words.sum(axis=0) #total number of words
    sum_words = np.array(sum_words)[0].tolist() #convert to list
    words_freq = [(word, sum_words[idx]) for word, idx in vec.vocabulary_.items()] #get word freqency for word location in count vec
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True) #key is used to perform sorting using word_freqency 
    return words_freq[:n]

In [13]:
#splitting into train and test
train, test= train_test_split(data, test_size=0.2, random_state=42)
Xtrain, ytrain = train['Text_data'], train['Label']
Xtest, ytest = test['Text_data'], test['Label']
#Vectorizing data

tfidf_vect = TfidfVectorizer() #tfidfVectorizer
Xtrain_tfidf = tfidf_vect.fit_transform(Xtrain)
Xtest_tfidf = tfidf_vect.transform(Xtest)


count_vect = CountVectorizer() # CountVectorizer
Xtrain_count = count_vect.fit_transform(Xtrain)
Xtest_count = count_vect.transform(Xtest)

In [14]:
#Tokenize
corpus = [word for text in df['cleaned_Text_data'] for word in text.split()]
count_words = Counter(corpus)
sorted_words = count_words.most_common()

In [15]:
vocab_to_int = {w:i+1 for i, (w,c) in enumerate(sorted_words)}

Text_data_int = []
for text in df['cleaned_Text_data']:
    r = [vocab_to_int[word] for word in text.split()]
    Text_data_int.append(r)

print(Text_data_int[:1])
df['Text_data int'] = Text_data_int

[[467, 133, 1011, 1717, 1805, 41, 517, 58, 18, 4, 48, 86, 5, 130, 191, 596, 261, 24, 45, 50, 104, 12, 57, 61, 10, 126, 2030, 109, 184, 2628, 427, 104, 346, 338, 136, 362, 461, 10, 1384, 482, 293, 673, 34, 384, 43, 753, 3, 124, 191, 249]]


In [16]:
def condition(x):
    if x=='moderate':
        return 1
    elif x=='severe':
        return 2
    else:
        return 0
df['Label'] = df['Label'].apply(condition)
df.head()

Unnamed: 0,Text_data,Label,cleaned_Text_data,Text_data int
0,Waiting for my mind to have a breakdown once t...,1,waiting mind breakdown “new year” feeling isn’...,"[467, 133, 1011, 1717, 1805, 41, 517, 58, 18, ..."
1,My new years resolution : I'm gonna get my ass...,1,new years resolution im gonna get ass therapis...,"[13, 16, 1105, 1, 189, 9, 619, 982, 1623, 8, 3..."
2,New year : Somone else Feeling like 2020 will ...,1,new year somone else feeling like 2020 last ye...,"[13, 10, 3358, 86, 41, 2, 70, 61, 10, 1021, 11..."
3,"My story I guess : Hi, Im from Germany and my ...",1,story guess hi im germany english mostly self ...,"[367, 182, 707, 1, 1583, 995, 674, 159, 1169, ..."
4,Sat in the dark and cried myself going into th...,1,sat dark cried going new year great start 2020,"[725, 332, 351, 23, 13, 10, 176, 109, 70]"


In [17]:
Text_data_len = [len(x) for x in Text_data_int]
df['Text_data len'] = Text_data_len
df.head()

Unnamed: 0,Text_data,Label,cleaned_Text_data,Text_data int,Text_data len
0,Waiting for my mind to have a breakdown once t...,1,waiting mind breakdown “new year” feeling isn’...,"[467, 133, 1011, 1717, 1805, 41, 517, 58, 18, ...",50
1,My new years resolution : I'm gonna get my ass...,1,new years resolution im gonna get ass therapis...,"[13, 16, 1105, 1, 189, 9, 619, 982, 1623, 8, 3...",26
2,New year : Somone else Feeling like 2020 will ...,1,new year somone else feeling like 2020 last ye...,"[13, 10, 3358, 86, 41, 2, 70, 61, 10, 1021, 11...",18
3,"My story I guess : Hi, Im from Germany and my ...",1,story guess hi im germany english mostly self ...,"[367, 182, 707, 1, 1583, 995, 674, 159, 1169, ...",296
4,Sat in the dark and cried myself going into th...,1,sat dark cried going new year great start 2020,"[725, 332, 351, 23, 13, 10, 176, 109, 70]",9


# Padding/truncating

In [18]:
def Padding(Text_data_int, seq_len):
    features = np.zeros((len(Text_data_int), seq_len), dtype = int)
    for i, Text_data in enumerate(Text_data_int):
        if len(Text_data) <= seq_len:
            zeros = list(np.zeros(seq_len - len(Text_data)))
            new = zeros + Text_data
        else:
            new = Text_data[: seq_len]
        features[i, :] = np.array(new)
            
    return features
features = Padding(Text_data_int, 200)
print(features[0, :])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0  467  133 1011 1717
 1805   41  517   58   18    4   48   86    5  130  191  596  261   24
   45   50  104   12   57   61   10  126 2030  109  184 2628  427  104
  346  338  136  362  461   10 1384  482  293  673   34  384   43  753
    3 

In [19]:
X_train, X_remain, y_train, y_remain = train_test_split(features, df['Label'].to_numpy(), test_size=0.2, random_state=1)
X_valid, X_test, y_valid, y_test = train_test_split(X_remain, y_remain, test_size=0.5, random_state=1)

In [20]:
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
# create tensor dataset
train_data = TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train))
test_data = TensorDataset(torch.from_numpy(X_test), torch.from_numpy(y_test))
valid_data = TensorDataset(torch.from_numpy(X_valid), torch.from_numpy(y_valid))

# dataloaders
batch_size = 50

train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size)
valid_loader = DataLoader(valid_data, shuffle=True, batch_size=batch_size)
# obtain one batch of training data
dataiter = iter(train_loader)
sample_x, sample_y = dataiter.next()

print('Sample input size: ', sample_x.size()) # batch_size, seq_length
print('Sample input: \n', sample_x)
print('Sample input: \n', sample_y)

Sample input size:  torch.Size([50, 200])
Sample input: 
 tensor([[    0,     0,     0,  ...,   111,  1842,   131],
        [    0,     0,     0,  ...,   272,   229,    48],
        [    0,     0,     0,  ...,    60, 12273,   131],
        ...,
        [    0,     0,     0,  ...,    59,  1899,   287],
        [    0,     0,     0,  ...,   124,     2,   443],
        [    0,     0,     0,  ...,  2289,  9602,    22]])
Sample input: 
 tensor([0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 0, 1, 0, 1, 1, 1, 2, 1, 2, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0,
        1, 1])


In [21]:
class SentimentRNN(nn.Module):
    def __init__(self,no_layers,vocab_size,hidden_dim,embedding_dim,drop_prob=0.5):
        super(SentimentRNN,self).__init__()
 
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
 
        self.no_layers = no_layers
        self.vocab_size = vocab_size
    
        # embedding and LSTM layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        #lstm
        self.lstm = nn.LSTM(input_size=embedding_dim,hidden_size=self.hidden_dim,
                           num_layers=no_layers, batch_first=True)
        
        
        # dropout layer
        self.dropout = nn.Dropout(0.3)
    
        # linear and sigmoid layer
        self.fc = nn.Linear(self.hidden_dim, output_dim)
        self.sig = nn.Sigmoid()
        
    def forward(self,x,hidden):
        batch_size = x.size(0)
        # embeddings and lstm_out
        embeds = self.embedding(x)  # shape: B x S x Feature   since batch = True
        #print(embeds.shape)  #[50, 500, 1000]
        lstm_out, hidden = self.lstm(embeds, hidden)
        
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim) 
        
        # dropout and fully connected layer
        out = self.dropout(lstm_out)
        out = self.fc(out)
        
        # sigmoid function
        sig_out = self.sig(out)
        
        # reshape to be batch_size first
        sig_out = sig_out.view(batch_size, -1)

        sig_out = sig_out[:, -1] # get last batch of labels
        
        # return last sigmoid output and hidden state
        return sig_out, hidden
        
        
        
    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x hidden_dim,
        # initialized to zero, for hidden state and cell state of LSTM
        h0 = torch.zeros((self.no_layers,batch_size,self.hidden_dim)).to(device)
        c0 = torch.zeros((self.no_layers,batch_size,self.hidden_dim)).to(device)
        hidden = (h0,c0)
        return hidden

In [22]:
is_cuda = torch.cuda.is_available()

# If we have a GPU available, we'll set our device to GPU. We'll use this device variable later in our code.
if is_cuda:
    device = torch.device("cuda")
    print("GPU is available")
else:
    device = torch.device("cpu")
    print("GPU not available, CPU used")

GPU is available


In [23]:
base_csv = '../input/depression/depression_train.csv'
df = pd.read_csv(base_csv)
df.head()

Unnamed: 0,Text_data,Label
0,Waiting for my mind to have a breakdown once t...,moderate
1,My new years resolution : I'm gonna get my ass...,moderate
2,New year : Somone else Feeling like 2020 will ...,moderate
3,"My story I guess : Hi, Im from Germany and my ...",moderate
4,Sat in the dark and cried myself going into th...,moderate


In [24]:
X,y = df['Text_data'].values,df['Label'].values
x_train,x_test,y_train,y_test = train_test_split(X,y,stratify=y)
print(f'shape of train data is {x_train.shape}')
print(f'shape of test data is {x_test.shape}')

shape of train data is (6668,)
shape of test data is (2223,)


In [25]:
def preprocess_string(s):
    # Remove all non-word characters (everything except numbers and letters)
    s = re.sub(r"[^\w\s]", '', s)
    # Replace all runs of whitespaces with no space
    s = re.sub(r"\s+", '', s)
    # replace digits with no space
    s = re.sub(r"\d", '', s)

    return s

def tockenize(x_train,y_train,x_val,y_val):
    word_list = []

    stop_words = set(stopwords.words('english')) 
    for sent in x_train:
        for word in sent.lower().split():
            word = preprocess_string(word)
            if word not in stop_words and word != '':
                word_list.append(word)
  
    corpus = Counter(word_list)
    # sorting on the basis of most common words
    corpus_ = sorted(corpus,key=corpus.get,reverse=True)[:1000]
    # creating a dict
    onehot_dict = {w:i+1 for i,w in enumerate(corpus_)}
    
    # tockenize
    final_list_train,final_list_test = [],[]
    for sent in x_train:
            final_list_train.append([onehot_dict[preprocess_string(word)] for word in sent.lower().split() 
                                     if preprocess_string(word) in onehot_dict.keys()])
    for sent in x_val:
            final_list_test.append([onehot_dict[preprocess_string(word)] for word in sent.lower().split() 
                                    if preprocess_string(word) in onehot_dict.keys()])
            
    encoded_train = [1 if label =='positive' else 0 for label in y_train]  
    encoded_test = [1 if label =='positive' else 0 for label in y_val] 
    return np.array(final_list_train), np.array(encoded_train),np.array(final_list_test), np.array(encoded_test),onehot_dict

In [26]:
x_train,y_train,x_test,y_test,vocab = tockenize(x_train,y_train,x_test,y_test)



In [27]:
def padding_(sentences, seq_len):
    features = np.zeros((len(sentences), seq_len),dtype=int)
    for ii, review in enumerate(sentences):
        if len(review) != 0:
            features[ii, -len(review):] = np.array(review)[:seq_len]
    return features
#we have very less number of reviews with length > 500.
#So we will consideronly those below it.
x_train_pad = padding_(x_train,500)
x_test_pad = padding_(x_test,500)

In [28]:
# create Tensor datasets
train_data = TensorDataset(torch.from_numpy(x_train_pad), torch.from_numpy(y_train))
valid_data = TensorDataset(torch.from_numpy(x_test_pad), torch.from_numpy(y_test))

# dataloaders
batch_size = 50

# make sure to SHUFFLE your data
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
valid_loader = DataLoader(valid_data, shuffle=True, batch_size=batch_size)
# obtain one batch of training data
dataiter = iter(train_loader)
sample_x, sample_y = dataiter.next()

print('Sample input size: ', sample_x.size()) # batch_size, seq_length
print('Sample input: \n', sample_x)
print('Sample input: \n', sample_y)

Sample input size:  torch.Size([50, 500])
Sample input: 
 tensor([[  0,   0,   0,  ..., 206,   3,   5],
        [  0,   0,   0,  ..., 191,  36,  49],
        [  0,   0,   0,  ...,  25,  22,   7],
        ...,
        [  0,   0,   0,  ..., 187, 265,  39],
        [  0,   0,   0,  ..., 234,   1, 460],
        [  0,   0,   0,  ..., 253, 861, 336]])
Sample input: 
 tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0])


In [29]:
no_layers = 2
vocab_size = len(vocab) + 1 #extra 1 for padding
embedding_dim = 64
output_dim = 1
hidden_dim = 256


model = SentimentRNN(no_layers,vocab_size,hidden_dim,embedding_dim,drop_prob=0.5)

#moving to gpu
model.to(device)

print(model)

SentimentRNN(
  (embedding): Embedding(1001, 64)
  (lstm): LSTM(64, 256, num_layers=2, batch_first=True)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (sig): Sigmoid()
)


In [30]:
# loss and optimization functions
lr=0.001

criterion = nn.BCELoss()

optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# function to predict accuracy
def acc(pred,label):
    pred = torch.round(pred.squeeze())
    return torch.sum(pred == label.squeeze()).item()

In [31]:
def predict_text(text):
        word_seq = np.array([vocab[preprocess_string(word)] for word in text.split() 
                         if preprocess_string(word) in vocab.keys()])
        word_seq = np.expand_dims(word_seq,axis=0)
        pad =  torch.from_numpy(padding_(word_seq,500))
        inputs = pad.to(device)
        batch_size = 1
        h = model.init_hidden(batch_size)
        h = tuple([each.data for each in h])
        output, h = model(inputs, h)
        return(output.item())

In [32]:
index = 30
print(df['Text_data'][index])
print('='*70)
print(f'Actual sentiment is  : {df["Label"][index]}')
print('='*70)
pro = predict_text(df['Text_data'][index])
status = "positive" if pro > 0.5 else "negative"
pro = (1 - pro) if status == "negative" else pro
print(f'Predicted sentiment is {status} with a probability of {pro}')

I just got let go from my job yesterday and I am so depressed right now. : Hey everyone. I’m pretty new to this community but from what I have read it seems really supportive. I was already kind of experiencing a depressive episode but this was just the cherry on top. I’m too depressed to cry if that’s even possible. Do you guys have any tips on how to feel better? I’ve been applying to new jobs all day. thanks in advance ❤️
Actual sentiment is  : moderate
Predicted sentiment is positive with a probability of 0.5080389976501465


In [33]:
index = 32
print(df['Text_data'][index])
print('='*70)
print(f'Actual sentiment is  : {df["Label"][index]}')
pro = predict_text(df['Text_data'][index])
status = "positive" if pro > 0.5 else "negative"
pro = (1 - pro) if status == "negative" else pro
print(f'predicted sentiment is {status} with a probability of {pro}')

2019 was my worst year with 2 depression crisis. I'm happy it ended but so afraid of what 2020 will bring. : This years was rough. It started on the NYE with my puppy almost dying from the fireworks. He literally shat all over myself. In may I had my first terrible depression and anxiety crisis and had to be away from my internship for 2 weeks. Then in June I went through the first real loss of my life. My dear uncle died from a heart attack al of sudden. Even though all of this, I got promoted on my job. It was a complicated time dealing with new responsibilities and grief. I fell on depression again and this time was worse  I literally wanted to die. Even hurted myself. This made me get away from work again but now for 2 months. I had great support from family and the company I work for, but it was hard to get back on my feet again. During all of these I had to leave university for the year as I couldn't deal with the pression at the time. The thing is I study Industrial Engineer and