In [17]:
import pandas as pd
import numpy as np 
import torch
import torch.nn as nn
import torch.nn.functional as F
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
from sklearn.model_selection import train_test_split
import datetime


In [296]:
def _vocab_key(token):
    """Return the string under which `token` is stored in the vocabulary.

    Plain strings (such as the padding marker) are used verbatim. For token
    objects the normalized string form (`norm_`) is preferred when present,
    otherwise `str(token)` is used.
    NOTE(review): the original code reached for `word.norm`; with spaCy tokens
    that is an integer ID and `norm_` is its string form -- confirm that
    normalized text (rather than raw text) is the wanted granularity.
    """
    if isinstance(token, str):
        return token
    norm = getattr(token, "norm_", None)
    if isinstance(norm, str) and norm:
        return norm
    return str(token)


class _TokenKeyedDict(dict):
    """word -> index mapping with string keys that also accepts token objects.

    Keys are stored as plain strings. When a lookup with a non-string key
    misses, the key is converted with `_vocab_key` and looked up again, so
    callers that index the mapping with token objects keep working.
    """

    def __missing__(self, key):
        text = _vocab_key(key)
        if isinstance(key, str) or not dict.__contains__(self, text):
            raise KeyError(key)
        return dict.__getitem__(self, text)


class vocab_builder:
    """Build word <-> index lookup tables from a frame of tokenized tweets.

    Attributes:
        word_idx: mapping from word string to integer index (token objects are
            accepted on lookup as well).
        idx_word: inverse mapping from integer index to word string.
        longest:  number of tokens in the longest sentence seen.

    Words are numbered 0, 1, 2, ... in first-seen order. The padding marker
    '<PAD>' is pinned to index 99999, and that index is skipped when numbering
    ordinary words so a large vocabulary cannot overwrite it.
    """

    PAD_TOKEN = '<PAD>'
    PAD_INDEX = 99999

    def __init__(self, tokenized_df):
        """Scan `tokenized_df['tweet']` (one iterable of tokens per row)."""
        self.idx_word = {}
        self.longest = 0
        self.word_idx = _TokenKeyedDict()
        self._next_index = 0
        # Reserve the padding entry first so a literal '<PAD>' token in the
        # corpus maps onto it instead of receiving a second index.
        self.idx_word[self.PAD_INDEX] = self.PAD_TOKEN
        self.word_idx[self.PAD_TOKEN] = self.PAD_INDEX
        # A single pass fills both tables, keeping them exact inverses.
        tokenized_df['tweet'].apply(vocab_builder.word_to_idx, args=(self,))

    def _register(self, token):
        """Add `token` to both lookup tables unless its key is already known."""
        key = _vocab_key(token)
        if key in self.word_idx:
            return
        if self._next_index == self.PAD_INDEX:
            # Never hand the reserved padding index to an ordinary word.
            self._next_index += 1
        self.word_idx[key] = self._next_index
        self.idx_word[self._next_index] = key
        self._next_index += 1

    @staticmethod
    def word_to_idx(sentence, self):
        """Register every token of `sentence` and update `self.longest`.

        The (sentence, self) parameter order is kept because the function is
        used as a `Series.apply` callback with the builder passed via `args`.
        """
        count = 0
        for word in sentence:
            count += 1
            self._register(word)
        if self.longest < count:
            self.longest = count
        return

    @staticmethod
    def idx_to_word(sentence, self):
        """Register every token of `sentence` without touching `self.longest`.

        Kept for backward compatibility; it fills the same tables as
        `word_to_idx`, so calling it after `word_to_idx` changes nothing.
        """
        for word in sentence:
            self._register(word)
        return

In [297]:
class Dataset:
    """Read a CSV of tweets and keep the raw frame plus a tokenized copy.

    Attributes:
        data:         the frame exactly as returned by `pd.read_csv`.
        tokenized_df: copy of `data` whose 'tweet' column holds the tokenizer
                      output for each row instead of the raw text.
    """

    def __init__(self, filepath):
        """Load `filepath` and tokenize its 'tweet' column."""
        raw_frame = pd.read_csv(filepath)
        self.data = raw_frame
        self.tokenized_df = Dataset.tokenize(self, raw_frame)

    def tokenize(self, df):
        """Return a copy of `df` with every 'tweet' value run through the tokenizer.

        The tokenizer is a bare `Tokenizer` built from the vocab of a blank
        `English` pipeline; the input frame itself is not modified.
        """
        tokenizer = Tokenizer(English().vocab)

        def to_tokens(text):
            return tokenizer(text)

        return df.assign(tweet=df['tweet'].apply(to_tokens))

In [298]:
class data_processor:
    """Pad tokenized tweets to one length and map their tokens to indices.

    Attributes:
        vocab:         the vocabulary object; `vocab.longest` and
                       `vocab.word_idx` are read.
        normalized:    Series of token lists padded with '<PAD>'.
        normalized_df: input frame with 'tweet' replaced by `normalized`
                       (and the stray 'Unnamed: 0' column removed if present).
        numerized:     Series of index lists, one per tweet.
        numerized_df:  `normalized_df` plus a 'numerized_tweet' column.
    """

    PAD_TOKEN = '<PAD>'
    DEFAULT_PAD_INDEX = 99999

    def __init__(self, tokenized_df, vocab):
        self.vocab = vocab
        self.normalized = tokenized_df['tweet'].apply(data_processor.sentence_normalizer, args=(self,))
        self.normalized_df = tokenized_df.assign(tweet=self.normalized)
        # 'Unnamed: 0' only exists when the CSV was written with its index;
        # ignore its absence instead of failing with a KeyError.
        self.normalized_df = self.normalized_df.drop(columns="Unnamed: 0", errors="ignore")
        self.numerized = self.normalized_df['tweet'].apply(data_processor.numerizer, args=(self,))
        self.numerized_df = self.normalized_df.assign(numerized_tweet=self.numerized)

    @staticmethod
    def sentence_normalizer(sentence, self):
        """Return the tokens of `sentence` as a list padded to `vocab.longest`.

        Sentences already at least that long are returned unpadded. The
        (sentence, self) order is kept because this is used as a
        `Series.apply` callback with the processor passed via `args`.
        """
        final_tok = list(sentence)
        missing = self.vocab.longest - len(final_tok)
        if missing > 0:
            final_tok.extend([data_processor.PAD_TOKEN] * missing)
        return final_tok

    @staticmethod
    def numerizer(x, self):
        """Map every token in `x` to its vocabulary index.

        Tokens that are not in the vocabulary (or cannot be used as a
        dictionary key) fall back to the padding index.
        """
        word_idx = self.vocab.word_idx
        pad_index = word_idx.get(data_processor.PAD_TOKEN, data_processor.DEFAULT_PAD_INDEX)
        base = []
        for token in x:
            try:
                base.append(word_idx[token])
            except (KeyError, TypeError):
                # Only "unknown token" conditions are absorbed; anything else
                # is a real error and should surface.
                base.append(pad_index)
        return base

In [299]:
class Data_Loader:
    """Split the numerized frame into train/validate/test arrays and batch them.

    Each converted partition is a 2-D array (NumPy on CPU, CUDA tensor on GPU)
    whose rows are the token indices of one tweet followed by its 'Toxicity'
    label in the last column.
    """

    def __init__(self, df, train_size, val_size, test_size, gpu):
        """Create the three partitions.

        Args:
            df:         frame with 'Toxicity' and 'numerized_tweet' columns.
            train_size: accepted for interface compatibility; the training
                        partition is whatever remains after the other two.
            val_size:   size of the validation partition; a float is a
                        fraction of the whole dataset, an int a row count.
            test_size:  size of the test partition (same convention).
            gpu:        truthy to produce CUDA tensors, falsy for NumPy arrays.
        """
        dataframe = df[['Toxicity', "numerized_tweet"]]
        self.train_valid, self.test = train_test_split(dataframe, test_size=test_size)
        if isinstance(val_size, float):
            # The second split runs on the remainder only, so rescale the
            # fraction to keep it relative to the whole dataset.
            val_size = val_size * len(dataframe) / len(self.train_valid)
        # Split the remainder, not the full frame; otherwise test rows would
        # leak into the train and validation partitions.
        self.train, self.validate = train_test_split(self.train_valid, test_size=val_size)
        self.test = self.convert(self.test, gpu)
        self.validate = self.convert(self.validate, gpu)
        self.train = self.convert(self.train, gpu)

    def convert(self, data_partition, gpu_cpu):
        """Turn a frame partition into one 2-D array of indices plus label.

        Returns a CUDA tensor when `gpu_cpu` is truthy, else a NumPy array.
        Assumes every 'numerized_tweet' list has the same length (np.stack
        raises otherwise).
        """
        text = np.stack(data_partition['numerized_tweet'].tolist())
        # Go through NumPy so the frame's index labels play no role.
        tag = np.expand_dims(data_partition['Toxicity'].to_numpy(), 1)
        if gpu_cpu:
            return torch.cat((torch.tensor(text), torch.tensor(tag)), 1).cuda()
        return np.concatenate((text, tag), 1)

    def shuffle(self, gpu_cpu):
        """Shuffle the rows of `self.train` (in place for the NumPy case)."""
        if gpu_cpu:
            idx = torch.randperm(self.train.shape[0], device=self.train.device)
            self.train = self.train[idx]
        else:
            np.random.shuffle(self.train)

    def create_batches(self, data, batch_size):
        """Return `(start, end)` index pairs that cover `data` in order.

        Every pair spans `batch_size` rows except possibly the last, which
        ends at `len(data)`. An empty `data` yields an empty list.

        Raises:
            ValueError: if `batch_size` is not positive.
        """
        if batch_size <= 0:
            raise ValueError("batch_size must be a positive integer")
        length = len(data)
        return [(start, min(start + batch_size, length))
                for start in range(0, length, batch_size)]
        

        

In [300]:
class NLP_LSTM(nn.Module):
    """Embedding -> 2-layer LSTM -> dense head producing one probability per input.

    The output is a `(batch, 1)` tensor of values in [0, 1], which is the
    shape and range `torch.nn.BCELoss` expects for a binary label.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size):
        """
        Args:
            embedding_dim: width of the token embeddings.
            hidden_dim:    LSTM hidden size (also the width of the dense layer).
            vocab_size:    number of embedding rows; must be larger than the
                           highest token index fed to the model, including
                           whatever index is used for padding.
        """
        # nn.Module must be initialised before sub-modules can be assigned.
        super().__init__()
        self.embedder = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, num_layers=2, batch_first=True)
        self.fc = nn.Linear(hidden_dim, hidden_dim)
        self.out = nn.Linear(hidden_dim, 1)

    def forward(self, inputs):
        """Return the predicted probability for each sequence in `inputs`.

        Args:
            inputs: integer tensor of token indices, shape `(batch, seq_len)`.

        Returns:
            Float tensor of shape `(batch, 1)` with values in [0, 1].
        """
        embeddings = self.embedder(inputs)
        # No initial state is passed, so the LSTM starts from zeros.
        outputs, _ = self.lstm(embeddings)
        # batch_first=True -> outputs is (batch, seq_len, hidden); summarise
        # each sequence by the top layer's output at the final time step.
        last_step = outputs[:, -1, :]
        hidden = torch.relu(self.fc(last_step))
        # No ReLU before the sigmoid: it would pin every prediction to >= 0.5.
        return torch.sigmoid(self.out(hidden))
        


In [301]:
class Trainer:
    """Load the dataset, build the vocabulary and model, and run the epoch loop."""

    def __init__(self, filepath, batch_size, learning_rate, epochs=10,
                 embedding_dim=128, hidden_dim=128):
        """
        Args:
            filepath:      path of the CSV file handed to `Dataset`.
            batch_size:    stored as `self.batch_size`.
            learning_rate: learning rate given to the Adam optimizer.
            epochs:        number of epochs `main` iterates over.
            embedding_dim: embedding width passed to `NLP_LSTM`.
            hidden_dim:    hidden size passed to `NLP_LSTM`.
        """
        self.last_epoch = epochs
        self.gpu_avail = torch.cuda.is_available()
        self.loaded_Data = Dataset(filepath)
        self.vocabulary = vocab_builder(self.loaded_Data.tokenized_df)
        self.processed_data = data_processor(self.loaded_Data.tokenized_df, self.vocabulary)

        ## model
        # Size the embedding table from the largest index in the vocabulary so
        # that every index, including the padding index, is a valid row.
        vocab_size = max(self.vocabulary.word_idx.values()) + 1
        self.model = NLP_LSTM(embedding_dim, hidden_dim, vocab_size)
        self.lr = learning_rate
        self.batch_size = batch_size
        self.loss = torch.nn.BCELoss()
        # Use the caller's learning rate rather than a hard-coded constant.
        self.__optimizer = torch.optim.Adam(self.model.parameters(), lr=self.lr)
        self.lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(self.__optimizer, gamma=0.9)

    def main(self):
        """Iterate over the configured number of epochs.

        TODO: the loop currently only records a start time and prints the
        epoch number; no batches are drawn and no optimisation step is taken.
        """
        print("Training Commencing")
        for epoch in range(0, self.last_epoch):
            # The file imports the `datetime` module, so the class must be
            # reached through it.
            start_time = datetime.datetime.now()
            print(F'epoch: {epoch+1}')
            




In [304]:
# Trainer takes (filepath, batch_size, learning_rate, epochs); the epoch count
# was missing from this call. 10 is a placeholder -- adjust as needed.
# NOTE(review): the absolute local path ties the notebook to one machine.
h = Trainer("C:/MyJoseph/Projects/NLP_text_ML/data/FinalBalancedDataset.csv", 64, 0.005, 10)