In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pytorch_lightning as pl


device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# !pip install pytorch-lightning



In [3]:
class TweetClassifier(nn.ModuleList):

	def __init__(self, batch_size, hidden_dim, lstm_layers, max_words):
		super().__init__()
		
		# Hyperparameters
		self.batch_size = batch_size
		self.hidden_dim = hidden_dim
		self.LSTM_layers = lstm_layers
		self.input_size = max_words
		
		self.dropout = nn.Dropout(0.5)
		self.embedding = nn.Embedding(self.input_size, self.hidden_dim, padding_idx=0)
		self.lstm = nn.LSTM(input_size=self.hidden_dim, hidden_size=self.hidden_dim, num_layers=self.LSTM_layers, batch_first=True)
		self.fc1 = nn.Linear(in_features=self.hidden_dim, out_features=self.hidden_dim*2)
		self.fc2 = nn.Linear(self.hidden_dim*2, 1)
		
	def forward(self, x):
		
		# Hidden and cell state definion
		h = torch.zeros((self.LSTM_layers, x.size(0), self.hidden_dim)).to(device)
		c = torch.zeros((self.LSTM_layers, x.size(0), self.hidden_dim)).to(device)
		
		# Initialization fo hidden and cell states
		torch.nn.init.xavier_normal_(h)
		torch.nn.init.xavier_normal_(c)

		# Each sequence "x" is passed through an embedding layer
		out = self.embedding(x)
		# Feed LSTMs
		out, (hidden, cell) = self.lstm(out, (h,c))
		out = self.dropout(out)
		# The last hidden state is taken
		out = torch.relu_(self.fc1(out[:,-1,:]))
		out = self.dropout(out)
		out = torch.sigmoid(self.fc2(out))

		return out.squeeze(1)

In [4]:
from torch.utils.data import Dataset, DataLoader, random_split
import pandas as pd
from transformers import AutoTokenizer

class CustomDataset(Dataset):
  
	def __init__(self, path, text_col_name="text", label_col_name="target", tokenizer=None):
		"""
		path : path to the file
		text_col_name, label_col_name : name or num of the column with text and label
		"""
		data = pd.read_csv(path)
		self.x, self.y = data[text_col_name].to_numpy(), data[label_col_name].values
		
		if tokenizer == None:
			self.tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
		
		self.x = np.array(self.tokenizer(list(self.x), padding=True).input_ids)
		self.y = torch.FloatTensor(self.y)

	def inplace(self, dataset):
		self.x = dataset[0]
		self.y = dataset[1]

	def vocab_size(self):
		return self.tokenizer.vocab_size
		
	def __len__(self):
		return len(self.x)
		
	def __getitem__(self, idx):
		return self.x[idx], self.y[idx]

In [5]:
class LitClassifactor(pl.LightningModule):
    def __init__(self, model) -> None:
        super().__init__()
        self.model = model
    
    def training_step(self, batch, batch_idx):
        x, y = batch
        prediction = self.model(x)
        loss = F.binary_cross_entropy(prediction, y)
        self.log("train_loss", loss)
        return loss
    
    def test_step(self, batch, batch_idx):
        x, y = batch
        prediction = self.model(x)
        loss = F.binary_cross_entropy(prediction, y)
        self.log("test_log", loss)
        return loss
        
    
    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=4e-3)
        return optimizer


In [6]:
BATCH_SIZE = 256
TRAIN_SIZE = 6000

dataset = CustomDataset('train.csv')
assert(TRAIN_SIZE < len(dataset))
train_dataset, test_dataset = random_split(dataset, [6000, len(dataset) - 6000])
train_dataset, test_dataset = train_dataset.dataset, test_dataset.dataset


train_laoder = DataLoader(train_dataset, batch_size=BATCH_SIZE, pin_memory_device='cuda:0')
test_laoder = DataLoader(test_dataset, batch_size=BATCH_SIZE)

model = TweetClassifier(BATCH_SIZE, 128, 1, train_dataset.vocab_size())
litmodel = LitClassifactor(model)

In [9]:
trainer = pl.Trainer(max_epochs=10, accelerator="gpu")
trainer.fit(litmodel, train_laoder)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type            | Params
------------------------------------------
0 | model | TweetClassifier | 3.9 M 
------------------------------------------
3.9 M     Trainable params
0         Non-trainable params
3.9 M     Total params
15.507    Total estimated model params size (MB)
  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=10` reached.


In [11]:
trainer.test(litmodel, test_laoder)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        test_log            0.6833913326263428
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'test_log': 0.6833913326263428}]