In [None]:
import pandas as pd
import re

In [None]:
# Load the Sentiment140 CSV. Column names are passed explicitly, so pandas
# treats the first line of the file as data rather than as a header.
df = pd.read_csv(
    "/kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv",
    encoding='latin-1',
    names=['labels', 'id', 'date', 'query', 'username', 'tweet'],
)

In [None]:
# Draw a fixed-seed subsample of 100,000 distinct rows and renumber them 0..n-1.
sampled_df = df.sample(
    n=100000,
    replace=False,
    random_state=1,
    ignore_index=True,
)

In [None]:
# Keep only the label and the tweet text.
sampled_df = sampled_df.drop(columns=['id', 'date', 'query', 'username'])
# Remap the raw label values 0 / 4 to the class ids 0 / 1 (any other value becomes NaN).
sampled_df['labels'] = sampled_df['labels'].map({0: 0, 4: 1})
# Strip @mentions: an '@' followed by one or more non-whitespace characters.
# The pattern is a raw string because '\s' inside a plain string literal is an
# invalid escape sequence, which newer Python versions warn about.
sampled_df['tweet'] = sampled_df['tweet'].str.replace(r'@[^\s]+', '', regex=True)
sampled_df.head()

In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModelForSequenceClassification, BertTokenizer
from torch.optim import AdamW
from transformers import get_scheduler
from tqdm.auto import tqdm
from datasets import load_metric

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 train/validation split. The fixed random_state makes the split
# reproducible across kernel restarts, in line with the seeded row sampling
# used to build sampled_df.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    list(sampled_df.tweet.values),
    sampled_df.labels.values,
    test_size=.2,
    random_state=1,
)

In [None]:
# Tokenizer loaded from the 'bert-base-cased' checkpoint; reused for the
# training, validation and test texts.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [None]:
# Tokenize each split in a single call, with truncation and padding enabled.
train_encodings = tokenizer(
    train_texts,
    truncation=True,
    padding=True,
)
val_encodings = tokenizer(
    val_texts,
    truncation=True,
    padding=True,
)

In [None]:
class SentimentDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: Mapping from field name to an indexable collection that
            holds one entry per example.
        labels: Indexable collection with one label per example.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of examples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with its label under ``'labels'``."""
        example = {}
        for field, values in self.encodings.items():
            example[field] = torch.tensor(values[idx])
        example['labels'] = torch.tensor(self.labels[idx])
        return example

In [None]:
# Batch both splits in groups of 8; only the training loader shuffles.
train_dataloader = DataLoader(
    SentimentDataset(train_encodings, train_labels),
    shuffle=True,
    batch_size=8,
)
eval_dataloader = DataLoader(
    SentimentDataset(val_encodings, val_labels),
    batch_size=8,
)

In [None]:
# Drop a model left over from a previous run so its GPU memory can be
# reclaimed before a new one is created. On a fresh kernel `model` does not
# exist yet, so the deletion is guarded to keep Restart & Run All working.
if "model" in globals():
    del model
torch.cuda.empty_cache()

In [None]:
# Sequence-classification model loaded from the "bert-base-cased" checkpoint,
# configured for two labels.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-cased",
    num_labels=2,
)

In [None]:
# AdamW over all of the model's parameters, with a learning rate of 5e-5.
optimizer = AdamW(model.parameters(), lr=5e-5)

In [None]:
num_epochs = 3
# Total number of batches processed over all epochs.
num_training_steps = len(train_dataloader) * num_epochs

# "linear" learning-rate schedule with no warmup steps, spanning the whole run.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

In [None]:
def evaluate():
    """Compute accuracy of the global ``model`` over ``eval_dataloader``.

    The model is switched to eval mode for the duration of the pass and then
    returned to whatever mode it was in before the call. Without that
    restoration, a training loop that calls this function between epochs
    would run every later epoch in eval mode.

    Returns:
        The result of ``metric.compute()`` for the "accuracy" metric.
    """
    metric = load_metric("accuracy")
    was_training = model.training
    model.eval()
    try:
        # No gradients are needed for scoring.
        with torch.no_grad():
            for batch in eval_dataloader:
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**batch)
                predictions = torch.argmax(outputs.logits, dim=-1)
                metric.add_batch(predictions=predictions, references=batch["labels"])
    finally:
        # Restore the caller's mode even if the evaluation pass fails.
        model.train(was_training)

    return metric.compute()

In [None]:
from tqdm.auto import tqdm

# One progress-bar tick per optimizer step across all epochs.
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_epochs):
    # evaluate() switches the model to eval mode at the end of every epoch, so
    # training mode must be re-enabled at the start of each epoch rather than
    # once before the loop.
    model.train()
    for i, batch in enumerate(train_dataloader):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        # Report the current batch loss every 1000 iterations.
        if i % 1000 == 0:
            print(f"iteration: {i} | loss: {loss.item()}")
        progress_bar.update(1)
    # Validation accuracy after each epoch.
    print(evaluate())


In [None]:
# Hard-coded loss values (29 entries) used for the plot in the next cells.
# NOTE(review): presumably copied by hand from the loss that the training loop
# prints every 1000 iterations — confirm. This assignment also rebinds the name
# `loss`, which the training loop uses for the per-batch loss tensor.
loss = [0.7825751304626465,0.34935635328292847,0.4183889925479889,0.5266145467758179,0.3440805971622467,0.6664698123931885,
0.9303305745124817,0.5289250612258911,0.33006468415260315,0.5056890249252319,0.4399178624153137,0.1170702800154686,
0.09948326647281647,0.10625694692134857,0.2394929975271225,0.08324136584997177,0.3338935077190399,0.2283564805984497,
0.6068935990333557,0.13533364236354828,0.23950281739234924,0.057980410754680634,0.3307877779006958,0.01022527925670147,
0.006380629725754261,0.01793970912694931,0.013707959093153477,0.14926961064338684,0.12756234407424927]

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Plot the recorded loss values against their position in the list.
fig, ax = plt.subplots()
ax.plot(loss)
ax.set_ylabel('loss')
ax.set_xlabel('iteration')

## Save Model

In [None]:
import os

# exist_ok=True keeps this cell re-runnable: without it, a second run raises
# FileExistsError because the directory already exists.
os.makedirs("sentiment_bert_2", exist_ok=True)

In [None]:
# Save the trained model into the "sentiment_bert_2" directory.
model.save_pretrained("sentiment_bert_2")

## Load Saved Model

In [None]:
# The training/validation rows were drawn from this same `df`, so sampling the
# test set from all of `df` would let some of those rows into it. Re-derive the
# training sample's original row labels (same n / random_state as the call
# that builds sampled_df — keep the two in sync) and exclude them first.
train_val_index = df.sample(n=100000, replace=False, random_state=1).index
test_df = (
    df.drop(index=train_val_index)
    .sample(10000, random_state=46)
    .drop(columns=['id', 'date', 'query', 'username'])
)
# Remap the raw label values 0 / 4 to the class ids 0 / 1.
test_df['labels'] = test_df['labels'].map({0: 0, 4: 1})
# Experiment toggle: uncomment to strip @mentions from the test tweets as well
# (the "with data cleaning" variant).
# test_df['tweet'] = test_df['tweet'].str.replace(r'@[^\s]+', '', regex=True)
test_df.head()

In [None]:
# Test labels as a list, in the same row order as test_df.
test_labels = list(test_df["labels"].values)

In [None]:
# Tokenize the test tweets with the same tokenizer settings as the other splits.
test_encodings = tokenizer(
    list(test_df["tweet"].values),
    truncation=True,
    padding=True,
)

In [None]:
ls ../input/sentiment-bert

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the saved two-label classifier from the input directory and move it
# onto the selected device.
test_model = AutoModelForSequenceClassification.from_pretrained(
    "../input/sentiment-bert",
    num_labels=2,
)
test_model.to(device)

In [None]:
# Batch the test set in groups of 8, keeping the original row order.
test_dataloader = DataLoader(
    SentimentDataset(test_encodings, test_labels),
    shuffle=False,
    batch_size=8,
)

Accuracy with data cleaning: `{'accuracy': 0.8381}` <br>
Accuracy without data cleaning: `{'accuracy': 0.8331}`

In [None]:
# Score the loaded model on the test batches and report accuracy.
metric = load_metric("accuracy")
test_model.eval()
# Gradients are not needed for scoring, so the whole pass runs under no_grad.
with torch.no_grad():
    for batch in test_dataloader:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        outputs = test_model(**batch)
        logits = outputs.logits
        # Predicted class = index of the largest logit.
        predictions = logits.argmax(dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

In [None]:
# Per-label count of non-null values in each remaining column of the test set.
test_df.groupby('labels').count()