In [None]:
import pandas as pd 
from collections import Counter
import ipywidgets as widgets
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
sns.set_theme()
warnings.filterwarnings("ignore")
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler,Dataset
from transformers import AdamW, get_linear_schedule_with_warmup
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
from sklearn.model_selection import train_test_split

In [None]:
# Stemmer used by clean_text when it is called with stem=True.
stemmer = PorterStemmer()

# Fetch the NLTK stopword corpus, then keep the English word list.
nltk.download("stopwords")
STOPWORDS = stopwords.words("english")

In [None]:
def clean_text(text, lower=True, stem=False, stopwords=None):
    """Normalize a raw tweet into space-separated alphanumeric tokens.

    Steps, in order:
      1. Drop URLs, ``pic.twitter`` links, @mentions and the HTML entities
         ``&amp;`` / ``&gt;`` / ``&lt;`` while the punctuation that identifies
         them is still present.
      2. Remove stopwords (case-sensitive, whole words only).
      3. Replace every run of non-alphanumeric characters with one space.
      4. Shorten a few known elongated spellings, expand the word ``Comin``
         and drop the standalone retweet marker ``RT``.
      5. Optionally stem each token.

    Args:
        text: Input string.
        lower: Only forwarded to the stemmer as ``to_lowercase``; the text
            itself is never lowercased by this function.
        stem: When True, stem every token with the module-level ``stemmer``.
        stopwords: Words to remove. ``None`` (the default) uses the
            module-level ``STOPWORDS``; pass an empty list to keep all words.

    Returns:
        The cleaned string, single-spaced, with no leading/trailing whitespace.
    """
    if stopwords is None:
        stopwords = STOPWORDS
    stopwords = list(stopwords)

    # These must run before punctuation is stripped: once ':', '/', '.', '@'
    # and '&' are gone the patterns can no longer match and fragments such as
    # "http t co abc" or "amp" would stay in the text.
    text = re.sub(r"http\S+", " ", text)
    text = re.sub(r"pic\.twitter\S+", " ", text)
    text = re.sub(r"@\S+", " ", text)
    text = re.sub(r"&(?:amp|gt|lt);", " ", text)

    # Remove stopwords; each word is escaped so it is matched literally.
    if stopwords:
        alternatives = "|".join(re.escape(word) for word in stopwords)
        text = re.sub(r"\b(?:" + alternatives + r")\b\s*", "", text)

    # Keep only alphanumeric tokens. This also removes '#', '|' and repeated
    # '?', '.', '!' so no separate rules are needed for them.
    text = re.sub(r"[^A-Za-z0-9]+", " ", text)

    # Known elongated spellings -> canonical form (literal replacements).
    for stretched, canonical in (
        ("goooooooaaaaaal", "goal"),
        ("SOOOO", "SO"),
        ("LOOOOOOL", "LOL"),
        ("Cooool", "cool"),
        ("baaaack", "back"),
    ):
        text = text.replace(stretched, canonical)

    # Word boundaries keep "Coming" from becoming "Comingg" and keep the
    # letters "RT" inside words such as "ALERT".
    text = re.sub(r"\bComin\b", "Coming", text)
    text = re.sub(r"\bRT\b", "", text)

    # Final whitespace normalization (removals above can leave gaps).
    text = re.sub(r"\s+", " ", text).strip()

    # Stemming
    if stem:
        text = " ".join(
            stemmer.stem(word, to_lowercase=lower) for word in text.split(" ")
        )

    return text

In [None]:
# Load the training CSV into a DataFrame.
# NOTE(review): `data_path` is not assigned in any cell visible here — confirm it is defined before this cell runs.
train = pd.read_csv(data_path)

In [None]:
# Keep an untouched copy, clean the text column, then show the first tweet
# before and after cleaning.
original_df = train.copy()
train["text"] = train["text"].apply(clean_text, lower=True, stem=False)
print(original_df["text"].values[0], train["text"].values[0], sep="\n")

In [None]:
# Hold out 20% of the rows, stratified on the label column, with a fixed seed.
target_variable = 'target'
train_df, test_df = train_test_split(
    train,
    test_size=0.2,
    random_state=42,
    stratify=train[target_variable],
)


In [None]:
def append_keyword(df):
    """Prefix each row's ``text`` with its ``keyword`` when one is present.

    Rows whose ``keyword`` is missing (NaN/None) keep their text unchanged.
    The ``text`` column is rebuilt positionally, so frames with a non-unique
    index are handled row by row (label-based ``df.at`` writes would hit every
    row sharing that label).

    Args:
        df: DataFrame with ``keyword`` and ``text`` columns. It is modified
            in place.

    Returns:
        The same DataFrame object, for call chaining.
    """
    df['text'] = [
        text if pd.isna(keyword) else keyword + ' ' + text
        for keyword, text in zip(df['keyword'], df['text'])
    ]
    return df

In [None]:
# Prefix the keyword (where present) onto the text of both splits.
train_df, test_df = append_keyword(train_df), append_keyword(test_df)

In [None]:
# Define a custom dataset class
class TextClassificationDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for classification.

    Args:
        texts: Indexable sequence of texts; each item is converted with ``str``.
        targets: Indexable sequence of integer labels, parallel to ``texts``.
        tokenizer: Object exposing ``encode_plus`` (e.g. a ``BertTokenizer``).
        max_length: Fixed token length every item is padded/truncated to.
    """

    def __init__(self, texts, targets, tokenizer, max_length=128):
        self.texts = texts
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, item):
        """Return the encoded text and its label for position ``item``.

        Returns:
            dict with ``text`` (str), ``input_ids`` and ``attention_mask``
            (flattened tensors from the tokenizer) and ``targets`` (a
            ``torch.long`` scalar tensor).
        """
        text = str(self.texts[item])
        target = self.targets[item]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            # Without explicit truncation, padding="max_length" only pads:
            # a longer text would yield a longer tensor and break batching.
            truncation=True,
            return_token_type_ids=False,
            padding="max_length",
            return_attention_mask=True,
            return_tensors="pt",
        )

        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.long)
        }


# Seed torch so batch shuffling and the new classifier head are repeatable
# (same seed as the train/test split).
torch.manual_seed(42)

# Split the data into input and target columns
train_texts = train_df["text"].tolist()
train_labels = train_df["target"].tolist()

# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

# Initialize the dataset and data loader
train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Initialize the BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-cased', num_labels=2)

# Define the optimizer and learning rate
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# Train the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()
epochs=3
for epoch in range(epochs):
    running_loss = 0.0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        targets = batch['targets'].to(device)

        # Passing labels makes the model compute the classification loss itself.
        outputs = model(input_ids, attention_mask=attention_mask, labels=targets)
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        running_loss += loss.item()

    # One progress line per epoch; max() guards against an empty loader.
    mean_loss = running_loss / max(len(train_loader), 1)
    print(f"epoch {epoch + 1}/{epochs} - mean training loss: {mean_loss:.4f}")

# Leave training mode so later inference runs with dropout disabled.
model.eval()

# Save the model
model.save_pretrained("my_model")

In [None]:
def predict_label(text):
    """Predict the class index for a single text with the fine-tuned model.

    Uses the module-level ``tokenizer``, ``model`` and ``device``. The text is
    encoded to a fixed length of 128 tokens (padded or truncated), and the
    model is run in evaluation mode without gradient tracking.

    Args:
        text: Text to classify; converted with ``str`` before tokenizing.

    Returns:
        int: index of the highest-scoring class.
    """
    encoding = tokenizer.encode_plus(
        str(text),
        add_special_tokens=True,
        max_length=128,
        padding='max_length',
        # Explicit truncation keeps over-long inputs at max_length.
        truncation=True,
        return_attention_mask=True,
        return_token_type_ids=False,
        return_tensors='pt',
    )
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    # Dropout must be off for deterministic predictions; restore the previous
    # mode afterwards so calling this mid-training does not disturb it.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
    finally:
        model.train(was_training)

    # outputs[0] holds the logits when no labels are passed.
    predictions = torch.argmax(outputs[0], dim=1)
    return predictions.item()

In [None]:
test['target'] = test['text'].apply(predict_label)
result_df = test[['id', 'target']]
result_df.to_csv('output.csv', index=False)

In [None]:
test = pd.read_csv(test_path)
original_df = test.copy()
test.text =test.text.apply(clean_text, lower=True, stem=False)
print (f"{original_df.text.values[0]}\n{test.text.values[0]}")

In [None]:
def calculate_accuracy(df):
    """Return the fraction of rows whose predicted label equals ``target``.

    Each row's ``text`` is classified with ``predict_label`` and compared with
    the row's ``target`` value.

    Args:
        df: DataFrame with ``text`` and ``target`` columns.

    Returns:
        float: accuracy in [0, 1], or NaN when ``df`` has no rows (accuracy is
        undefined there; previously this raised ZeroDivisionError).
    """
    total = len(df)
    if total == 0:
        return float("nan")

    correct = sum(
        int(predict_label(text) == actual_label)
        for text, actual_label in zip(df['text'], df['target'])
    )
    return correct / total