In [None]:
import os
import csv
import numpy as np
import pandas as pd
import torch
from google.colab import drive
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from torch.cuda.amp import autocast, GradScaler

# Mount Google Drive at /content/drive so the data files under MyDrive can be read
# and the submission file can be written back (uses the google.colab `drive` import above).
drive.mount('/content/drive')

def load_data(folder_path):
    """
    This function loads the labelled training tweets and the unlabelled test tweets from 'folder_path'.
    The folder must contain "train_neg.txt", "train_pos.txt" and "test_data.txt"; every line of a
    file becomes one row of the corresponding dataframe.

    Args:
        folder_path (str): directory that contains the three text files

    Returns:
        tuple: (combined_df, test_df). combined_df has the columns 'Tweets' and 'Sentiment', with the
        lines of train_neg.txt first (Sentiment -1) followed by the lines of train_pos.txt (Sentiment 1).
        test_df has the single column 'Tweets' with the lines of test_data.txt.
    """
    def read_lines(file_name):
        # UTF-8 is requested explicitly so the result does not depend on the platform's default
        # encoding. Only the line terminator is removed, so the number and order of rows
        # (and therefore the submission ids) are unchanged.
        with open(os.path.join(folder_path, file_name), 'r', encoding='utf-8') as file:
            return [line.rstrip('\n') for line in file]

    lines_neg = read_lines("train_neg.txt")
    lines_pos = read_lines("train_pos.txt")
    lines_test = read_lines("test_data.txt")

    small_neg_df = pd.DataFrame({'Tweets': lines_neg, 'Sentiment': -1})
    small_pos_df = pd.DataFrame({'Tweets': lines_pos, 'Sentiment': 1})
    test_df = pd.DataFrame({'Tweets': lines_test})
    combined_df = pd.concat([small_neg_df, small_pos_df], ignore_index=True)

    return combined_df, test_df

def create_csv_submission(ids, y_pred, name):
    """
    Write a submission file with the header "Id,Prediction" followed by one row per (id, prediction) pair.
    Ids and predictions are paired positionally and both are written as integers.

    Args:
        ids (list,np.array): indices of the samples
        y_pred (list,np.array): predicted labels, each of which must be -1 or 1
        name (str): path of the csv file to create (an existing file is overwritten)

    Raises:
        ValueError: if y_pred contains anything other than -1 and 1 (checked before the file is opened)
    """
    # Validate first so an invalid prediction never leaves a file behind
    if any(label not in (-1, 1) for label in y_pred):
        raise ValueError("y_pred can only contain values -1, 1")

    header = ["Id", "Prediction"]
    with open(name, "w", newline="") as out_file:
        submission = csv.DictWriter(out_file, fieldnames=header, delimiter=",")
        submission.writeheader()
        submission.writerows(
            {"Id": int(sample_id), "Prediction": int(label)}
            for sample_id, label in zip(ids, y_pred)
        )

def tokenize_tweets(tokenizer, tweet):
    """
    This function encodes a single tweet into fixed-length model inputs.

    Args:
        tokenizer: object exposing 'encode_plus' (main() passes a BertTokenizer)
        tweet (str): text to encode

    Returns:
        tuple: (input_ids, attention_mask), the tensors returned by the tokenizer with the
        leading batch dimension removed.
    """
    tokens = tokenizer.encode_plus(
        tweet,
        add_special_tokens=True,
        max_length=128,
        padding='max_length',
        # Without truncation, max_length only pads: a longer tweet would produce a longer
        # tensor, and samples of different lengths cannot be stacked into one batch.
        truncation=True,
        return_tensors='pt',
    )
    # squeeze(0) drops only the batch dimension added by return_tensors='pt'
    return tokens['input_ids'].squeeze(0), tokens['attention_mask'].squeeze(0)

class TweetsDataset(Dataset):
    """
    Dataset over parallel sequences of tweets and sentiments that tokenizes on access.

    Each item is a dict with the keys 'input_ids', 'attention_mask' (both produced by
    tokenize_tweets) and 'sentiment', where the stored label 1 is kept as 1 and every
    other label becomes 0.
    """

    def __init__(self, tweets, sentiments, tokenizer):
        # Nothing is tokenized up front; the inputs are only stored
        self.tokenizer = tokenizer
        self.tweets = tweets
        self.sentiments = sentiments

    def __len__(self):
        # The tweets sequence defines the dataset size
        return len(self.tweets)

    def __getitem__(self, idx):
        text = self.tweets[idx]
        raw_label = self.sentiments[idx]
        input_ids, attention_mask = tokenize_tweets(self.tokenizer, text)

        # Re-encode the label as a class index: 1 stays 1, anything else is class 0
        if raw_label == 1:
            class_index = 1
        else:
            class_index = 0

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'sentiment': class_index,
        }

def train_model(model, train_loader, optimizer, criterion, device):
    """
    Run one pass over 'train_loader', taking one optimizer step per batch.

    Args:
        model: called as model(input_ids, attention_mask=..., labels=...); its output must expose '.loss'
        train_loader: iterable of dicts with the tensor entries 'input_ids', 'attention_mask' and 'sentiment'
        optimizer: optimizer that updates the model parameters
        criterion: not used; the loss is taken from the model output
        device: device the batch tensors are moved to
    """
    model.train()
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['sentiment'].to(device)

        # Clear the gradients left over from the previous batch before the new backward pass
        optimizer.zero_grad()
        batch_loss = model(input_ids, attention_mask=attention_mask, labels=labels).loss
        batch_loss.backward()
        optimizer.step()

def evaluate_model(model, test_loader, device):
    """
    Collect the true labels and the predicted class indices for every sample in 'test_loader'.
    The model is put in eval mode and no gradients are tracked.

    Args:
        model: called as model(input_ids, attention_mask=...); its output must expose '.logits'
        test_loader: iterable of dicts with the tensor entries 'input_ids', 'attention_mask' and 'sentiment'
        device: device the batch tensors are moved to

    Returns:
        tuple: (true_labels, predictions), two lists in loader order; a prediction is the
        argmax of the logits along dimension 1.
    """
    model.eval()
    true_labels, predictions = [], []

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['sentiment'].to(device)

            batch_logits = model(input_ids, attention_mask=attention_mask).logits
            predicted = torch.argmax(batch_logits, dim=1)

            # Results are moved back to the CPU before being accumulated in the Python lists
            predictions.extend(predicted.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    return true_labels, predictions

def predict_sentiment(model, tokenizer, tweet, device):
    """
    This function predicts the class index of a single tweet.

    Args:
        model: called as model(input_ids, attention_mask=...); its output must expose '.logits'
        tokenizer: object exposing 'encode_plus' (main() passes a BertTokenizer)
        tweet (str): text to classify
        device: device the encoded inputs are moved to

    Returns:
        int: index of the largest logit (main() maps 0 to -1 and 1 to 1 for the submission)
    """
    model.eval()
    tokens = tokenizer.encode_plus(
        tweet,
        add_special_tokens=True,
        max_length=128,
        padding='max_length',
        # Without truncation, max_length only pads: an over-long tweet would be fed to the
        # model at its full length instead of the 128 tokens used by tokenize_tweets.
        truncation=True,
        return_tensors='pt',
    )
    input_ids, attention_mask = tokens['input_ids'].to(device), tokens['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

    predicted_class = torch.argmax(logits, dim=1).item()
    return predicted_class

def main():
    """
    Fine-tune BERT on the labelled tweets, report accuracy on a held-out split and write
    "submission.csv" with predictions for the tweets of test_data.txt.

    Reads the module-level name 'device', which must be set before main() is called
    (the __main__ guard does this).
    """
    # File path in Google Drive
    folder_path = '/content/drive/MyDrive/Colab Notebooks'

    # Load pre-trained BERT model and tokenizer
    model_name = 'bert-base-uncased'
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertForSequenceClassification.from_pretrained(model_name).to(device)

    # Labelled tweets (combined_df) and unlabelled competition tweets (test_df)
    combined_df, test_df = load_data(folder_path)

    # Hold out 20% of the labelled data for validation. The split gets its own name so that
    # test_df keeps pointing at the unlabelled tweets used for the submission.
    train_df, val_df = train_test_split(combined_df, test_size=0.2, random_state=42)

    # Create DataLoader for training set
    train_dataset = TweetsDataset(train_df['Tweets'].values, train_df['Sentiment'].values, tokenizer)
    train_loader = DataLoader(train_dataset, batch_size=20, shuffle=True)

    # Create DataLoader for validation set
    val_dataset = TweetsDataset(val_df['Tweets'].values, val_df['Sentiment'].values, tokenizer)
    val_loader = DataLoader(val_dataset, batch_size=20, shuffle=False)

    # Training loop with mixed precision and gradient accumulation
    scaler = GradScaler()

    epochs = 5
    accumulation_steps = 4  # Adjust this based on GPU memory

    # The optimizer steps after every full accumulation group and once more for a trailing
    # partial group, i.e. ceil(len(train_loader) / accumulation_steps) times per epoch.
    # T_max must match that count, otherwise the cosine schedule runs past its minimum
    # and the learning rate starts rising again during the last epoch.
    steps_per_epoch = (len(train_loader) + accumulation_steps - 1) // accumulation_steps
    total_steps = max(1, steps_per_epoch * epochs)

    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, total_steps)

    for epoch in range(epochs):
        model.train()
        print(epoch)
        for i, batch in enumerate(train_loader):
            input_ids, attention_mask, sentiment = batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['sentiment'].to(device)

            with autocast():
                outputs = model(input_ids, attention_mask=attention_mask, labels=sentiment)
                loss = outputs.loss / accumulation_steps  # Scale the loss

            # Gradients accumulate across batches until the next optimizer step
            scaler.scale(loss).backward()

            if (i + 1) % accumulation_steps == 0 or i == len(train_loader) - 1:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
                scheduler.step()

    # Evaluation on the held-out labelled split
    true_labels, predictions = evaluate_model(model, val_loader, device)

    # Calculate accuracy
    accuracy = accuracy_score(true_labels, predictions)
    print(f"Accuracy: {accuracy}")

    # Predict on the unlabelled test tweets and create submission
    # NOTE(review): assumes every line of test_data.txt is one tweet and that ids are
    # 1..N in file order - confirm against the competition format.
    test_df['Predicted_Sentiment'] = test_df['Tweets'].apply(lambda tweet: predict_sentiment(model, tokenizer, tweet, device))
    sentiment_mapping = {0: -1, 1: 1}  # class index -> submission label
    test_df['Predicted_Sentiment'] = test_df['Predicted_Sentiment'].map(sentiment_mapping)
    y_pred = test_df['Predicted_Sentiment'].values

    ids = np.arange(1, len(y_pred) + 1)
    submission_path = os.path.join(folder_path, "submission.csv")
    create_csv_submission(ids, y_pred, submission_path)

if __name__ == "__main__":
    # `device` is assigned at module level because main() reads it as a global instead of
    # taking it as a parameter; CUDA is used when available, otherwise the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    main()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0
1
2
3
4
Accuracy: 0.881325


In [None]:
# NOTE: this cell expects `test_df`, `model` and `tokenizer` to exist at notebook (global) scope.
# In the training cell they are created as local variables of main(), so they are not visible
# here and the cell fails with the NameError shown in its output.
test_df['Predicted_Sentiment'] = test_df['Tweets'].apply(lambda tweet: predict_sentiment(model, tokenizer, tweet, device))
# Convert the predicted class index (0 / 1) into the -1 / 1 labels used in the submission
sentiment_mapping = {0: -1, 1: 1}
test_df['Predicted_Sentiment'] = test_df['Predicted_Sentiment'].map(sentiment_mapping)
y_pred = test_df['Predicted_Sentiment'].values

NameError: ignored

In [None]:
# File path in Google Drive
folder_path = '/content/drive/MyDrive/Colab Notebooks'
test_path = os.path.join(folder_path, "test_data.txt")

# Build the test dataframe with one row per line of test_data.txt.
# UTF-8 is requested explicitly so the result does not depend on the platform's default
# encoding; only the line terminator is removed, so the row count and order are unchanged.
with open(test_path, 'r', encoding='utf-8') as file:
    lines_test = [line.rstrip('\n') for line in file]

test_df = pd.DataFrame({'Tweets': lines_test})

# Create a function to preprocess tweets and get predictions
def predict_sentiment(tweet):
    """
    This function predicts the class index of a single tweet.
    It uses the notebook-level names 'model', 'tokenizer' and 'device', which must be
    defined before the function is called.

    Args:
        tweet (str): text to classify

    Returns:
        int: index of the largest logit
    """
    model.eval()
    tokens = tokenizer.encode_plus(
        tweet,
        add_special_tokens=True,
        max_length=128,
        padding='max_length',
        # Without truncation, max_length only pads: an over-long tweet would be fed to the
        # model at its full length instead of being cut to 128 tokens.
        truncation=True,
        return_tensors='pt',
    )
    input_ids, attention_mask = tokens['input_ids'].to(device), tokens['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

    predicted_class = torch.argmax(logits, dim=1).item()
    return predicted_class

# Apply the function to the 'Tweets' column in test_df; the result is the predicted class index (0 or 1)
test_df['Predicted_Sentiment'] = test_df['Tweets'].apply(predict_sentiment)

# Class index -> submission label
sentiment_mapping = {0: -1, 1: 1}

# Map predicted sentiments in place (the same column is overwritten)
test_df['Predicted_Sentiment'] = test_df['Predicted_Sentiment'].map(sentiment_mapping)

# The 'Predicted_Sentiment' column now contains the mapped sentiments (-1 / 1)
y_pred = test_df['Predicted_Sentiment'].values

def create_csv_submission(ids, y_pred, name):
    """
    Create the csv file 'name' in the two-column submission format: a header row
    "Id,Prediction" followed by one row per (id, prediction) pair, both written as integers.

    Args:
        ids (list,np.array): indices of the samples
        y_pred (list,np.array): predictions matching 'ids' position by position; only -1 and 1 are allowed
        name (str): path of the file to be created (an existing file is overwritten)

    Raises:
        ValueError: if y_pred holds a value other than -1 or 1; nothing is written in that case
    """
    allowed_labels = (-1, 1)
    # Reject invalid predictions before the output file is opened
    for label in y_pred:
        if label not in allowed_labels:
            raise ValueError("y_pred can only contain values -1, 1")

    columns = ["Id", "Prediction"]
    with open(name, "w", newline="") as target:
        out = csv.DictWriter(target, delimiter=",", fieldnames=columns)
        out.writeheader()
        for sample_id, label in zip(ids, y_pred):
            row = {"Id": int(sample_id), "Prediction": int(label)}
            out.writerow(row)

# Reminder from the original author: don't forget to change the name of the model.
# NOTE(review): presumably this refers to the output file name, which is hard-coded to
# "submission.csv" below and is overwritten on every run - confirm what should be renamed.
# Ids are 1..N in the order of the predictions.
ids=np.arange(1,len(y_pred)+1)
submission_path = os.path.join(folder_path, "submission.csv")
create_csv_submission(ids, y_pred, submission_path)

NameError: ignored