In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/hashes.txt
/kaggle/input/Reviews.csv
/kaggle/input/database.sqlite


In [2]:
!pip install transformers datasets scikit-learn torch



In [3]:
import torch
from transformers import BertModel, BertPreTrainedModel
from torch import nn

class BERTCNN(BertPreTrainedModel):
    """BERT encoder followed by a 1-D convolution / global max-pool classifier head.

    The encoder's per-token hidden states are convolved along the sequence
    axis (128 filters, kernel size 5, same-length padding), reduced to one
    value per filter by max-pooling, passed through dropout and projected to
    ``config.num_labels`` logits.
    """

    def __init__(self, config):
        """Build the encoder and the CNN head from a BERT ``config``."""
        super().__init__(config)
        # Sub-module names and creation order are kept as-is: the names become
        # the state-dict keys of saved checkpoints.
        self.bert = BertModel.from_pretrained("bert-base-uncased", config=config)
        self.conv = nn.Conv1d(
            in_channels=config.hidden_size,
            out_channels=128,
            kernel_size=5,
            padding=2,
        )
        self.pool = nn.AdaptiveMaxPool1d(1)
        self.dropout = nn.Dropout(0.5)
        self.classifier = nn.Linear(128, config.num_labels)
        # NOTE(review): init_weights() runs after the pretrained encoder has
        # been loaded - confirm that the installed transformers version does
        # not re-initialise the encoder weights here.
        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None):
        """Return ``(loss, logits)`` when ``labels`` is given, otherwise just ``logits``."""
        encoder_outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )
        # Swap the last two axes so the hidden dimension sits in the channel
        # position that Conv1d (in_channels=hidden_size) convolves over.
        token_states = encoder_outputs[0].permute(0, 2, 1)
        # NOTE(review): attention_mask is not applied before pooling, so
        # padded positions presumably take part in the max - confirm intended.
        pooled = self.pool(self.conv(token_states)).squeeze(-1)
        logits = self.classifier(self.dropout(pooled))

        if labels is None:
            return logits

        criterion = nn.CrossEntropyLoss()
        loss = criterion(logits.view(-1, self.config.num_labels), labels.view(-1))
        return (loss, logits)

In [4]:
import torch
from torch import nn
from transformers import BertModel, BertPreTrainedModel, BertConfig, AutoTokenizer, TrainingArguments, Trainer

class TransBLSTM(BertPreTrainedModel):
    """BERT encoder with a residual bidirectional-LSTM layer and a linear classifier.

    The encoder's token states go through a single-layer BiLSTM, the LSTM
    output is added back onto the encoder states and layer-normalised, and
    the vector at sequence position 0 is classified into
    ``config.num_labels`` classes.
    """

    def __init__(self, config):
        """Build the encoder, the BiLSTM and the classification head from ``config``."""
        super().__init__(config)
        # Sub-module names and creation order are kept as-is: the names become
        # the state-dict keys of saved checkpoints.
        self.bert = BertModel.from_pretrained("bert-base-uncased", config=config)
        # Each direction gets half the encoder width so that the concatenated
        # forward/backward output can be summed with the encoder states.
        self.blstm = nn.LSTM(
            input_size=config.hidden_size,
            hidden_size=config.hidden_size // 2,
            num_layers=1,
            bidirectional=True,
            batch_first=True,
        )
        self.layer_norm = nn.LayerNorm(config.hidden_size)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.dropout = nn.Dropout(0.5)
        # NOTE(review): init_weights() runs after the pretrained encoder has
        # been loaded - confirm that the installed transformers version does
        # not re-initialise the encoder weights here.
        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,
                position_ids=None, head_mask=None, inputs_embeds=None, labels=None):
        """Return ``(loss, logits)`` when ``labels`` is given, otherwise just ``logits``."""
        encoder_outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )
        token_states = encoder_outputs[0]

        # NOTE(review): the LSTM runs over the full sequence without packing,
        # so padded positions presumably feed the backward direction - confirm.
        lstm_states, _ = self.blstm(token_states)
        # Residual connection around the BiLSTM, then normalisation.
        fused_states = self.layer_norm(token_states + lstm_states)

        # Classify from the first sequence position (presumably the [CLS] slot).
        first_token = self.dropout(fused_states[:, 0])
        logits = self.classifier(first_token)

        if labels is None:
            return logits

        criterion = nn.CrossEntropyLoss()
        loss = criterion(logits.view(-1, self.config.num_labels), labels.view(-1))
        return (loss, logits)


2024-06-19 15:53:30.370947: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-19 15:53:30.371072: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-19 15:53:30.489949: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [5]:
from transformers import AutoModelForSequenceClassification

class RoBERTa:
    """Thin holder around a pretrained sequence-classification checkpoint."""

    def __init__(self, model_type='cardiffnlp/twitter-roberta-base-sentiment', num_labels=3):
        """Load ``model_type`` with ``num_labels`` output classes and keep it on ``self.model``."""
        loaded = AutoModelForSequenceClassification.from_pretrained(
            model_type,
            num_labels=num_labels,
        )
        self.model = loaded

    def get_model(self):
        """Return the model loaded by the constructor."""
        return self.model

In [6]:
!pip install transformers[torch] accelerate -U

  pid, fd = os.forkpty()


Collecting accelerate
  Downloading accelerate-0.31.0-py3-none-any.whl.metadata (19 kB)
Downloading accelerate-0.31.0-py3-none-any.whl (309 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.30.1
    Uninstalling accelerate-0.30.1:
      Successfully uninstalled accelerate-0.30.1
Successfully installed accelerate-0.31.0


In [7]:
import pandas as pd
from transformers import AutoTokenizer, TrainingArguments, Trainer, DataCollatorWithPadding, BertConfig
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from datasets import Dataset
import torch

def load_data(file_path, nrows=70000, test_size=0.4, random_state=42):
    """Read the reviews CSV, derive sentiment labels and split into train/test frames.

    Args:
        file_path: Path of the CSV file; it must contain ``Score`` and ``Text`` columns.
        nrows: Maximum number of rows read from the file (``None`` reads all rows).
        test_size: Forwarded to ``train_test_split``.
        random_state: Seed forwarded to ``train_test_split`` so the split is repeatable.

    Returns:
        The ``(train, test)`` pair from ``train_test_split``; each is a frame
        with the ``Text`` and ``Sentiment`` columns.
    """
    df = pd.read_csv(file_path, on_bad_lines='skip', nrows=nrows)
    # A missing Score fails both comparisons in map_score_to_sentiment and
    # would silently be labelled positive; a missing Text cannot be tokenized.
    df = df.dropna(subset=['Score', 'Text'])
    df['Sentiment'] = df['Score'].apply(map_score_to_sentiment)
    return train_test_split(df[['Text', 'Sentiment']], test_size=test_size, random_state=random_state)

def map_score_to_sentiment(score):
    """Collapse a review score into a sentiment class id.

    Scores below 3 give 0, a score equal to 3 gives 1, and every other
    score gives 2.
    """
    if score < 3:
        return 0
    if score == 3:
        return 1
    return 2

def tokenize_data(tokenizer, texts, labels, max_length=512):
    """Tokenize texts to fixed-length inputs and bundle them with their labels.

    Args:
        tokenizer: Callable tokenizer; it is invoked with a list of texts plus
            ``padding``, ``truncation`` and ``max_length`` keyword arguments.
        texts: Object exposing ``tolist()`` that yields the texts.
        labels: Object exposing ``tolist()`` that yields the labels.
        max_length: Length every example is padded or truncated to. The
            default of 512 keeps the previous behaviour; a smaller value
            shrinks every batch when the texts are short.

    Returns:
        A ``Dataset`` holding the tokenizer's output columns plus ``labels``.
    """
    encoded = tokenizer(texts.tolist(), padding="max_length", truncation=True, max_length=max_length)
    return Dataset.from_dict({**encoded, 'labels': labels.tolist()})


def compute_metrics(p):
    """Return the macro-averaged F1 score for one evaluation pass.

    ``p`` must expose ``predictions`` (per-class scores; the arg-max over
    axis 1 is taken as the predicted class) and ``label_ids`` (true classes).
    The result is a dict with the single key ``"f1"``.
    """
    predicted_classes = np.argmax(p.predictions, axis=1)
    macro_f1 = f1_score(p.label_ids, predicted_classes, average='macro')
    return {"f1": macro_f1}

def train_model(model, train_dataset, test_dataset, tokenizer, output_dir):
    """Fine-tune ``model`` with ``Trainer`` and save the model and tokenizer.

    Args:
        model: Model to train; it is also the object saved at the end.
        train_dataset: Dataset used for training.
        test_dataset: Dataset evaluated every 500 steps during training.
        tokenizer: Tokenizer stored next to the model; it is not passed to the Trainer.
        output_dir: Directory for checkpoints; the final artefacts are written
            to ``<output_dir>/best_model``.
    """
    # Evaluation and checkpointing share the same 500-step cadence, and the
    # "f1" metric from compute_metrics selects the best checkpoint.
    hyperparameters = dict(
        output_dir=output_dir,
        report_to="none",
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=64,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
        save_total_limit=2,
        save_steps=500,
        eval_strategy="steps",
        eval_steps=500,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
    )

    trainer = Trainer(
        model=model,
        args=TrainingArguments(**hyperparameters),
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics,
    )
    trainer.train()

    # NOTE(review): relies on load_best_model_at_end to have restored the best
    # checkpoint into `model` before it is saved - confirm.
    model_path = f"{output_dir}/best_model"
    for artefact in (model, tokenizer):
        artefact.save_pretrained(model_path)

In [8]:
# Load the review split, then tokenize both halves with the BERT tokenizer.
train_df, test_df = load_data('/kaggle/input/Reviews.csv')
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
train_dataset, test_dataset = (
    tokenize_data(tokenizer, split['Text'], split['Sentiment'])
    for split in (train_df, test_df)
)

def data_stats(data_df):
    """Print corpus statistics for a frame with ``Text`` and ``Sentiment`` columns.

    Reports the number of rows, the mean/median/min/max text length in
    characters, the number of distinct whitespace-separated tokens, and how
    many rows carry each sentiment id (0 negative, 1 neutral, 2 positive).
    """
    review_count = len(data_df)
    print(f"Total number of reviews: {review_count}")

    char_counts = data_df['Text'].map(len)
    print(f"Average review length: {char_counts.mean():.2f} characters")

    # Joining with a space and splitting on whitespace yields every token of
    # every review; the set keeps the distinct ones.
    vocabulary = set(' '.join(data_df['Text']).split())
    print(f"Vocabulary size: {len(vocabulary)}")

    middle, shortest, longest = char_counts.median(), char_counts.min(), char_counts.max()
    print(f"Median review length: {middle} characters")
    print(f"Minimum review length: {shortest} characters")
    print(f"Maximum review length: {longest} characters")

    sentiment_values = list(data_df['Sentiment'])
    negative_count, neutral_count, positive_count = (
        sum(1 for value in sentiment_values if value == class_id)
        for class_id in (0, 1, 2)
    )
    print(f'Negative Count: {negative_count}')
    print(f'Neutral Count: {neutral_count}')
    print(f'Positive Count: {positive_count}')

# Report the same statistics for both halves of the split.
for heading, frame in (
    ("Training Data Statistics:", train_df),
    ("\nTesting Data Statistics:", test_df),
):
    print(heading)
    data_stats(frame)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Training Data Statistics:
Total number of reviews: 42000
Average review length: 435.00 characters
Vocabulary size: 131319
Median review length: 305.0 characters
Minimum review length: 33 characters
Maximum review length: 10327 characters
Negative Count: 6306
Neutral Count: 3317
Positive Count: 32377

Testing Data Statistics:
Total number of reviews: 28000
Average review length: 436.48 characters
Vocabulary size: 102562
Median review length: 307.0 characters
Minimum review length: 44 characters
Maximum review length: 16952 characters
Negative Count: 4231
Neutral Count: 2221
Positive Count: 21548


In [9]:
# bert_model_type = 'bert-base-uncased'
# bert_cnn_config = BertConfig.from_pretrained(bert_model_type, num_labels=3)
# bert_cnn_model = BERTCNN(config=bert_cnn_config)
# train_model(bert_cnn_model, train_dataset, test_dataset, tokenizer, './bert_cnn_results')

In [10]:
# Fine-tune the BERT + BiLSTM classifier on the three sentiment classes.
bert_model_type = "bert-base-uncased"
# num_labels=3 matches the class ids 0/1/2 produced by map_score_to_sentiment.
trans_blstm_config = BertConfig.from_pretrained(bert_model_type, num_labels=3)
# NOTE(review): TransBLSTM.__init__ already calls
# BertModel.from_pretrained("bert-base-uncased"), so going through
# from_pretrained here loads the encoder checkpoint a second time - confirm
# this is intended.
trans_blstm_model = TransBLSTM.from_pretrained(bert_model_type, config=trans_blstm_config)
# Checkpoints and the final best_model directory go under ./trans_blstm_model.
train_model(trans_blstm_model, train_dataset, test_dataset, tokenizer, "./trans_blstm_model")



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of TransBLSTM were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['blstm.bias_hh_l0', 'blstm.bias_hh_l0_reverse', 'blstm.bias_ih_l0', 'blstm.bias_ih_l0_reverse', 'blstm.weight_hh_l0', 'blstm.weight_hh_l0_reverse', 'blstm.weight_ih_l0', 'blstm.weight_ih_l0_reverse', 'classifier.bias', 'classifier.weight', 'layer_norm.bias', 'layer_norm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,F1
500,0.4151,0.387679,0.560522
1000,0.2799,0.31935,0.691528
1500,0.2692,0.300247,0.696126
2000,0.2357,0.304391,0.725081
2500,0.2742,0.338783,0.650696
3000,0.2592,0.311768,0.737709
3500,0.234,0.330907,0.702581
4000,0.1927,0.316016,0.736578
4500,0.0794,0.332638,0.746636
5000,0.226,0.278982,0.754214


In [11]:
# roberta_model_type = 'cardiffnlp/twitter-roberta-base-sentiment'
# roberta_tokenizer = AutoTokenizer.from_pretrained(roberta_model_type)
# roberta_train_dataset = tokenize_data(roberta_tokenizer,  train_df['Text'], train_df['Sentiment'])
# roberta_test_dataset = tokenize_data(roberta_tokenizer,  test_df['Text'], test_df['Sentiment'])

# roberta_model = RoBERTa(model_type=roberta_model_type).get_model()
# train_model(roberta_model, roberta_train_dataset, roberta_test_dataset, roberta_tokenizer, './roberta_results')

In [24]:
import numpy as np

In [25]:
def test_model(model, test_dataset, batch_size=64):
    """Run ``model`` over ``test_dataset`` and return the raw output plus class ids.

    Args:
        model: Model to run inference with.
        test_dataset: Dataset passed to ``Trainer.predict``.
        batch_size: Per-device batch size used for prediction; the default
            matches the evaluation batch size used by ``train_model``.

    Returns:
        ``(result, prediction)`` where ``result`` is the object returned by
        ``Trainer.predict`` and ``prediction`` is the arg-max of
        ``result.predictions`` over axis 1.
    """
    # Explicit arguments keep prediction consistent with training: no external
    # experiment reporting, and the same evaluation batch size.
    predict_args = TrainingArguments(
        output_dir="tmp_trainer",
        report_to="none",
        per_device_eval_batch_size=batch_size,
    )
    trainer = Trainer(model=model, args=predict_args)
    result = trainer.predict(test_dataset)
    prediction = np.argmax(result.predictions, axis=1)
    return result, prediction

In [14]:
# bert_cnn_result, bert_cnn_preds = test_model(bert_cnn_model, test_dataset)

In [26]:
# Predict on the held-out split with the fine-tuned TransBLSTM; keeps both the
# raw Trainer output and the arg-max class ids.
trans_blstm_result, trans_blstm_preds = test_model(trans_blstm_model, test_dataset)

In [None]:
# roberta_result, roberta_preds = test_model(roberta_model, roberta_test_dataset)

In [32]:
def compare(model_result, model_preds, model_type, labels=None):
    """Print per-class prediction, ground-truth and misprediction counts for one model.

    Args:
        model_result: Per-example model outputs. Only its length matters: it
            is zipped with ``model_preds`` to bound the iteration.
        model_preds: Predicted class id (0, 1 or 2) for each example.
        model_type: Heading printed before the report.
        labels: Optional true class ids aligned with ``model_preds``. When
            omitted, the global ``test_df['Sentiment']`` column is used, which
            is what this function did before the parameter existed.

    The "False Positives" line gives, per class, the number of examples
    predicted as that class while truly belonging to another one, followed by
    that count as a percentage of the true examples of the class. The
    percentage is printed as ``nan`` when the class has no true examples,
    instead of raising ``ZeroDivisionError``.
    """
    print(model_type)
    if labels is None:
        # Backward-compatible fallback to the notebook-level test split.
        labels = test_df['Sentiment']
    truth = list(labels)

    cases = ['negative', 'neutral', 'positive']
    predictions_map = {case: [] for case in cases}
    truth_map = {case: [] for case in cases}
    falsy_map = {case: [] for case in cases}

    for i, (_, pred) in enumerate(zip(model_result, model_preds)):
        score = truth[i]
        truth_map[cases[score]].append(i)
        predictions_map[cases[pred]].append(i)
        # A mismatch counts as a false positive of the predicted class.
        if score != pred:
            falsy_map[cases[pred]].append(i)

    def _false_positive_pct(case):
        # Mispredictions relative to the number of true examples of `case`.
        true_total = len(truth_map[case])
        if not true_total:
            return float('nan')
        return len(falsy_map[case]) / true_total * 100

    print("Predictions")
    print(f'Negative:{len(predictions_map[cases[0]])} | Neutral: {len(predictions_map[cases[1]])} | Positive: {len(predictions_map[cases[2]])}')
    print("============\n")
    print("Truth")
    print(f'Negative:{len(truth_map[cases[0]])} | Neutral: {len(truth_map[cases[1]])} | Positive: {len(truth_map[cases[2]])}')
    print("============\n")
    print("False Positives")
    print(f'Negative:{len(falsy_map[cases[0]])} ({_false_positive_pct(cases[0])})| Neutral: {len(falsy_map[cases[1]])} ({_false_positive_pct(cases[1])})| Positive: {len(falsy_map[cases[2]])} ({_false_positive_pct(cases[2])})')

    print("============\n\n\n")

In [33]:
# compare(bert_cnn_result.predictions, bert_cnn_preds,'bert-cnn')
# compare(roberta_result.predictions, roberta_preds, roberta_model_type)
# Only the TransBLSTM run is reported; the two calls above belong to the
# BERT-CNN and RoBERTa cells that are commented out.
compare(trans_blstm_result.predictions, trans_blstm_preds, 'trans-blstm')

trans-blstm
Predictions
Negative:4125 | Neutral: 2257 | Positive: 21618

Truth
Negative:4231 | Neutral: 2221 | Positive: 21548

False Positives
Negative:701 (16.568187189789647)| Neutral: 1106 (49.797388563710044)| Positive: 881 (4.088546500835345)





In [4]:
# NOTE(review): the archive is called roberta_results.zip but it packs the
# TransBLSTM best_model directory; if it is renamed, the FileLink cell that
# references the same file name must change with it.
!zip -r roberta_results.zip /kaggle/working/trans_blstm_model/best_model/


  adding: kaggle/working/trans_blstm_model/best_model/ (stored 0%)
  adding: kaggle/working/trans_blstm_model/best_model/model.safetensors (deflated 7%)
  adding: kaggle/working/trans_blstm_model/best_model/tokenizer.json (deflated 71%)
  adding: kaggle/working/trans_blstm_model/best_model/config.json (deflated 50%)
  adding: kaggle/working/trans_blstm_model/best_model/tokenizer_config.json (deflated 76%)
  adding: kaggle/working/trans_blstm_model/best_model/special_tokens_map.json (deflated 42%)
  adding: kaggle/working/trans_blstm_model/best_model/vocab.txt (deflated 53%)


In [5]:
from IPython.display import FileLink
# Show a download link for the archive written by the zip cell; the path is
# relative to the current working directory.
FileLink(r'roberta_results.zip')