In [1]:
import pandas as pd
import numpy as np
from datasets import load_dataset
import transformers
import torch

In [2]:
import nltk
nltk.download('words')
words = set(nltk.corpus.words.words())

[nltk_data] Downloading package words to /Users/jyotsana/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [58]:
data = pd.read_csv("dataset_2/train.csv", nrows=100)

In [59]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Index   100 non-null    int64 
 1   Review  100 non-null    object
 2   label   100 non-null    int64 
dtypes: int64(2), object(1)
memory usage: 2.5+ KB


In [49]:
data.head()

Unnamed: 0,Index,Review,label
0,0,<html>.This is a very short review😁 😅 .\n\nThi...,1
1,1,Decidí probar Galloping Groomers porque han es...,1
2,2,This store certainly indulges my out of contro...,1
3,3,Bugs in my salad and vomit in the bathroom uri...,0
4,4,<br>I have a dog walking business and this par...,1


In [60]:
data.label.value_counts()

label
0    53
1    47
Name: count, dtype: int64

In [50]:
def remove_htmltags(df):
  df['Review'] = df['Review'].str.replace(r'<[^<>]*>', '', regex=True)
  return df

data = remove_htmltags(data)

In [52]:
punctuations = "!"#$%&'()*+, -/:;<=>?@[\]^_`{|}~"

def remove_punctuations(text):
    return text.translate(str.maketrans("","", punctuations))

data["Review"] = data["Review"].apply(remove_punctuations)

In [53]:
data = data.replace(r'^\s*$', np.nan, regex=True)

In [54]:
data.isna().any()

Index     False
Review    False
label     False
dtype: bool

In [56]:
data[['Review', 'label']].to_csv("dataset_2/cleaned_train.csv", index=False)

In [3]:
data = load_dataset("csv", data_files="dataset_2/cleaned_train.csv", split="train", nrows=100)

In [4]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
# from datasets import train_test_split

In [5]:
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

In [7]:
dataset = data.train_test_split(test_size=0.2, shuffle=True)
dataset["train"].shape

(80, 2)

In [57]:
BATCH_SIZE=16
MAX_LENGTH=512
OUT_DIR="analysis"
EPOCHS = 10

In [43]:
from torch.utils.data import DataLoader

def custom_collate_fn(examples):
    inputs = [f"Sentiment Analysis: {article}" for article in examples['Review']]
    # model_inputs = tokenizer(
    #     inputs,
    #     max_length=MAX_LENGTH,
    #     truncation=True,
    #     padding='max_length'
    # )
    
    # print(model_inputs)
    labels= torch.tensor(examples["label"], dtype=float.long)

    return {
        "tokenized_input": inputs,
        "label": labels
    }

In [54]:
train_loader = DataLoader(data, batch_size=2, shuffle=False)

In [55]:
len(train_loader)
# len(sample)

50

In [56]:
next(iter(train_loader))

{'Review': [".This is a very short review😁 😅 .\\n\\nThis place served me the best meal I've had in years, and I'm enormously happy it's open; it's spendy, but damn, if you're going out for a good dinner, this is the place I'd recommend, hands down.\\n\\nPhenomenal.  Perfect.  No fault..  Awesome😒 ",
  'Decidí probar Galloping Groomers porque han estado en el negocio durante mucho tiempo y deben ser buenos😇 . Bueno, estaba extremadamente decepcionado y no solo le dieron a mi Bichon quemaduras de navaja en su vientre, sino que el corte no fue uniforme. se veía terrible'],
 'label': tensor([1, 1])}

In [None]:
training_args = TrainingArguments(
    output_dir=OUT_DIR,

    num_train_epochs=EPOCHS,
    max_steps = -1, # if set will overwrite epochs
    dataloader_num_workers=2,

    ###### Memory optimization
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=1,
    gradient_checkpointing = False,
    bf16=False, # Needs Ampere
    fp16=False,
    # deepspeed=False,
    # fsdp=False,
    dataloader_pin_memory = True,

    ###### Better training
    lr_scheduler_type="linear", # check SchedulerType
    warmup_steps = 200,
    weight_decay=0.01,
    learning_rate=0.001,
    eval_steps=200,
    evaluation_strategy='epoch',

    # logging_dir=OUT_DIR,
    save_strategy='epoch',
    logging_steps=200,
    save_total_limit=1,
    # report_to='tensorboard',
    save_safetensors=True, # To save state_dicts instead of whole,
    # save_only_model= False,

    load_best_model_at_end = True,
    metric_for_best_model= "loss",
    greater_is_better=False,
    resume_from_checkpoint = False,
    use_cpu = False
)

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
)

In [None]:
history = trainer.train()

In [58]:
test_df = load_dataset("csv",data_files="dataset_2/test.csv", nrows=100)

Generating train split: 0 examples [00:00, ? examples/s]

In [59]:
def inference(df):
    inputs=[f"Sentiment Analysis: {article}" for article in df['Review']]
    model_inputs = tokenizer(
        inputs,
        max_length=MAX_LENGTH,
        truncation=True,
        padding='max_length'
    )
    output = model.generate(model_inputs)
    print(output)

In [None]:
import sklearn

sklearn.metrics.f1_score(label,pred)