In [None]:
!pip install transformers

In [None]:
import inspect

import pandas as pd
import torch
from datasets import Dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, TrainingArguments, Trainer

# Step 1: Load your custom dataset
# Read the CSV into a DataFrame first, then wrap it as a Hugging Face Dataset
# so the later steps can use `map`, `shuffle` and `select` on it.
df = pd.read_csv(filepath_or_buffer="both_likes_dislikes.csv")
dataset = Dataset.from_pandas(df=df)

# Step 2: Tokenize the dataset
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")


def tokenize_function(examples):
    """Tokenize the "text" column of a batch of examples.

    Args:
        examples: A batch from the dataset; its "text" entry is passed to the
            tokenizer as-is.

    Returns:
        The tokenizer output for the batch, padded with the "max_length"
        strategy and truncated where necessary.
    """
    batch_texts = examples["text"]
    encoded_batch = tokenizer(batch_texts, truncation=True, padding="max_length")
    return encoded_batch


# batched=True hands the function many rows at once instead of one at a time.
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Step 3: Split the dataset
# Shuffle once (fixed seed for reproducibility) and carve both splits out of
# the same permutation: the first 80% of rows for training, the rest for
# evaluation. Shuffling a single time guarantees the two splits are disjoint
# and avoids computing the permutation twice.
shuffled_dataset = tokenized_dataset.shuffle(seed=42)
split_index = int(0.8 * len(shuffled_dataset))
train_dataset = shuffled_dataset.select(range(split_index))
test_dataset = shuffled_dataset.select(range(split_index, len(shuffled_dataset)))

# Step 4: Load the pre-trained model
# num_labels=2 configures the classification head for two classes.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
)

# Step 5: Set up the training arguments
# The per-epoch evaluation keyword was renamed from `evaluation_strategy` to
# `eval_strategy` in newer transformers releases, and the old spelling was
# later removed. Since the install above is unpinned, pick whichever keyword
# the installed version actually accepts instead of hard-coding one.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",  # checkpoints and logs are written here
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    **{_eval_strategy_key: "epoch"},  # evaluate at the end of every epoch
)

# Step 6: Set up the Trainer
def compute_metrics(eval_pred):
    """Compute classification accuracy for an evaluation pass.

    Args:
        eval_pred: The (logits, labels) pair the Trainer passes to its
            `compute_metrics` callback; both are expected to be array-like
            with the class dimension last in `logits`.

    Returns:
        A dict with a single "accuracy" entry: the fraction of examples whose
        highest-scoring class equals the reference label.
    """
    logits, labels = eval_pred
    predicted_classes = logits.argmax(axis=-1)
    return {"accuracy": float((predicted_classes == labels).mean())}


# Without compute_metrics the Trainer only reports the evaluation loss, which
# says little about how well a classifier actually performs.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# Step 7: Train the model
# Runs fine-tuning with the arguments and training split given to the Trainer.
trainer.train()

# Step 8: Evaluate the model
# Evaluates on the eval_dataset the Trainer was constructed with and prints
# whatever metrics it returns.
results = trainer.evaluate()
print(results)

# Step 9: Make predictions
texts = ["I love this movie!", "I hate this movie."]

# After training, the model lives on whatever device the Trainer used (a GPU
# when one is available), while freshly tokenized tensors are created on the
# CPU. Move the inputs to the model's device to avoid a device-mismatch error.
inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt").to(model.device)

# Inference only: switch off dropout and skip gradient bookkeeping.
model.eval()
with torch.no_grad():
    outputs = model(**inputs)

# Pick the highest-scoring class per text; bring the result back to the CPU
# so it prints the same way regardless of where the model ran.
predictions = outputs.logits.argmax(dim=-1).cpu()
print(predictions)
