In [1]:
import pandas as pd
from datasets import Dataset
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Load dataset
df = pd.read_csv("tarot_readings_with_sentiments.csv")  # replace with your file path

# Keep only relevant columns
df = df[['question', 'question_sentiment']].dropna()

# Normalize sentiment labels
df['question_sentiment'] = df['question_sentiment'].str.strip().str.lower()

# Map to numeric labels
label_map = {"negative": 0, "neutral": 1, "positive": 2}
df['label'] = df['question_sentiment'].map(label_map)

# Drop rows with unmapped or invalid labels
df = df.dropna(subset=['label'])
df['label'] = df['label'].astype(int)

# Optional: check balance
print(df['label'].value_counts())


label
1    9757
0    1736
2     497
Name: count, dtype: int64


In [5]:
# Only keep necessary columns
hf_dataset = Dataset.from_pandas(df[['question', 'label']])


In [6]:
# Load tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

# Define tokenization function
def tokenize_function(example):
    tokens = tokenizer(example["question"], padding="max_length", truncation=True, max_length=128)
    tokens["label"] = example["label"]
    return tokens

# Apply tokenization
tokenized_dataset = hf_dataset.map(tokenize_function)


Map: 100%|██████████| 11990/11990 [00:01<00:00, 7134.02 examples/s]


In [7]:
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.2)


In [8]:
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
training_args = TrainingArguments(
    output_dir="./results",
    do_train=True,
    do_eval=True,
    num_train_epochs=4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,
    save_steps=100,
    eval_steps=100,
    save_total_limit=2
)


In [10]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average='weighted')
    }


In [11]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


  trainer = Trainer(


In [12]:
trainer.train()


Step,Training Loss
100,0.3888
200,0.0542
300,0.0043
400,0.002
500,0.0012
600,0.0009
700,0.0006
800,0.0072
900,0.0004
1000,0.0004


TrainOutput(global_step=2400, training_loss=0.0192588037198099, metrics={'train_runtime': 880.2668, 'train_samples_per_second': 43.587, 'train_steps_per_second': 2.726, 'total_flos': 1270649947742208.0, 'train_loss': 0.0192588037198099, 'epoch': 4.0})

In [13]:
trainer.save_model("models/distilbert-sentiment-tarot")
tokenizer.save_pretrained("models/distilbert-sentiment-tarot")


('models/distilbert-sentiment-tarot\\tokenizer_config.json',
 'models/distilbert-sentiment-tarot\\special_tokens_map.json',
 'models/distilbert-sentiment-tarot\\vocab.txt',
 'models/distilbert-sentiment-tarot\\added_tokens.json',
 'models/distilbert-sentiment-tarot\\tokenizer.json')

In [14]:
import torch

# Automatically use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  # Move model to correct device

def predict_sentiment(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    inputs = {k: v.to(device) for k, v in inputs.items()}  # Move inputs to same device
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
        predicted_class = outputs.logits.argmax().item()
    return {0: "negative", 1: "neutral", 2: "positive"}[predicted_class]

# Example
print(predict_sentiment("Will I find love this year?"))


positive


In [17]:
import torch
import pandas as pd
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification

# Load model and tokenizer
model_path = "models/distilbert-sentiment-tarot"  # path to your fine-tuned model
tokenizer = DistilBertTokenizerFast.from_pretrained(model_path)
model = DistilBertForSequenceClassification.from_pretrained(model_path)

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Load original dataset
df = pd.read_csv("tarot_readings_combined.csv")  # replace with actual file path

# Drop rows with missing question or reading
df = df.dropna(subset=['question', 'reading'])

# Define prediction function
def predict_sentiment(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
        predicted_class = outputs.logits.argmax().item()
    return {0: "negative", 1: "neutral", 2: "positive"}[predicted_class]


In [18]:
# Apply sentiment model to both question and reading
df['question_predicted_sentiment'] = df['question'].apply(predict_sentiment)
df['reading_predicted_sentiment'] = df['reading'].apply(predict_sentiment)


In [19]:
df.to_csv("tarot_with_sentiment_predictions.csv", index=False)
print("✅ Saved to tarot_with_sentiment_predictions.csv")


✅ Saved to tarot_with_sentiment_predictions.csv
