In [1]:
!pip install -U transformers
!pip install -U datasets

Collecting transformers
  Downloading transformers-4.48.1-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.48.1-py3-none-any.whl (9.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m28.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.47.1
    Uninstalling transformers-4.47.1:
      Successfully uninstalled transformers-4.47.1
Successfully installed transformers-4.48.1
Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multi

In [2]:
import torch
import numpy as np
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    TrainingArguments,
    Trainer
)
from datasets import load_dataset

In [50]:
import yfinance as yf
import pandas as pd

# Download ten years of daily AAPL bars.
# auto_adjust is passed explicitly because its default differs between
# yfinance releases; True gives split/dividend-adjusted closes, which matches
# the values in the recorded output (about 17.23 on 2014-01-02).
data = yf.download(
    "AAPL",
    start="2014-01-01",
    end="2024-01-01",
    interval="1d",
    auto_adjust=True,
)

# A failed download can come back as an empty frame instead of an exception;
# fail here rather than silently building an empty dataset further down.
if data.empty:
    raise RuntimeError(
        "yfinance returned no rows for AAPL; check network access or the date range"
    )

# When the columns come back as a MultiIndex, drop the second level (the ticker)
# so that plain field names such as "Close" can be used.
if isinstance(data.columns, pd.MultiIndex):
    data.columns = data.columns.droplevel(1)

# Keep only the closing price; missing values are dropped so they can never be
# rendered as "nan" inside a prompt.
closing_prices = data["Close"].dropna()

# Flat two-column frame (date, closing price) with a fresh 0..n-1 index.
df = pd.DataFrame({
    "Date": closing_prices.index,
    "Close": closing_prices.to_numpy(),
})

print(df.head())  # Sanity-check the layout

[*********************100%***********************]  1 of 1 completed

        Date      Close
0 2014-01-02  17.234295
1 2014-01-03  16.855724
2 2014-01-06  16.947647
3 2014-01-07  16.826441
4 2014-01-08  16.933002





In [51]:
WINDOW_SIZE = 30  # Number of consecutive days used to predict the following one


def build_window_rows(prices, window_size):
    """Turn a price sequence into (prompt, target) text pairs with a sliding window.

    Args:
        prices: Iterable of numeric closing prices in chronological order.
        window_size: Number of consecutive prices placed in each prompt.

    Returns:
        A list of dicts with keys "input_text" (prompt listing ``window_size``
        prices formatted with two decimals) and "target_text" (the price that
        immediately follows the window, also with two decimals). The list is
        empty when there are not more than ``window_size`` prices.
    """
    values = list(prices)
    # Derive the day count in the prompt from window_size so the text can never
    # disagree with the actual number of prices listed.
    prompt_prefix = f"Predict the next day price given these {window_size} days: "

    window_rows = []
    for start in range(len(values) - window_size):
        window = values[start : start + window_size]
        target = values[start + window_size]
        window_rows.append({
            "input_text": prompt_prefix + ", ".join(f"{x:.2f}" for x in window),
            "target_text": f"{target:.2f}",
        })
    return window_rows


# Build one example per window position over the closing-price column
rows = build_window_rows(df["Close"], WINDOW_SIZE)

# Explicit columns keep the CSV header valid even if no window could be built
processed_df = pd.DataFrame(rows, columns=["input_text", "target_text"])

# Persist the dataset as CSV
processed_df.to_csv("apple_30to1.csv", index=False)

print(processed_df.head())  # Sanity-check the layout

                                          input_text target_text
0  Predict the next day price given these 30 days...       17.05
1  Predict the next day price given these 30 days...       17.11
2  Predict the next day price given these 30 days...       16.84
3  Predict the next day price given these 30 days...       16.65
4  Predict the next day price given these 30 days...       16.46


In [54]:
# Load the windowed examples from the CSV file
dataset = load_dataset("csv", data_files="apple_30to1.csv", split="train")

# Split into train and validation chronologically (first 80% / last 20%).
# Consecutive rows are overlapping windows of the same price series, so a
# shuffled split would place validation targets inside training prompts and
# make the validation score over-optimistic. The rows are in time order
# because they were generated by walking the series from start to end.
full_dataset = dataset.train_test_split(test_size=0.2, shuffle=False)
train_dataset = full_dataset["train"]
val_dataset   = full_dataset["test"]

print(train_dataset)
print(val_dataset)

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['input_text', 'target_text'],
    num_rows: 1988
})
Dataset({
    features: ['input_text', 'target_text'],
    num_rows: 498
})


In [52]:
MODEL = "t5-small"        # Checkpoint name passed to from_pretrained
BATCH_SIZE = 4
NUM_PROCS = 4
EPOCHS = 10
OUR_DIR = "t5_small_FT"   # NOTE(review): presumably the output directory ("OUT_DIR"?); name kept for other cells
# Token budget used by preprocess_function for both prompts and targets.
# It was read there without being defined in any visible cell, which breaks
# Restart & Run All. 512 is assumed here (believed to be the t5-small
# tokenizer limit, so no prompt is truncated) — TODO confirm the value
# originally used.
MAX_LENGTH = 512

In [53]:
# Load the tokenizer that belongs to the checkpoint named in MODEL
tokenizer = T5Tokenizer.from_pretrained(pretrained_model_name_or_path=MODEL)

In [56]:
def preprocess_function(examples):
    """Tokenize a batch of prompt/target pairs for seq2seq fine-tuning.

    Args:
        examples: Batch mapping with "input_text" and "target_text" columns,
            each a list with one entry per example.

    Returns:
        The tokenizer output for the prompts with an extra "labels" entry
        holding the tokenized targets. Both are truncated and padded to
        MAX_LENGTH; padding positions in the labels are set to -100 so they
        are ignored by the loss.
    """
    # Force everything to str: values may arrive as non-string types.
    # NOTE(review): if the CSV loader inferred target_text as float, str()
    # drops trailing zeros ("17.10" -> "17.1") — confirm the column dtype.
    inputs = [str(x) for x in examples["input_text"]]
    targets = [str(x) for x in examples["target_text"]]

    # Tokenize the prompts
    model_inputs = tokenizer(
        inputs,
        max_length=MAX_LENGTH,
        truncation=True,
        padding="max_length"
    )

    # Tokenize the targets; text_target replaces the deprecated
    # as_target_tokenizer() context manager.
    labels = tokenizer(
        text_target=targets,
        max_length=MAX_LENGTH,
        truncation=True,
        padding="max_length"
    )

    # Because the labels are padded to a fixed length, mask the pad tokens
    # with -100; otherwise the loss is dominated by predicting padding.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token if token != pad_id else -100 for token in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs

# Run the batched tokenization over the training split first, then validation
train_tokenized, val_tokenized = (
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, val_dataset)
)

Map:   0%|          | 0/1988 [00:00<?, ? examples/s]

Map:   0%|          | 0/498 [00:00<?, ? examples/s]