In [None]:
import os
import torch
from tqdm import tqdm

from utils.load_dotenv import load_dotenv
from utils.load_config import load_config
from llms.huggingface_client import HuggingFaceClient
from preprocessors.weather_preprocessor import WeatherPreprocessor

In [None]:
load_dotenv()


def _require_env(name):
    """Return the value of environment variable ``name``, failing loudly if it is unset.

    Args:
        name: Name of the environment variable to read.

    Returns:
        The variable's string value.

    Raises:
        RuntimeError: If the variable is not defined in the environment.
    """
    value = os.getenv(name)
    if value is None:
        raise RuntimeError(
            f"Environment variable {name!r} is not set; "
            "set it before running this notebook."
        )
    return value


# Resolve every directory up front so a missing variable fails in this cell
# with a clear message, rather than as a TypeError inside os.path.join or
# deep inside a later (expensive) preprocessing/training cell.
CONFIGS_DIR = _require_env("CONFIGS_DIR")
DATASETS_DIR = _require_env("DATASETS_DIR")
PROCESSED_DIR = _require_env("PROCESSED_DIR")

# Raw weather CSV consumed by the preprocessing cell.
WEATHER_DATASET = os.path.join(DATASETS_DIR, "weather_sj_2023_2024.csv")

In [None]:
# Report whether PyTorch can see a CUDA device before any heavy work starts.
cuda_ready = torch.cuda.is_available()
print(f"GPU availability: {cuda_ready}")

# Register tqdm's pandas integration for progress-reporting pandas operations.
tqdm.pandas()

In [None]:
# Preprocessing
# Build a WeatherPreprocessor around the raw weather CSV, run its cleaning
# step, then generate the JSONL artifact that the training cell tokenizes.

preprocessor = WeatherPreprocessor(WEATHER_DATASET)
# NOTE(review): the return value is discarded, so this presumably updates the
# preprocessor's internal state in place — confirm against WeatherPreprocessor.
preprocessor.clean_dataset()

# NOTE(review): presumably writes a JSONL file under PROCESSED_DIR and returns
# its path — confirm; the value is passed on as `jsonl_file` during tokenization.
jsonl_path = preprocessor.generate_jsonl(PROCESSED_DIR)

In [None]:
# Load Model
# Read the Mistral-7B YAML config, build the Hugging Face client from it and
# load the client before tokenizing, since tokenization needs its tokenizer.

config_path = os.path.join(CONFIGS_DIR, "mistral7b_config.yaml")
hf_config = load_config(config_path)

hf_client = HuggingFaceClient(config=hf_config)

hf_client.load()

# Tokenize the JSONL produced by the preprocessing cell with the client's
# tokenizer.
tokenized_dataset = preprocessor.tokenize_dataset(hf_client.tokenizer, jsonl_file=jsonl_path)

# Hold out 5% of the examples for validation. The split is seeded so that the
# same examples land in train/validation on every Restart & Run All; without a
# seed the validation set changes per run and metrics are not comparable.
# NOTE(review): assumes `tokenized_dataset` is a Hugging Face `datasets.Dataset`,
# whose `train_test_split` accepts `seed` — confirm against tokenize_dataset.
SPLIT_SEED = 42
train_test = tokenized_dataset.train_test_split(test_size=0.05, seed=SPLIT_SEED)
train_dataset = train_test["train"]
val_dataset = train_test["test"]

hf_client.train(train_dataset, val_dataset)