In [3]:
!pip install -q transformers kagglehub zarr datasets gcsfs peft accelerate bitsandbytes xarray googletrans==4.0.0-rc1 netCDF4 netcdf4 scipy store

In [15]:
from datasets import load_dataset
from googletrans import Translator
climate_ds = load_dataset("climate_fever", split="test[:100]")
translator = Translator()
def translate_and_tokenize(example):
    translated = translator.translate(example["claim"], dest="hi").text
    return {"text": f"दावा: {translated}\nनिष्कर्ष:"}

climate_translated = climate_ds.map(translate_and_tokenize)
import xarray as xr
from datasets import Dataset


precip_url = "https://psl.noaa.gov/thredds/dodsC/Datasets/cpc_us_precip/precip.V1.0.mon.mean.nc"
nasa_ds = xr.open_dataset(precip_url)

nasa_samples = [
    {"text": f"Precipitation at t={i} is {val.values.mean():.2f} mm"}
    for i, val in enumerate(nasa_ds["precip"][:10])
]

nasa_dataset = Dataset.from_list(nasa_samples)
# Import necessary libraries
import xarray as xr
import numpy as np

import kagglehub

# Download latest version
path = kagglehub.dataset_download("thedevastator/weather-prediction")

print("Path to dataset files:", path)
from datasets import load_dataset

ds = load_dataset("Nursultan2003ch/weather")
print("Dataset loaded successfully!")
from datasets import concatenate_datasets
climate_translated = climate_ds.map(translate_and_tokenize)
merged_dataset = concatenate_datasets([
    climate_translated,
    ds["train"],
    nasa_dataset,
])
from transformers import AutoTokenizer

model_name = "tiiuae/falcon-7b"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

def tokenize(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=512
    )

tokenized_dataset = merged_dataset.map(tokenize, batched=True)
import torch
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    device_map="auto",
    quantization_config=quant_config
)

lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["query_key_value"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

training_args = TrainingArguments(
    output_dir="./MeghaNetra",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    num_train_epochs=1,
    learning_rate=2e-5,
    save_strategy="epoch",
    logging_steps=5,
    fp16=False,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
    tokenizer=tokenizer
)

trainer.train()
trainer.save_model("./MeghaNetra")
import shutil
shutil.make_archive("MeghaNetra", 'zip', "./MeghaNetra")    


Map: 100%|██████████| 100/100 [00:18<00:00,  5.51 examples/s]


Path to dataset files: /home/codespace/.cache/kagglehub/datasets/thedevastator/weather-prediction/versions/2
Dataset loaded successfully!


Map: 100%|██████████| 100/100 [00:18<00:00,  5.48 examples/s]
Map:   0%|          | 0/27635873 [00:00<?, ? examples/s]


TypeError: TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]]