In [1]:
# Step 1: Install necessary libraries (run in terminal or notebook if needed)
# !pip install datasets transformers sentence-transformers pandas scikit-learn numpy torch

# Step 2: Import required libraries
import pandas as pd
import re
from datasets import load_dataset
from transformers import AutoTokenizer, T5ForConditionalGeneration
from sentence_transformers import SentenceTransformer, util
from sklearn.model_selection import train_test_split
import torch
import numpy as np

# Step 3: Define utility functions

def clean_text(text):
    """
    Clean text by converting to lowercase, removing extra spaces, and special characters.
    Args:
        text (str): Input text to clean.
    Returns:
        str: Cleaned text.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower().strip()  # Convert to lowercase and remove leading/trailing spaces
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with single space
    text = re.sub(r'[^\w\s.,!?]', '', text)  # Remove special characters except punctuation
    return text

def filter_fitness_relevance(df, question_col, keywords):
    """
    Filter dataset to keep only fitness-related questions based on keywords.
    Args:
        df (pd.DataFrame): Input dataframe with question column.
        question_col (str): Name of the question column.
        keywords (list): List of fitness-related keywords.
    Returns:
        pd.DataFrame: Filtered dataframe.
    """
    pattern = '|'.join(keywords)
    return df[df[question_col].str.contains(pattern, case=False, na=False)]

def paraphrase_question(question, model, tokenizer, device='cpu'):
    """
    Generate a paraphrased version of the input question using T5.
    Args:
        question (str): Original question.
        model: T5 model for paraphrasing.
        tokenizer: T5 tokenizer.
        device (str): Device to run model on ('cpu' or 'cuda').
    Returns:
        str: Paraphrased question or original if paraphrasing fails.
    """
    try:
        input_text = f"paraphrase: {question}"
        inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True).to(device)
        outputs = model.generate(
            inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_length=512,
            num_beams=5,
            no_repeat_ngram_size=2,  # Prevent repetitive phrases
            early_stopping=True
        )
        paraphrased = tokenizer.decode(outputs[0], skip_special_tokens=True)
        # Ensure paraphrase is different from original
        if paraphrased.lower().strip() == question.lower().strip():
            return question  # Fallback to original if no change
        return paraphrased
    except Exception as e:
        print(f"Paraphrasing failed for '{question}': {e}")
        return question

def augment_data(df, question_col, answer_col, model, tokenizer, num_augmentations=2, device='cpu'):
    """
    Augment dataset by generating paraphrased questions.
    Args:
        df (pd.DataFrame): Input dataframe with question and answer columns.
        question_col (str): Name of the question column.
        answer_col (str): Name of the answer column.
        model: T5 model for paraphrasing.
        tokenizer: T5 tokenizer.
        num_augmentations (int): Number of paraphrases per question.
        device (str): Device to run model on ('cpu' or 'cuda').
    Returns:
        pd.DataFrame: Augmented dataframe.
    """
    augmented_rows = []
    for _, row in df.iterrows():
        original_question = row[question_col]
        answer = row[answer_col]
        augmented_rows.append({question_col: original_question, answer_col: answer})
        for _ in range(num_augmentations):
            paraphrased_question = paraphrase_question(original_question, model, tokenizer, device)
            augmented_rows.append({question_col: paraphrased_question, answer_col: answer})
    return pd.DataFrame(augmented_rows)

def tokenize_data(row, question_col, answer_col, tokenizer, max_length=512):
    """
    Tokenize question-answer pair for T5 model.
    Args:
        row (pd.Series): Dataframe row with question and answer.
        question_col (str): Name of the question column.
        answer_col (str): Name of the answer column.
        tokenizer: Transformers tokenizer.
        max_length (int): Maximum token length.
    Returns:
        dict: Tokenized input and attention mask.
    """
    input_text = f"question: {row[question_col]} answer: {row[answer_col]}"
    return tokenizer(input_text, padding="max_length", truncation=True, max_length=max_length, return_tensors="pt")

def main():
    # Step 4: Load the dataset from Hugging Face
    try:
        dataset = load_dataset("its-myrto/fitness-question-answers")
        df = dataset['train'].to_pandas()
        print(f"Initial dataset size: {len(df)}")
    except Exception as e:
        print(f"Error loading dataset: {e}")
        return

    # Step 5: Drop unnecessary column
    if 'Unnamed: 0' in df.columns:
        df = df.drop(columns=['Unnamed: 0'])
        print(f"Dropped 'Unnamed: 0' column. New columns: {df.columns.tolist()}")

    # Step 6: Define column names
    question_col = 'Question'
    answer_col = 'Answer'

    # Verify column names exist
    if question_col not in df.columns or answer_col not in df.columns:
        print(f"Error: Columns '{question_col}' and/or '{answer_col}' not found in dataset.")
        print("Available columns:", df.columns.tolist())
        return

    # Step 7: Clean the data
    # Remove duplicates
    df = df.drop_duplicates(subset=[question_col, answer_col], keep='first')
    print(f"Rows after removing duplicates: {len(df)}")

    # Remove missing values
    df = df.dropna(subset=[question_col, answer_col])
    print(f"Rows after removing missing values: {len(df)}")

    # Clean questions and answers
    df[question_col] = df[question_col].apply(clean_text)
    df[answer_col] = df[answer_col].apply(clean_text)

    # Step 8: Filter for fitness relevance with expanded keywords
    fitness_keywords = [
        'exercise', 'workout', 'fitness', 'nutrition', 'muscle', 'cardio', 'strength', 
        'yoga', 'running', 'sleep', 'stress', 'recovery', 'flexibility', 'balance', 
        'posture', 'hydration', 'motivation'
    ]
    df = filter_fitness_relevance(df, question_col, fitness_keywords)
    print(f"Rows after filtering for fitness relevance: {len(df)}")

    # Step 9: Data augmentation using T5 for paraphrasing
    if len(df) < 1000:
        print("Augmenting dataset due to small size...")
        try:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
            print(f"Using device: {device}")
            paraphraser = T5ForConditionalGeneration.from_pretrained('t5-base').to(device)
            paraphraser_tokenizer = AutoTokenizer.from_pretrained('t5-base')
            df = augment_data(df, question_col, answer_col, paraphraser, paraphraser_tokenizer, num_augmentations=2, device=device)
            print(f"Rows after augmentation: {len(df)}")
        except Exception as e:
            print(f"Augmentation failed: {e}. Proceeding without augmentation.")
            # Continue without augmentation if it fails

    # Step 10: Split dataset into train and validation
    train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
    print(f"Train size: {len(train_df)}, Validation size: {len(val_df)}")

    # Step 11: Tokenize data for T5 model
    try:
        tokenizer = AutoTokenizer.from_pretrained('t5-small')
        train_tokens = train_df.apply(lambda row: tokenize_data(row, question_col, answer_col, tokenizer), axis=1)
        val_tokens = val_df.apply(lambda row: tokenize_data(row, question_col, answer_col, tokenizer), axis=1)
    except Exception as e:
        print(f"Tokenization failed: {e}")
        return

    # Step 12: Save cleaned and split datasets
    train_df.to_csv('train_cleaned.csv', index=False)
    val_df.to_csv('val_cleaned.csv', index=False)
    print("Cleaned datasets saved as 'train_cleaned.csv' and 'val_cleaned.csv'")

if __name__ == "__main__":
    main()

2025-06-18 02:38:01.570183: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750214281.805864      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750214281.873556      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


README.md:   0%|          | 0.00/203 [00:00<?, ?B/s]

conversational_dataset.csv:   0%|          | 0.00/289k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/965 [00:00<?, ? examples/s]

Initial dataset size: 965
Dropped 'Unnamed: 0' column. New columns: ['Question', 'Answer']
Rows after removing duplicates: 965
Rows after removing missing values: 965
Rows after filtering for fitness relevance: 542
Augmenting dataset due to small size...
Using device: cuda


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Rows after augmentation: 1626
Train size: 1300, Validation size: 326


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Cleaned datasets saved as 'train_cleaned.csv' and 'val_cleaned.csv'


In [2]:
import pandas as pd

def clean_invalid_questions():
    files = ['train_cleaned.csv', 'val_cleaned.csv']
    for file in files:
        try:
            df = pd.read_csv(f"/kaggle/working/{file}")
            print(f"\nProcessing {file}: {len(df)} rows")
            # Remove rows where 'Question' is not a valid question (e.g., 'entailment', 'True')
            df = df[df['Question'].str.contains(r'[a-zA-Z\s]+[?]', na=False)]  # Must contain letters and end with '?'
            print(f"After removing invalid questions: {len(df)} rows")
            # Save cleaned file
            df.to_csv(f"/kaggle/working/{file}", index=False)
            print(f"Saved cleaned {file}")
            # Preview cleaned data
            print(f"Preview of cleaned {file}:")
            print(df.head(5))
        except Exception as e:
            print(f"Error processing {file}: {e}")

if __name__ == "__main__":
    clean_invalid_questions()


Processing train_cleaned.csv: 1300 rows
After removing invalid questions: 834 rows
Saved cleaned train_cleaned.csv
Preview of cleaned train_cleaned.csv:
                                            Question  \
0  what are some nonrunning exercise options that...   
3  how is lowintensity exercise different from hi...   
4               can exercise improve brain function?   
5                 how much cardio should i be doing?   
7               which is better, walking or running?   

                                              Answer  
0  some nonrunning exercise options include using...  
3  lowintensity exercise involves moderate physic...  
4  yes. as blood pumps to the brain during physic...  
5  general recommendations of around 150 minutes ...  
7  walking and running are both excellent forms o...  

Processing val_cleaned.csv: 326 rows
After removing invalid questions: 213 rows
Saved cleaned val_cleaned.csv
Preview of cleaned val_cleaned.csv:
                                

In [3]:
!pip install transformers datasets pandas torch evaluate nltk

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cuspa

In [5]:
# Step 1: Install specific versions of libraries
!pip cache purge
!pip uninstall -y transformers evaluate torch pandas datasets nltk
!pip install transformers==4.45.2 evaluate==0.4.2 torch==2.6.0 pandas==2.2.2 datasets==3.0.1 nltk==3.9.1

# Step 2: Set environment variable to suppress tokenizers warning
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Step 3: Import required libraries
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
import torch
import evaluate
import nltk
from nltk.tokenize import word_tokenize

# Download NLTK data for BLEU score calculation
nltk.download('punkt')

# Step 4: Define utility functions

def prepare_data(df, tokenizer, max_length=512):
    """
    Prepare DataFrame for T5 fine-tuning by tokenizing inputs and targets.
    Args:
        df (pd.DataFrame): DataFrame with 'Question' and 'Answer' columns.
        tokenizer: T5 tokenizer.
        max_length (int): Maximum token length.
    Returns:
        Dataset: Tokenized dataset with input_ids, attention_mask, and labels.
    """
    inputs = [f"question: {row['Question']} answer:" for row in df.to_dict('records')]
    targets = [row['Answer'] for row in df.to_dict('records')]
    
    encodings = tokenizer(
        inputs,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    )
    target_encodings = tokenizer(
        targets,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    )
    
    dataset = {
        'input_ids': encodings['input_ids'],
        'attention_mask': encodings['attention_mask'],
        'labels': target_encodings['input_ids']
    }
    return Dataset.from_dict(dataset)

def compute_metrics(eval_pred):
    """
    Compute BLEU score for evaluation.
    Args:
        eval_pred: Tuple of (predictions, labels) from Trainer.
    Returns:
        dict: BLEU score.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = predictions.argmax(-1) if predictions.ndim == 3 else predictions
    
    predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    bleu = evaluate.load("bleu")
    results = bleu.compute(predictions=predictions, references=[[label] for label in labels])
    return {"bleu": results["bleu"]}

def main():
    # Step 5: Load preprocessed datasets
    try:
        train_df = pd.read_csv('/kaggle/working/train_cleaned.csv')
        val_df = pd.read_csv('/kaggle/working/val_cleaned.csv')
        print(f"Loaded training data: {len(train_df)} rows")
        print(f"Loaded validation data: {len(val_df)} rows")
    except Exception as e:
        print(f"Error loading datasets: {e}")
        return

    # Step 6: Initialize tokenizer and model
    model_name = 't5-small'
    try:
        global tokenizer
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = T5ForConditionalGeneration.from_pretrained(model_name)
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        model.to(device)
        print(f"Using device: {device}")
    except Exception as e:
        print(f"Error initializing model/tokenizer: {e}")
        return

    # Step 7: Prepare datasets
    try:
        train_dataset = prepare_data(train_df, tokenizer)
        val_dataset = prepare_data(val_df, tokenizer)
    except Exception as e:
        print(f"Error preparing datasets: {e}")
        return

    # Step 8: Define training arguments
    training_args = TrainingArguments(
        output_dir='/kaggle/working/t5_fitness_model',
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=16,
        warmup_steps=200,
        weight_decay=0.01,
        logging_dir='/kaggle/working/logs',
        logging_steps=50,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="bleu",
        greater_is_better=True,
        fp16=True
    )

    # Step 9: Initialize Trainer
    try:
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=compute_metrics
        )
    except Exception as e:
        print(f"Error initializing Trainer: {e}")
        return

    # Step 10: Fine-tune the model
    try:
        trainer.train()
        print("Fine-tuning completed")
    except Exception as e:
        print(f"Error during fine-tuning: {e}")
        return

    # Step 11: Save the fine-tuned model
    try:
        trainer.save_model('/kaggle/working/t5_fitness_model_final')
        tokenizer.save_pretrained('/kaggle/working/t5_fitness_model_final')
        print("Fine-tuned model and tokenizer saved to '/kaggle/working/t5_fitness_model_final'")
    except Exception as e:
        print(f"Error saving model: {e}")
        return

    # Step 12: Qualitative evaluation
    test_questions = [
        "how can i improve my running endurance?",
        "what are effective core exercises?",
        "how do i stay motivated for workouts?"
    ]
    model.eval()
    with torch.no_grad():
        for question in test_questions:
            input_text = f"question: {question} answer:"
            inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True).to(device)
            outputs = model.generate(**inputs, max_length=512, num_beams=5)
            answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
            print(f"Question: {question}\nAnswer: {answer}\n")

if __name__ == "__main__":
    main()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Files removed: 158


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Found existing installation: transformers 4.52.4
Uninstalling transformers-4.52.4:
  Successfully uninstalled transformers-4.52.4
Found existing installation: evaluate 0.4.3
Uninstalling evaluate-0.4.3:
  Successfully uninstalled evaluate-0.4.3
Found existing installation: torch 2.7.1
Uninstalling torch-2.7.1:
  Successfully uninstalled torch-2.7.1
Found existing installation: pandas 2.3.0
Uninstalling pandas-2.3.0:
  Successfully uninstalled pandas-2.3.0
Found existing installation: datasets 3.6.0
Uninstalling datasets-3.6.0:
  Successfully uninstalled datasets-3.6.0
Found existing installation: nltk 3.9.1
Uninstalling nltk-3.9.1:
  Successfully uninstalled nltk-3.9.1


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting transformers==4.45.2
  Downloading transformers-4.45.2-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate==0.4.2
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Collecting torch==2.6.0
  Downloading torch-2.6.0-cp311-cp311-manylinux1_x86_64.whl.metadata (28 kB)
Collecting pandas==2.2.2
  Downloading pandas-2.2.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
Collecting datasets==3.0.1
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting nltk==3.9.1
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting tokenizers<0.21,>=0.20 (from transformers==4.45.2)
  Downloading tokenizers-0.20.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch==2.6.0)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-

RuntimeError: Failed to import transformers.pipelines because of the following error (look up to see its traceback):
cannot import name 'get_file_from_repo' from 'transformers.utils' (/usr/local/lib/python3.11/dist-packages/transformers/utils/__init__.py)

In [6]:
import pandas as pd

def clean_invalid_questions():
    files = ['train_cleaned.csv', 'val_cleaned.csv']
    for file in files:
        try:
            df = pd.read_csv(f"/kaggle/working/{file}")
            print(f"\nProcessing {file}: {len(df)} rows")
            # Remove rows where 'Question' is not a valid question
            df = df[df['Question'].str.contains(r'[a-zA-Z\s]+[?]', na=False)]
            print(f"After removing invalid questions: {len(df)} rows")
            # Save cleaned file
            df.to_csv(f"/kaggle/working/{file}", index=False)
            print(f"Saved cleaned {file}")
            # Preview cleaned data
            print(f"Preview of cleaned {file}:")
            print(df.head(5))
        except Exception as e:
            print(f"Error processing {file}: {e}")

if __name__ == "__main__":
    clean_invalid_questions()


Processing train_cleaned.csv: 834 rows
After removing invalid questions: 834 rows
Saved cleaned train_cleaned.csv
Preview of cleaned train_cleaned.csv:
                                            Question  \
0  what are some nonrunning exercise options that...   
1  how is lowintensity exercise different from hi...   
2               can exercise improve brain function?   
3                 how much cardio should i be doing?   
4               which is better, walking or running?   

                                              Answer  
0  some nonrunning exercise options include using...  
1  lowintensity exercise involves moderate physic...  
2  yes. as blood pumps to the brain during physic...  
3  general recommendations of around 150 minutes ...  
4  walking and running are both excellent forms o...  

Processing val_cleaned.csv: 213 rows
After removing invalid questions: 213 rows
Saved cleaned val_cleaned.csv
Preview of cleaned val_cleaned.csv:
                                 

In [7]:
     import os
     import json

     dataset_dir = "/kaggle/working/fitness_qa_dataset"
     os.makedirs(dataset_dir, exist_ok=True)

     files = ['train_cleaned.csv', 'val_cleaned.csv']
     for file in files:
         src_path = f"/kaggle/working/{file}"
         dst_path = f"{dataset_dir}/{file}"
         if os.path.exists(src_path):
             os.system(f"cp {src_path} {dst_path}")
             print(f"Copied {file} to {dst_path}")
         else:
             print(f"Error: {file} not found")

     metadata = {
         "title": "Fitness QA Preprocessed Dataset",
         "id": "gatetekaggle/fitness-qa-preprocessed",  # Replace with your Kaggle username
         "licenses": [{"name": "CC0-1.0"}]
     }
     with open(f"{dataset_dir}/dataset-metadata.json", "w") as f:
         json.dump(metadata, f, indent=2)
     print("Created dataset-metadata.json")

     print("\nTo upload the dataset:")
     print("1. Go to https://www.kaggle.com/datasets/new")
     print("2. Click 'New Dataset' and upload /kaggle/working/fitness_qa_dataset/")
     print("3. Download from your Kaggle profile")


Copied train_cleaned.csv to /kaggle/working/fitness_qa_dataset/train_cleaned.csv
Copied val_cleaned.csv to /kaggle/working/fitness_qa_dataset/val_cleaned.csv
Created dataset-metadata.json

To upload the dataset:
1. Go to https://www.kaggle.com/datasets/new
2. Click 'New Dataset' and upload /kaggle/working/fitness_qa_dataset/
3. Download from your Kaggle profile


In [27]:
!pip cache purge
!pip uninstall -y transformers evaluate torch pandas datasets nltk fsspec torchvision torchaudio -q

Files removed: 48
[0m

In [35]:
!pip install --upgrade --force-reinstall \
  numpy==1.26.4 \
  pandas==2.2.2 \
  pyarrow==14.0.2 \
  scipy==1.13.0 \
  scikit-learn==1.3.2 \
  tensorflow==2.18.0 \
  matplotlib==3.8.0 \
  google-auth==2.38.0 \
  notebook==6.5.7 \
  requests==2.32.3 \
  fsspec==2025.3.2 \
  toolz==0.12.1 \
  packaging==24.0 \
  rich==13.7.1 \
  cryptography==43.0.3 \
  pyopenssl==24.2.1 \
  google-api-core==2.19.1


Collecting numpy==1.26.4
  Using cached numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Collecting pandas==2.2.2
  Using cached pandas-2.2.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
Collecting pyarrow==14.0.2
  Using cached pyarrow-14.0.2-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (3.0 kB)
Collecting scipy==1.13.0
  Using cached scipy-1.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
Collecting scikit-learn==1.3.2
  Using cached scikit_learn-1.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting tensorflow==2.18.0
  Using cached tensorflow-2.18.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Collecting matplotlib==3.8.0
  Using cached matplotlib-3.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.8 kB)
Collecting google-auth==2.38.0
  Using cached google_auth-2.38.0-py2.py3-none-any

In [31]:
import transformers
import evaluate
import torch
import torchvision

print("Transformers:", transformers.__version__)
print("Evaluate:", evaluate.__version__)
print("Torch:", torch.__version__)
print("Torchvision:", torchvision.__version__)

RuntimeError: Failed to import transformers.pipelines because of the following error (look up to see its traceback):
cannot import name 'get_file_from_repo' from 'transformers.utils' (/usr/local/lib/python3.11/dist-packages/transformers/utils/__init__.py)