In [7]:
from datasets import load_dataset
from transformers import AutoTokenizer

In [8]:
tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/deepseek-moe-16b-base")

In [31]:
# gsm8k

dataset = load_dataset("openai/gsm8k", 'main')

# Create a txt file with all the dataset in the specified format
with open("gsm8k_dataset.txt", "w") as f:
    # # Process training data
    # for item in dataset["train"]:
    #     f.write(item["question"] + "\n")
    #     f.write(item["answer"] + "\n\n")
    
    # Process test data
    for item in dataset["test"]:
        f.write(item["question"] + "\n")
        f.write(item["answer"] + "\n\n")

print(f"Dataset saved to gsm8k_dataset.txt with {len(dataset['train'])} training examples and {len(dataset['test'])} test examples")

# Display a sample
dataset["train"][0]


Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

Dataset saved to gsm8k_dataset.txt with 7473 training examples and 1319 test examples


{'question': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?',
 'answer': 'Natalia sold 48/2 = <<48/2=24>>24 clips in May.\nNatalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.\n#### 72'}

In [32]:
with open("gsm8k_dataset.txt", "r") as f:
    text = f.read()
    tokens = tokenizer.encode(text)
    print(len(tokens))

252002


In [36]:
# for aime 1983 - 2024
# form csv file to txt and the format is ID,Year,Problem Number,Question,Answer,Part

import pandas as pd
import os

# Load the AIME CSV file
# Assuming the file is in the current directory or you need to specify the path
aime_csv_path = "AIME_Dataset_1983_2024 (1).csv"  # Update this path if needed

if os.path.exists(aime_csv_path):
    # Read the CSV file
    df = pd.read_csv(aime_csv_path)
    
    # Create a text file with just the problems
    with open("aime_problems.txt", "w") as f:
        for _, row in df.iterrows():
            # Write the question to the file
            f.write(f"{row['Question']}\n")
    
    print(f"AIME problems extracted to aime_problems.txt with {len(df)} problems")
else:
    print(f"CSV file not found at {aime_csv_path}")


AIME problems extracted to aime_problems.txt with 933 problems


In [38]:
with open("aime_problems.txt", "r") as f:
    text = f.read()
    tokens = tokenizer.encode(text)
    print(len(tokens))

105050


In [9]:
# Process the arXiv metadata JSON file to extract titles and abstracts (random 10k papers)
import json
import random

# Path to the arXiv metadata JSON file
arxiv_json_path = "arxiv-metadata-oai-snapshot.json"
output_txt_path = "arxiv_title_abstract.txt"

# First, count total number of papers in the file
total_papers = 0
with open(arxiv_json_path, "r") as json_file:
    for _ in json_file:
        total_papers += 1
print(f"Total papers in dataset: {total_papers}")

# Generate 10k random indices without repetition
sample_size = min(500, total_papers)
selected_indices = set(random.sample(range(total_papers), sample_size))

# Process the JSON file, only extracting papers at the selected indices
with open(arxiv_json_path, "r") as json_file, open(output_txt_path, "w") as txt_file:
    processed_count = 0
    for idx, line in enumerate(json_file):
        if idx in selected_indices:
            try:
                # Parse the JSON object
                paper = json.loads(line.strip())
                
                # Extract title and abstract
                title = paper.get("title", "").strip()
                abstract = paper.get("abstract", "").strip()
                
                # Write to the output file
                if title and abstract:
                    txt_file.write(f"{title}\n{abstract}\n\n")
                
                processed_count += 1
                if processed_count % 1000 == 0:
                    print(f"Processed {processed_count} papers")
                    
            except json.JSONDecodeError:
                print(f"Error parsing JSON at index {idx}")
                continue
            except Exception as e:
                print(f"Error processing paper at index {idx}: {str(e)}")
                continue

print(f"Completed processing {processed_count} random papers. Output saved to {output_txt_path}")

# Check the token count of the resulting file
with open(output_txt_path, "r") as f:
    text = f.read()
    tokens = tokenizer.encode(text)
    print(f"Number of tokens in the arXiv title+abstract file: {len(tokens)}")


Total papers in dataset: 2694879
Completed processing 500 random papers. Output saved to arxiv_title_abstract.txt
Number of tokens in the arXiv title+abstract file: 111068


In [None]:
load_arxiv = load_dataset("armanc/scientific_papers", "arxiv", trust_remote_code=True)

In [20]:
print(load_arxiv)

DatasetDict({
    train: Dataset({
        features: ['article', 'abstract', 'section_names'],
        num_rows: 203037
    })
    validation: Dataset({
        features: ['article', 'abstract', 'section_names'],
        num_rows: 6436
    })
    test: Dataset({
        features: ['article', 'abstract', 'section_names'],
        num_rows: 6440
    })
})


In [42]:
# Save 1000 articles from the test set
import os
from tqdm import tqdm

# Define the output directory and file
output_dir = "arxiv_samples"
output_file = os.path.join(output_dir, "arxiv_test_1000.txt")

# Create the directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Get the test set
test_set = load_arxiv["test"]


sample_size = 25
test_samples = test_set.select(range(min(sample_size, len(test_set))))

# Save the articles to a text file
with open(output_file, "w", encoding="utf-8") as f:
    for i, article in enumerate(tqdm(test_samples, desc="Saving test articles")):
        article_text = article.get("article", "").strip()
        abstract = article.get("abstract", "").strip()
        
        if article_text and abstract:
            f.write(f"Article: {article_text}\n\n")
print(f"Saved {sample_size} articles from the test set to {output_file}")

# Check the token count of the resulting file
with open(output_file, "r", encoding="utf-8") as f:
    text = f.read()
    tokens = tokenizer.encode(text)
    print(f"Number of tokens in the saved test articles: {len(tokens)}")


Saving test articles: 100%|██████████| 25/25 [00:00<00:00, 10174.42it/s]


Saved 25 articles from the test set to arxiv_samples/arxiv_test_1000.txt
Number of tokens in the saved test articles: 197352


In [30]:
load_arxiv["test"][10]

{'article': 'entanglement @xcite in a composite system refers to certain implicit correlation between the subsystems arising from their interaction .\nit is the key resource of quantum computation and quantum information processing @xcite .\ndue to recent advances in this field , entanglement has generated renewed interest .\nthere have been different approaches to understand and to quantify entanglement @xcite .\nbut so far the entanglement , only in a bipartite pure state has been investigated very extensively .\nthe von neumann entropy @xcite of either of the subsystems provides a good measure of entanglement in this case @xcite .\nthis is the quantum partner of the shannon s entropy @xcite in classical information theory and is defined as @xcite @xmath1 where @xmath2 . here , @xmath3 is the reduced density operator of the subsystem @xmath4 and is given by @xmath5 where @xmath6 is the density operator of the composite system under consideration and @xmath7 , @xmath8 .\nin general , 

In [15]:
# Load data from the parquet file and save content to a text file
import pandas as pd
import os


# https://huggingface.co/datasets/codeparrot/github-code

# Define the file path
file_path = "/Users/idhantgulati/Documents/moe-interp/data-ext/full-datasets/train-00000-of-01126.parquet"
output_file = "code_samples.txt"

# Check if the file exists
if os.path.exists(file_path):
    # Load the parquet file
    df = pd.read_parquet(file_path)
    
    # Limit to first 1000 samples
    df = df.head(200)
    
    # Save path and content to a text file
    with open(output_file, "w", encoding="utf-8") as f:
        for index, row in df.iterrows():
            if 'path' in df.columns and 'content' in df.columns:
                path = row['path']
                content = row['content']
                f.write(f"{path}\n{content}\n\n")
    
    print(f"Saved {len(df)} entries to {output_file}")
else:
    print(f"File not found: {file_path}")


Saved 200 entries to code_samples.txt


In [16]:
with open("code_samples.txt", "r") as f:
    text = f.read()
    tokens = tokenizer.encode(text)
    print(len(tokens))

384815


In [11]:
# Process the valid.json file to create a context-question format text file
import json
import os

# Define the input and output file paths
valid_json_path = "valid.json"
output_file = "context_question_pairs.txt"

# Check if the file exists
if os.path.exists(valid_json_path):
    # Load the JSON file
    with open(valid_json_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    
    # Open the output file for writing
    with open(output_file, "w", encoding="utf-8") as f:
        # Iterate through the data
        for item in data.get("data", []):
            title = item.get("title", "")
            
            for paragraph in item.get("paragraphs", []):
                context = paragraph.get("context", "")
                
                # Write the context
                f.write(f"\n{context}\n")
                
                # Process questions and answers
                for qa in paragraph.get("qas", []):
                    question = qa.get("question", "")
                    
                    # Write the question
                    f.write(f"{question}\n")
                    
    
    print(f"Processed data saved to {output_file}")
else:
    print(f"File not found: {valid_json_path}")


Processed data saved to context_question_pairs.txt


In [15]:
with open("french-qa.txt", "r") as f:
    text = f.read()
    tokens = tokenizer.encode(text)
    print(len(tokens))

Token indices sequence length is longer than the specified maximum sequence length for this model (289517 > 16384). Running this sequence through the model will result in indexing errors


289517


In [23]:
import pandas as pd

# chinese
# https://huggingface.co/datasets/opencsg/chinese-fineweb-edu

# Load the parquet file
try:
    df = pd.read_parquet("00001.parquet")
    
    # Get the first 100 rows
    df_sample = df.head(275)
    
    # Save to a text file
    output_file = "parquet_sample.txt"
    
    # Write to text file with double newlines between each row
    with open(output_file, 'w', encoding='utf-8') as f:
        for index, row in df_sample.iterrows():
            f.write('\n\n'.join([str(value) for value in row.values]))
            f.write('\n\n')
    
    print(f"First 100 rows saved to {output_file}")
except FileNotFoundError:
    print("File not found: 00001.parquet")
except Exception as e:
    print(f"Error reading parquet file: {e}")


First 100 rows saved to parquet_sample.txt


In [24]:
with open("parquet_sample.txt", "r") as f:
    text = f.read()
    tokens = tokenizer.encode(text)
    print(len(tokens))

310000


In [28]:
import pandas as pd
import json

# https://huggingface.co/datasets/sedthh/gutenberg_english

# Try to read the parquet file and save the first 100 text entries to a json file
try:
    # Load the parquet file
    # df = pd.read_parquet("train-00000-of-00037-f5fce855b93d2d02.parquet")
    df = pd.read_parquet("train-00001-of-00037-9f227d74fc154ce9.parquet")
    
    # Get the first 100 rows, skipping the 9th
    df_sample = df.head(5)
    
    # Save TEXT column to a json file
    output_file = "first_100_texts.json"
    
    # Write to json file with each text entry as one line
    with open(output_file, 'w', encoding='utf-8') as f:
        for text in df_sample['TEXT']:
            json_line = json.dumps({"text": str(text)})
            f.write(json_line + '\n')  # Each JSON object on a separate line
    
    print(f"First 100 text entries saved to {output_file}")
    
    # Print the dataframe info for reference
    print("\nDataFrame Info:")
    print("Columns:", df.columns.tolist())
    print(f"Total entries: {len(df)}")
    
except FileNotFoundError:
    print("File not found: train-00000-of-00037-f5fce855b93d2d02.parquet")
except Exception as e:
    print(f"Error reading parquet file: {e}")


First 100 text entries saved to first_100_texts.json

DataFrame Info:
Columns: ['TEXT', 'SOURCE', 'METADATA']
Total entries: 1305


In [29]:
with open("first_100_texts1.txt", "r") as f:
    text = f.read()
    tokens = tokenizer.encode(text)
    print(len(tokens))

745010


In [4]:
    # Load the Gutenberg English dataset
    # Format: Parquet file with columns including 'TEXT' containing book content
    # The dataset is split into multiple files (37 shards)
    # Each file contains thousands of text entries from public domain books
    df = pd.read_parquet("train-00001-of-00037-9f227d74fc154ce9.parquet")
    
    # Print the first 2 entries to understand the format
    print("First 2 entries in the dataset:")
    print(df.head(2))

First 2 entries in the dataset:
                                                TEXT     SOURCE  \
0  HONORINE\r\n\r\n    \r\n\r\n\r\n      By Honor...  gutenberg   
1  THE MYSTERY OF “THE YELLOW ROOM”\r\n\r\n    \r...  gutenberg   

                                            METADATA  
0  {"language": "en", "text_id": 1683, "title": "...  
1  {"language": "en", "text_id": 1685, "title": "...  
