In [1]:
import pandas as pd
import os

def prepare_dataset(path, suffix='.lean'):
    """
    Prepares an unsupervised dataset from files with a specific suffix in the given directory and its subdirectories.
    
    Parameters:
    - path (str): The directory path to look for files.
    - suffix (str): The file suffix to filter by. Default is '.txt'.

    Returns:
    - pd.DataFrame: A DataFrame with a single 'text' column containing the content of each file.
    """

    # List for storing file contents
    texts = []

    # Walking through the directory and its subdirectories
    for dirpath, _, filenames in os.walk(path):
        for filename in filenames:
            if filename.endswith(suffix):
                with open(os.path.join(dirpath, filename), 'r', encoding='utf-8', errors='ignore') as file:
                    content = file.read()
                    texts.append(content)

    # Convert list of texts to a pandas DataFrame
    df = pd.DataFrame(texts, columns=['text'])

    return df

# Example usage
# path = input("Enter the path: ")
path = '../../../lean/mathlib4/Mathlib/'
# suffix = input("Enter the file suffix (default is .txt): ")
# if not suffix:
#     suffix = '.txt'

df = prepare_dataset(path, "lean")


                                                text
0  import Mathlib.Tactic.Abel\nimport Mathlib.Tac...
1  /-\nCopyright (c) 2020 Thomas Browning and Pat...
2  /-\nCopyright (c) 2021 Anne Baanen. All rights...
3  /-\nCopyright (c) 2018 Kenny Lau. All rights r...
4  /-\nCopyright (c) 2019 Johan Commelin. All rig...


In [18]:
from transformers import LlamaForCausalLM, CodeLlamaTokenizer, BitsAndBytesConfig

In [20]:
import pandas as pd
import os
from transformers import AutoTokenizer

def prepare_next_token_dataset(path, tokenizer, suffix='.txt', MAX_CONTENT_LENGTH=512, MIN_LENGTH=128):
    """
    Prepares a dataset for next token prediction from files with a specific suffix in the given directory and its subdirectories.

    Parameters:
    - path (str): The directory path to look for files.
    - tokenizer: Tokenizer from the transformers library.
    - suffix (str): The file suffix to filter by. Default is '.txt'.
    - MAX_CONTENT_LENGTH (int): The maximum length for the input content.

    Returns:
    - pd.DataFrame: A DataFrame with 'input' and 'target' columns for next token prediction.
    """

    # Lists for storing input sequences and target tokens
    inputs = []
    targets = []

    # Walking through the directory and its subdirectories
    for dirpath, _, filenames in os.walk(path):
        for filename in filenames:
            if filename.endswith(suffix):
                with open(os.path.join(dirpath, filename), 'r', encoding='utf-8', errors='ignore') as file:
                    content = file.read()
                    # Tokenize the content using the provided tokenizer
                    tokens = tokenizer.tokenize(content)
                    for i in range(0, len(tokens) - MIN_LENGTH - 1):  # -1 to leave space for the target token
                        input_sequence = tokens[i:min(i + MAX_CONTENT_LENGTH, len(tokens))]
                        target_token = tokens[min(i + MAX_CONTENT_LENGTH, len(tokens) - 1)]
                        inputs.append(tokenizer.convert_tokens_to_string(input_sequence))
                        targets.append(target_token)

    # Convert lists of inputs and targets to a pandas DataFrame
    df = pd.DataFrame({'input': inputs, 'target': targets})

    return df

# Example usage
# path = input("Enter the path: ")
# suffix = input("Enter the file suffix (default is .txt): ")
# if not suffix:
#     suffix = '.txt'

# You can initialize the tokenizer here. For example, for BERT:
tokenizer = CodeLlamaTokenizer.from_pretrained("codellama/CodeLlama-7b-hf")

path = '../../../lean/mathlib4/Mathlib/'

MAX_CONTENT_LENGTH=2_048
df = prepare_next_token_dataset(path, tokenizer, ".lean", MAX_CONTENT_LENGTH, MIN_LENGTH=128)
print(df.head())

# If you want to save the DataFrame to a CSV:
# df.to_csv('next_token_dataset.csv', index=False)


In [17]:
df

Unnamed: 0,input,target
0,▁import,▁Math
1,▁Math,lib
2,lib,.
3,.,T
4,T,actic
...,...,...
23008322,<0x0A>,<0x0A>
23008323,<0x0A>,end
23008324,end,▁First
23008325,▁First,Order


In [13]:
len(df.get("text"))

3528

In [1]:
!pip install fastai

Defaulting to user installation because normal site-packages is not writeable
[33mDEPRECATION: pytorch-lightning 1.6.5 has a non-standard dependency specifier torch>=1.8.*. pip 23.3 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pytorch-lightning or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m

In [2]:
from fastai.text import TextBlock

ImportError: cannot import name 'TextBlock' from 'fastai.text' (/home/lev/.local/lib/python3.9/site-packages/fastai/text/__init__.py)

In [18]:
import fastai
print(fastai.__version__)
from fastai.text import *
from fastai.text.data import TextBlock, TextDataLoaders
from pathlib import Path

# Path to your data
path = Path('../../../lean/mathlib4/Mathlib/')

# Create a DataBunch suitable for language modeling
data_lm = TextBlock.from_folder(path, extensions=['.lean'], is_lm=True)
          #  .filter_by_folder(include=['.lean'])  # Include all files in the root directory of 'path'
          #  .split_by_rand_pct(0.1)  # Randomly split 10% of data for validation
          #  .label_for_lm()  # Use this method for a language model
          #  .databunch(bs=64, num_workers=1))

# data_lm.save('data_lm.pkl')
# print(data_lm).
# dls = data_lmh.dataloaders(df, bs=64)
data_lm.batch_tfms

2.7.12


(#0) []

In [24]:
# https://docs.fast.ai/text.data.html#textdataloaders.from_folder
TextDataLoaders.from_folder(path, train='train', valid='valid')

TypeError: 'NoneType' object is not iterable