In [None]:
# Install the extra packages this notebook needs into the kernel's environment.
# NOTE(review): versions are unpinned; consider pinning them for reproducible runs.
%pip install python-dotenv
%pip install pypandoc
%pip install tiktoken
%pip install unstructured

In [None]:
import os
import re
import shutil
import requests
import pandas as pd
from tqdm import tqdm
from langchain_community.document_loaders import UnstructuredRSTLoader
from langchain.text_splitter import TokenTextSplitter
from glob import glob
from transformers import AutoTokenizer
from dotenv import load_dotenv
import boto3
# Local working directory: the downloaded archive and the generated dataset.csv
# are both written under this path.
DATA_PATH = "./data"

# S3 bucket that receives the generated dataset in the upload cell.
SAGEMAKER_S3_BUCKET = "sagemaker-studio-412356575323-guqzsetziqj"

In [None]:
# Download the Airflow source release tarball into DATA_PATH.
AIRFLOW_VERSION = "2.9.1"
url = f"https://github.com/apache/airflow/releases/download/{AIRFLOW_VERSION}/apache-airflow-{AIRFLOW_VERSION}-source.tar.gz"
airflow_zip_path = os.path.join(DATA_PATH, f"apache-airflow-{AIRFLOW_VERSION}-source.tar.gz")

# On a fresh checkout the data directory does not exist yet; create it so the
# open() below cannot fail after the download has already succeeded.
os.makedirs(DATA_PATH, exist_ok=True)

# stream=True writes the archive to disk chunk by chunk instead of buffering it
# all in memory; the (connect, read) timeout keeps a stalled connection from
# hanging the kernel indefinitely.
with requests.get(url, stream=True, timeout=(10, 60)) as response:
    if response.status_code == 200:
        with open(airflow_zip_path, 'wb') as file:
            for chunk in response.iter_content(chunk_size=1 << 20):
                file.write(chunk)
        print(f"Successfully downloaded airflow {AIRFLOW_VERSION} source code to {airflow_zip_path}.")
    else:
        print(f"Failed to download the repository: {response.status_code}")
        raise Exception(f"Download failed with status code: {response.status_code}")

In [None]:
# Pull variables from a local .env file into the process environment, then read
# the Hugging Face token from it (None when HF_API_TOKEN is not set).
load_dotenv()
huggingface_token = os.environ.get('HF_API_TOKEN')

In [None]:
# Build the text dataset: unpack the Airflow sources, load every .rst file under
# docs/, normalise the text, split it into token-sized blocks and write the kept
# blocks (each suffixed with the tokenizer's EOS token) to dataset.csv.
CHUNK_SIZE = 256  # chunk size passed to the splitter and threshold for the length filter below

# Extract into DATA_PATH (rather than a second "./data" literal) so the location
# stays in step with airflow_docs_path, which is derived from DATA_PATH.
shutil.unpack_archive(airflow_zip_path, DATA_PATH, "gztar")
airflow_docs_path = os.path.join(DATA_PATH, f"apache-airflow-{AIRFLOW_VERSION}", "docs")

# glob order depends on the filesystem; sort so the dataset rows are reproducible.
files = sorted(glob(f"{airflow_docs_path}/**/*.rst", recursive=True))
if not files:
    # Fail fast instead of silently writing (and later uploading) an empty dataset.
    raise FileNotFoundError(f"No .rst files found under {airflow_docs_path}")

tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2", token=huggingface_token)
token_splitter = TokenTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=0)

all_text = []
for filepath in tqdm(files):
    rst_loader = UnstructuredRSTLoader(filepath, mode="single")
    docs = rst_loader.load()
    for doc in docs:
        # Collapse every whitespace run (newlines included) to a single space.
        text = re.sub(r'\s+', ' ', doc.page_content)
        # Remove punctuation: anything that is neither a word character nor whitespace.
        text = re.sub(r'[^\w\s]', '', text)
        for block in token_splitter.split_text(text):
            token_length = len(tokenizer.encode(block))
            # NOTE(review): only blocks whose length under `tokenizer` EXCEEDS
            # CHUNK_SIZE are kept; shorter blocks are dropped. The splitter and
            # `tokenizer` presumably count tokens differently, so this may be a
            # deliberate "drop short tail chunks" filter -- confirm the intended
            # comparison direction.
            if token_length > CHUNK_SIZE:
                all_text.append(block + tokenizer.eos_token)

result = pd.Series(all_text)
result.to_csv(os.path.join(DATA_PATH, "dataset.csv"), index=False)

In [None]:
# Upload the generated dataset CSV to the SageMaker bucket.
local_dataset_path = os.path.join(DATA_PATH, "dataset.csv")
s3 = boto3.Session().resource("s3")
s3.meta.client.upload_file(
    local_dataset_path,
    SAGEMAKER_S3_BUCKET,
    'zephyrus/data/airflow_dataset.csv',
)