## **✅ Import Dependencies**

In [None]:
import pandas as pd, os, json
from datasets import Dataset
from huggingface_hub import HfApi, HfFolder, login

## **✅ Step 1: Load and Clean the CSV**

In [None]:
# Login using e.g. `huggingface-cli login` to access this dataset
data = pd.read_csv("hf://datasets/Muhammad-Umer-Khan/FAQ_Dataset/BankFAQs.csv")

data.head()

Unnamed: 0,Question,Answer,Class
0,Do I need to enter ‘#’ after keying in my Card...,Please listen to the recorded message and foll...,security
1,What details are required when I want to perfo...,"To perform a secure IVR transaction, you will ...",security
2,How should I get the IVR Password if I hold a...,An IVR password can be requested only from the...,security
3,How do I register my Mobile number for IVR Pas...,Please call our Customer Service Centre and en...,security
4,How can I obtain an IVR Password,By Sending SMS request: Send an SMS 'PWD<space...,security


## **✅ Step 2: Convert Each Row into Mistral Format**

In [None]:
# Step 3: Define a function to format each row
def format_to_mistral_prompt(row):
    question = row["Question"].strip()
    answer = row["Answer"].strip()
    return {
        "text": f"<s>[INST] {question} [/INST] {answer} </s>"
    }

In [None]:
# Step 4: Apply formatting function to the whole DataFrame
formatted_data = data.apply(format_to_mistral_prompt, axis=1).tolist()

# Preview the first formatted example
print(formatted_data[0])

{'text': '<s>[INST] Do I need to enter ‘#’ after keying in my Card number/ Card expiry date/ CVV number [/INST] Please listen to the recorded message and follow the instructions while entering your card details. </s>'}


## **✅ Step 3: Convert to Hugging Face Dataset**

In [None]:
# Step 5: Convert list of formatted dicts into Hugging Face Dataset
dataset = Dataset.from_list(formatted_data)

# Step 6: Preview the first row from dataset
print(dataset[0])

{'text': '<s>[INST] Do I need to enter ‘#’ after keying in my Card number/ Card expiry date/ CVV number [/INST] Please listen to the recorded message and follow the instructions while entering your card details. </s>'}


In [None]:
# Step 7: Save the formatted dataset to a JSONL file (optional)
os.makedirs(os.path.join("..", "..", "PreprocessedFAQs"), exist_ok=True)

with open(os.path.join("..", "..", "PreprocessedFAQs", "formatted_support_faq.jsonl"), "w") as f:
    for item in formatted_data:
        json.dump(item, f)
        f.write("\n")

## **✅ Step 4: Push Data to HuggingFace Hub**

In [None]:
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

### **1. Push Original Data**

- **Create the Dataset Repository:**

In [None]:
# Initialize API
api = HfApi()
repo_id = "Muhammad-Umer-Khan/FAQ_Dataset"

# Create the dataset repository
api.create_repo(
    repo_id=repo_id,
    repo_type="dataset",
    private=False,  # Set to True if you want a private repo
    token=HfFolder.get_token()  # Your API token
)

RepoUrl('https://huggingface.co/datasets/Muhammad-Umer-Khan/FAQ_Dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Muhammad-Umer-Khan/FAQ_Dataset')

- **Upload the Dataset: After creating the repository, run your original code to upload the dataset:**

In [None]:
# Push dataset folder to Hugging Face
api.upload_folder(
    folder_path=os.path.join("..", "..", "FAQS"),  # Local path to your dataset
    repo_id=repo_id,
    repo_type="dataset",
    token=HfFolder.get_token()  # Your API token
)

CommitInfo(commit_url='https://huggingface.co/datasets/Muhammad-Umer-Khan/FAQ_Dataset/commit/c125d38b8f2d99f3223fc0d1568a387e5dc3aa51', commit_message='Upload folder using huggingface_hub', commit_description='', oid='c125d38b8f2d99f3223fc0d1568a387e5dc3aa51', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Muhammad-Umer-Khan/FAQ_Dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Muhammad-Umer-Khan/FAQ_Dataset'), pr_revision=None, pr_num=None)

- **Check Out Dataset Here: [Click Here](https://huggingface.co/datasets/Muhammad-Umer-Khan/FAQ_Dataset)**

### **1. Push Mistral Formatted Data**

- **Create the Dataset Repository:**

In [None]:
# Initialize API
api = HfApi()
repo_id = "Muhammad-Umer-Khan/FAQs-Mistral-7b-v03-17k"

# Create the dataset repository
api.create_repo(
    repo_id=repo_id,
    repo_type="dataset",
    private=False,  # Set to True if you want a private repo
    token=HfFolder.get_token()  # Your API token
)

RepoUrl('https://huggingface.co/datasets/Muhammad-Umer-Khan/FAQs-Mistral-7b-v03-17k', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Muhammad-Umer-Khan/FAQs-Mistral-7b-v03-17k')

- **Upload the Dataset: After creating the repository, run your original code to upload the dataset:**

In [None]:
# Push dataset folder to Hugging Face
api.upload_folder(
    folder_path=os.path.join("..", "..", "PreprocessedFAQs"),  # Local path to your dataset
    repo_id=repo_id,
    repo_type="dataset",
    token=HfFolder.get_token()  # Your API token
)

CommitInfo(commit_url='https://huggingface.co/datasets/Muhammad-Umer-Khan/FAQs-Mistral-7b-v03-17k/commit/8f2485dcf360fb2467d0c6e209669ada3dda626a', commit_message='Upload folder using huggingface_hub', commit_description='', oid='8f2485dcf360fb2467d0c6e209669ada3dda626a', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Muhammad-Umer-Khan/FAQs-Mistral-7b-v03-17k', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Muhammad-Umer-Khan/FAQs-Mistral-7b-v03-17k'), pr_revision=None, pr_num=None)

- **Check Out Dataset Here: [Click Here](https://huggingface.co/datasets/Muhammad-Umer-Khan/FAQs-Mistral-7b-v03-17k)**