In [1]:
pip install transformers datasets scikit-learn

Collecting transformers
  Downloading transformers-4.46.3-py3-none-any.whl.metadata (44 kB)
Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.5.2-cp312-cp312-win_amd64.whl.metadata (13 kB)
Collecting huggingface-hub<1.0,>=0.23.2 (from transformers)
  Downloading huggingface_hub-0.26.2-py3-none-any.whl.metadata (13 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.11.6-cp312-cp312-win_amd64.whl.metadata (41 kB)
Collecting tokenizers<0.21,>=0.20 (from transformers)
  Downloading tokenizers-0.20.3-cp312-none-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.4.5-cp312-none-win_amd64.whl.metadata (3.9 kB)
Collecting tqdm>=4.27 (from transformers)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-18.0.0-cp312-cp312-win_amd64.whl.metadata (3.4 kB)


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torch 2.5.0 requires sympy==1.13.1; python_version >= "3.9", but you have sympy 1.13.2 which is incompatible.


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from transformers import pipeline
from datasets import load_dataset

# Step 1: Load Dataset
dataset = load_dataset("HuggingFaceTB/smoltalk", "all")
queries = dataset["train"]["query"]
contexts = dataset["train"]["retrieved_context"]
labels = dataset["train"]["label"]

# Step 2: Use TF-IDF for Retrieval
vectorizer = TfidfVectorizer(max_features=1000)
vectorizer.fit(contexts)

retrieved_contexts = []
for query in queries:
    query_vector = vectorizer.transform([query])
    similarity_scores = (query_vector @ vectorizer.transform(contexts).T).toarray().flatten()
    best_match_idx = similarity_scores.argmax()
    retrieved_contexts.append(contexts[best_match_idx])

# Step 3: Use RAG for Generated Answers
rag_pipeline = pipeline("question-answering", model="facebook/rag-token-base")
generated_answers = [
    rag_pipeline({"question": query, "context": context})["answer"]
    for query, context in zip(queries, retrieved_contexts)
]

# Step 4: Prepare Data for SVM
X = [" ".join([query, context]) for query, context in zip(queries, retrieved_contexts)]
X_tfidf = vectorizer.fit_transform(X)
y = labels

# Step 5: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Step 6: Train SVM Classifier
svm = SVC(kernel="linear", C=1.0)
svm.fit(X_train, y_train)

# Step 7: Evaluate SVM
y_pred = svm.predict(X_test)
print(classification_report(y_test, y_pred))

README.md:   0%|          | 0.00/9.03k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


train-00000-of-00009.parquet:   0%|          | 0.00/223M [00:00<?, ?B/s]

train-00001-of-00009.parquet:   0%|          | 0.00/223M [00:00<?, ?B/s]

train-00002-of-00009.parquet:   0%|          | 0.00/223M [00:00<?, ?B/s]

train-00003-of-00009.parquet:   0%|          | 0.00/223M [00:00<?, ?B/s]

train-00004-of-00009.parquet:   0%|          | 0.00/224M [00:00<?, ?B/s]

train-00005-of-00009.parquet:   0%|          | 0.00/222M [00:00<?, ?B/s]

train-00006-of-00009.parquet:   0%|          | 0.00/223M [00:00<?, ?B/s]

train-00007-of-00009.parquet:   0%|          | 0.00/224M [00:00<?, ?B/s]

train-00008-of-00009.parquet:   0%|          | 0.00/224M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/105M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1043917 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/54948 [00:00<?, ? examples/s]

KeyError: "Column query not in the dataset. Current columns in the dataset: ['messages', 'source']"

In [3]:
from datasets import load_dataset

# Load dataset
dataset = load_dataset("HuggingFaceTB/smoltalk", "all")
print(dataset["train"].column_names)


['messages', 'source']


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from transformers import pipeline
from datasets import load_dataset

# Step 1: Load Dataset
dataset = load_dataset("HuggingFaceTB/smoltalk", "all")
queries = dataset["train"]["messages"]  # Extract queries
contexts = dataset["train"]["source"]  # Extract contexts
labels = [0] * len(queries)  # Assign dummy binary labels if labels are not provided

# Step 2: Use TF-IDF for Retrieval
vectorizer = TfidfVectorizer(max_features=1000)
vectorizer.fit(contexts)

retrieved_contexts = []
for query in queries:
    query_vector = vectorizer.transform([query])
    similarity_scores = (query_vector @ vectorizer.transform(contexts).T).toarray().flatten()
    best_match_idx = similarity_scores.argmax()
    retrieved_contexts.append(contexts[best_match_idx])

# Step 3: Use RAG for Generated Answers
rag_pipeline = pipeline("question-answering", model="facebook/rag-token-base")
generated_answers = [
    rag_pipeline({"question": query, "context": context})["answer"]
    for query, context in zip(queries, retrieved_contexts)
]

# Step 4: Prepare Data for SVM
X = [" ".join([query, context]) for query, context in zip(queries, retrieved_contexts)]
X_tfidf = vectorizer.fit_transform(X)

# Step 5: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, labels, test_size=0.2, random_state=42)

# Step 6: Train SVM Classifier
svm = SVC(kernel="linear", C=1.0)
svm.fit(X_train, y_train)

# Step 7: Evaluate SVM
y_pred = svm.predict(X_test)
print(classification_report(y_test, y_pred))
