In [1]:
# Install required packages
!pip install sentence-transformers scikit-learn

import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

print(" Setup completed!")


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-trans

2025-07-20 13:24:26.358205: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753017866.529297      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753017866.584263      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


 Setup completed!


In [2]:
# Load data with correct column names
df = pd.read_csv('/kaggle/input/sample-rag-knowledge-item-dataset/rag_sample_qas_from_kis.csv')

# Use correct column names
articles = df['ki_text'].tolist()
topics = df['ki_topic'].tolist()
questions = df.iloc[:, 2].tolist()  # Third column
ground_truths = df.iloc[:, 3].tolist()  # Fourth column

print(f" Loaded {len(articles)} articles")
print(f"Columns: {df.columns.tolist()}")


 Loaded 10 articles
Columns: ['ki_topic', 'ki_text', 'sample_question', 'sample_ground_truth']


In [3]:
def chunk_text(text, chunk_size=200, overlap=50):
    words = text.split()
    chunks = []
    for i in range(0, len(words), chunk_size - overlap):
        chunk = ' '.join(words[i:i + chunk_size])
        if len(chunk.strip()) > 50:
            chunks.append(chunk)
    return chunks

# Process articles into chunks
processed_chunks = []
for idx, article in enumerate(articles):
    chunks = chunk_text(article)
    for chunk_idx, chunk in enumerate(chunks):
        processed_chunks.append({
            'article_id': idx,
            'chunk_id': chunk_idx,
            'text': chunk,
            'topic': topics[idx]
        })

chunk_texts = [chunk['text'] for chunk in processed_chunks]
print(f" Created {len(chunk_texts)} chunks")


 Created 30 chunks


In [4]:
class SimpleKeywordExtractor:
    def __init__(self):
        self.tfidf = TfidfVectorizer(max_features=1000, ngram_range=(1, 3), stop_words='english')
        self.embedder = SentenceTransformer('all-MiniLM-L6-v2')
    
    def extract_tfidf_keywords(self, texts, top_k=10):
        tfidf_matrix = self.tfidf.fit_transform(texts)
        feature_names = self.tfidf.get_feature_names_out()
        mean_scores = np.mean(tfidf_matrix.toarray(), axis=0)
        top_indices = np.argsort(mean_scores)[-top_k:][::-1]
        return [(feature_names[i], mean_scores[i]) for i in top_indices]
    
    def extract_semantic_keywords(self, text, top_k=10):
        from sklearn.feature_extraction.text import CountVectorizer
        vectorizer = CountVectorizer(ngram_range=(1, 3), stop_words='english')
        count_matrix = vectorizer.fit_transform([text])
        candidates = vectorizer.get_feature_names_out()
        
        if len(candidates) == 0:
            return []
        
        doc_embedding = self.embedder.encode([text])
        candidate_embeddings = self.embedder.encode(list(candidates))
        similarities = cosine_similarity(doc_embedding, candidate_embeddings)[0]
        
        keyword_scores = list(zip(candidates, similarities))
        keyword_scores.sort(key=lambda x: x[1], reverse=True)
        return keyword_scores[:top_k]
    
    def extract_frequency_keywords(self, text, top_k=10):
        words = re.findall(r'\b[a-z]{4,}\b', text.lower())
        word_freq = Counter(words)
        
        # Simple stop words
        stop_words = {'this', 'that', 'with', 'have', 'will', 'from', 'they', 'been', 'their', 'what', 'there', 'other', 'which', 'some'}
        
        # Fixed filtering - clear syntax
        filtered_keywords = []
        for word, freq in word_freq.items():
            if freq > 1 and word not in stop_words:
                filtered_keywords.append((word, freq))
        
        filtered_keywords.sort(key=lambda x: x[1], reverse=True)
        return filtered_keywords[:top_k]

# Initialize and test
extractor = SimpleKeywordExtractor()

# Test all methods
combined_text = ' '.join(chunk_texts)

print("Keyword Extraction Results:")
print("=" * 40)

# TF-IDF
tfidf_keywords = extractor.extract_tfidf_keywords(chunk_texts)
print("\n TF-IDF Keywords:")
for i, (keyword, score) in enumerate(tfidf_keywords[:7]):
    print(f"   {i+1}. {keyword}: {score:.4f}")

# Semantic
semantic_keywords = extractor.extract_semantic_keywords(combined_text)
print("\n SEMANTIC Keywords:")
for i, (keyword, score) in enumerate(semantic_keywords[:7]):
    print(f"   {i+1}. {keyword}: {score:.4f}")

# Frequency
freq_keywords = extractor.extract_frequency_keywords(combined_text)
print("\n FREQUENCY Keywords:")
for i, (keyword, score) in enumerate(freq_keywords[:7]):
    print(f"   {i+1}. {keyword}: {score}")

print("\n All methods working!")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Keyword Extraction Results:

 TF-IDF Keywords:
   1. email: 0.0888
   2. step: 0.0653
   3. company: 0.0600
   4. network: 0.0459
   5. issues: 0.0454
   6. backup: 0.0438
   7. office: 0.0437


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/141 [00:00<?, ?it/s]


 SEMANTIC Keywords:
   1. connect company email: 0.7078
   2. email account mobile: 0.6976
   3. synchronize company email: 0.6677
   4. access company email: 0.6516
   5. company email server: 0.6441
   6. device sync email: 0.6412
   7. company email account: 0.6391

 FREQUENCY Keywords:
   1. your: 140
   2. step: 89
   3. email: 71
   4. company: 63
   5. device: 38
   6. issues: 38
   7. settings: 31

 All methods working!


In [5]:
class SimpleRAG:
    def __init__(self, chunks):
        self.chunks = chunks
        self.texts = [chunk['text'] for chunk in chunks]
        self.embedder = SentenceTransformer('all-MiniLM-L6-v2')
        
        print(" Creating embeddings...")
        self.embeddings = self.embedder.encode(self.texts, show_progress_bar=True)
        print("RAG system ready!")
    
    def search(self, query, top_k=5):
        query_embedding = self.embedder.encode([query])
        similarities = cosine_similarity(query_embedding, self.embeddings)[0]
        
        top_indices = np.argsort(similarities)[-top_k:][::-1]
        
        results = []
        for idx in top_indices:
            results.append({
                'text': self.texts[idx],
                'similarity': similarities[idx],
                'topic': self.chunks[idx]['topic'],
                'article_id': self.chunks[idx]['article_id']
            })
        
        return results
    
    def rag_pipeline(self, query):
        search_results = self.search(query)
        context = ' '.join([r['text'] for r in search_results])
        keywords = extractor.extract_semantic_keywords(context, top_k=8)
        
        return {
            'query': query,
            'context': context,
            'keywords': keywords,
            'results': search_results,
            'confidence': np.mean([r['similarity'] for r in search_results])
        }

# Initialize RAG
rag_system = SimpleRAG(processed_chunks)

# Test the system
test_query = questions[0]
result = rag_system.rag_pipeline(test_query)

print(f" Query: {result['query']}")
print(f" Confidence: {result['confidence']:.3f}")
print("Keywords:")
for kw, score in result['keywords'][:5]:
    print(f"   • {kw}: {score:.3f}")

print(" RAG system working!")


 Creating embeddings...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

RAG system ready!


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/27 [00:00<?, ?it/s]

 Query: "How do I set up my company email on my mobile device?"
 Confidence: 0.678
Keywords:
   • connect company email: 0.708
   • email account mobile: 0.698
   • use company email: 0.674
   • synchronize company email: 0.668
   • access company email: 0.652
 RAG system working!


In [6]:
# Simple evaluation
print(" Final Evaluation")
print("=" * 30)

all_results = []
for i, question in enumerate(questions[:5]):
    result = rag_system.rag_pipeline(question)
    all_results.append(result)
    print(f"{i+1}. Confidence: {result['confidence']:.3f}")

avg_confidence = np.mean([r['confidence'] for r in all_results])
print(f"\n Average Confidence: {avg_confidence:.3f}")



 Final Evaluation


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/27 [00:00<?, ?it/s]

1. Confidence: 0.678


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/33 [00:00<?, ?it/s]

2. Confidence: 0.481


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/33 [00:00<?, ?it/s]

3. Confidence: 0.583


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/29 [00:00<?, ?it/s]

4. Confidence: 0.529


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/35 [00:00<?, ?it/s]

5. Confidence: 0.537

 Average Confidence: 0.562
