# -----------------------------
# 1. Install Dependencies
# -----------------------------

In [1]:
!pip install --quiet faiss-cpu transformers datasets torch wget

# -----------------------------
# 2. Import Libraries
# -----------------------------

In [19]:
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import torch
import wget
from mpl_toolkits.mplot3d import Axes3D
from sklearn.manifold import TSNE

from transformers import (
    DPRContextEncoder, DPRContextEncoderTokenizer,
    DPRQuestionEncoder, DPRQuestionEncoderTokenizer,
    AutoTokenizer, AutoModelForCausalLM
)

import faiss
import warnings
warnings.filterwarnings("ignore")


import warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

from transformers.utils import logging
logging.set_verbosity_error()

# -----------------------------
# 3. Download & Preprocess Document
# -----------------------------

In [5]:
filename = 'companyPolicies.txt'
url = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/6JDbUb_L3egv_eOkouY71A.txt'
# wget.download() does not overwrite an existing file: it saves the new copy
# under a " (1)"-style suffixed name instead. Re-running this cell would
# therefore pile up duplicates while the rest of the notebook keeps reading
# the original file, so only download when the file is not there yet.
if not os.path.exists(filename):
    wget.download(url, out=filename)

def read_and_split_text(filename):
    """Read a UTF-8 text file and return its non-blank lines.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list of strings, one per line of the file, with leading and
        trailing whitespace removed. Lines that are empty or contain only
        whitespace are left out.
    """
    kept_lines = []
    with open(filename, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            cleaned = raw_line.strip()
            if cleaned:
                kept_lines.append(cleaned)
    return kept_lines

# Load the document as a list of stripped, non-blank lines. Each entry is
# treated as one "paragraph" by the cells below that use `paragraphs`.
paragraphs = read_and_split_text(filename)

# -----------------------------
# 4. t-SNE Visualization
# -----------------------------

In [6]:
def tsne_plot(data):
    """Reduce `data` to three dimensions with t-SNE and show a 3D scatter plot.

    Every row of `data` becomes one point, labelled in the legend with its
    row index. The t-SNE run uses a fixed random_state of 42.

    Args:
        data: Object exposing `.shape` that TSNE.fit_transform accepts, one
            row per item (presumably the 2-D embedding array built later in
            this notebook; confirm against the caller).

    Returns:
        None. The figure is displayed with plt.show().
    """
    sample_count = data.shape[0]
    # Perplexity is capped at one less than the number of rows.
    # NOTE(review): with fewer than two rows this cap is <= 0; confirm that
    # callers never pass such input.
    reducer = TSNE(n_components=3, random_state=42, perplexity=min(30, sample_count - 1))
    projected = reducer.fit_transform(data)

    # One tab20 colour per point, sampled evenly across the colormap.
    palette = plt.cm.tab20(np.linspace(0, 1, len(projected)))

    figure = plt.figure(figsize=(10, 7))
    axes = figure.add_subplot(111, projection='3d')

    # Each point is drawn separately so that it gets its own legend entry.
    for row_idx, (coords, shade) in enumerate(zip(projected, palette)):
        axes.scatter(coords[0], coords[1], coords[2], label=str(row_idx), color=shade)

    axes.set_xlabel('TSNE Component 1')
    axes.set_ylabel('TSNE Component 2')
    axes.set_zlabel('TSNE Component 3')
    axes.set_title('3D t-SNE Visualization')
    axes.legend(title='Paragraph Index')
    plt.show()

# -----------------------------
# 5. Encode Contexts with DPR
# -----------------------------

In [20]:
context_tokenizer = DPRContextEncoderTokenizer.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')
context_encoder = DPRContextEncoder.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')

def encode_contexts(text_list):
    """Embed each text with the DPR context encoder.

    Texts are encoded one at a time; anything longer than 256 tokens is
    truncated by the tokenizer.

    Args:
        text_list: Iterable of strings to embed. Must contain at least one
            item, because torch.cat cannot concatenate an empty list.

    Returns:
        A NumPy array holding the concatenated `pooler_output` of every
        text, in input order (one row per text).
    """
    embeddings = []
    # Inference only: without no_grad() every forward pass records an
    # autograd graph that stays alive (via the tensors collected below)
    # until the final conversion, wasting memory and time.
    with torch.no_grad():
        for text in text_list:
            inputs = context_tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=256)
            outputs = context_encoder(**inputs)
            embeddings.append(outputs.pooler_output)
    # Tensors created under no_grad() carry no gradient history, so they can
    # be converted to NumPy directly.
    return torch.cat(embeddings).numpy()

context_embeddings = encode_contexts(paragraphs)


# -----------------------------
# 6. Index with FAISS
# -----------------------------

In [11]:
# Build a flat L2 FAISS index over the paragraph embeddings. Row i of the
# index corresponds to row i of `context_embeddings`.
# NOTE(review): DPR encoders are normally paired with inner-product
# similarity; confirm that L2 distance is the intended metric here.
embedding_dim = context_embeddings.shape[1]
context_vectors_f32 = context_embeddings.astype('float32')  # the index is fed float32 vectors
index = faiss.IndexFlatL2(embedding_dim)
index.add(context_vectors_f32)

# -----------------------------
# 7. Encode Questions
# -----------------------------

In [21]:
question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained('facebook/dpr-question_encoder-single-nq-base')
question_encoder = DPRQuestionEncoder.from_pretrained('facebook/dpr-question_encoder-single-nq-base')

def search_relevant_contexts(question, k=5):
    """Embed a question with the DPR question encoder and query the FAISS index.

    Args:
        question: The question text.
        k: Number of nearest neighbours to request from `index`.

    Returns:
        The tuple `(D, I)` returned by `index.search`: the distances and the
        positions of the matched vectors in the index. Both have one row for
        the single query. When the index holds fewer than `k` vectors, FAISS
        fills the missing slots of `I` with -1, so callers should skip
        negative entries before indexing into a list.
    """
    # truncation=True keeps an over-long question within the tokenizer's
    # maximum input length instead of overflowing the encoder.
    question_inputs = question_tokenizer(question, return_tensors='pt', truncation=True)
    # Inference only: skip building an autograd graph for the forward pass.
    with torch.no_grad():
        question_embedding = question_encoder(**question_inputs).pooler_output.numpy()
    distances, indices = index.search(question_embedding, k)
    return distances, indices

# -----------------------------
# 8. GPT-2 Answer Generation
# -----------------------------

In [13]:
tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
# GPT-2 ships without a pad token, so tokenizer.pad_token_id is None and
# assigning it would leave generation without a pad id. Fall back to the
# end-of-sequence id, the usual choice for open-ended GPT-2 generation.
model.generation_config.pad_token_id = (
    tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id
)

def generate_answer(question, contexts, max_new_tokens=100):
    """Generate a continuation of the question plus retrieved contexts with GPT-2.

    The prompt is the question followed by all contexts, joined with single
    spaces. Generation uses beam search with 4 beams and early stopping.

    Args:
        question: The question text.
        contexts: Iterable of context strings appended after the question.
        max_new_tokens: Maximum number of tokens to generate after the prompt.

    Returns:
        The decoded output sequence with special tokens removed. The sequence
        returned by `generate` starts with the prompt tokens, so the returned
        text contains the prompt followed by the generated continuation.
    """
    input_text = question + ' ' + ' '.join(contexts)
    # The prompt and the generated tokens share the model's position window
    # (1024 for GPT-2). Truncating the prompt to the full window would leave
    # no room for generation, so reserve max_new_tokens positions up front.
    position_limit = getattr(model.config, 'n_positions', 1024)
    prompt_budget = max(1, position_limit - max_new_tokens)
    inputs = tokenizer(input_text, return_tensors='pt', truncation=True, max_length=prompt_budget)
    summary_ids = model.generate(
        inputs['input_ids'],
        # Pass the mask explicitly instead of letting generate() infer it;
        # inference is unreliable once the pad id equals the EOS id.
        attention_mask=inputs['attention_mask'],
        max_new_tokens=max_new_tokens,
        num_beams=4,
        early_stopping=True,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# -----------------------------
# 9. Example Run
# -----------------------------

In [22]:
question = "What is the mobile phone policy?"
# Bind the distances to a real name rather than `_`, which IPython uses for
# the most recent cell output.
distances, indices = search_relevant_contexts(question, k=5)
# FAISS pads the result with -1 when the index holds fewer than k vectors.
# paragraphs[-1] would silently return the last paragraph, so drop negative
# positions before looking up the text.
top_contexts = [paragraphs[i] for i in indices[0] if i >= 0]

print("Top Contexts:")
for i, ctx in enumerate(top_contexts, 1):
    print(f"{i}. {ctx}\n")

answer = generate_answer(question, top_contexts)
print("Generated Answer:\n", answer)

Top Contexts:
1. 4.	Mobile Phone Policy

2. The Mobile Phone Policy sets forth the standards and expectations governing the appropriate and responsible usage of mobile devices in the organization. The purpose of this policy is to ensure that employees utilize mobile phones in a manner consistent with company values and legal compliance.

3. The Mobile Phone Policy is aimed at promoting the responsible and secure use of mobile devices in line with legal and ethical standards. Every employee is expected to comprehend and abide by these guidelines. Regular reviews of the policy ensure its ongoing alignment with evolving technology and security best practices.

4. Monitoring: The company retains the right to monitor internet and email usage for security and compliance purposes.

5. Acceptable Use: Company-provided internet and email services are primarily meant for job-related tasks. Limited personal use is allowed during non-work hours, provided it doesn't interfere with work responsibili