**Installing Libraries**

In [1]:
!pip install sentence-transformers beautifulsoup4
!pip install faiss-cpu
# !pip install -q -U langchain playwright sentence_transformers faiss-gpu
# !pip install -U langchain-community




In [2]:
import re
import pandas as pd
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
import torch
from transformers import LlamaForCausalLM, LlamaTokenizer
import faiss
import numpy as np

  from tqdm.autonotebook import tqdm, trange


# Load & Explore Data

In [14]:
jobs = pd.read_csv('/content/RAG System.csv', usecols=['job_title', 'description', 'requirements', 'career_level'])
jobs.head()

Unnamed: 0,job_title,description,requirements,career_level
0,Senior Developer Relations Manager,<p>Senior Developer Relations Manager page is ...,,Not specified
1,Costing Manager - Cairo,"<ul>\n<li>Supervise, design and implement a co...",<ul>\n<li>Bachelor’s degree in Accounting</li>...,Manager
2,Banquet Supervisor,Mandarin Oriental Hotel GroupMandarin Oriental...,,Experienced (Non-Manager)
3,Trade Finance & Credit Collection,<p><b>About Us</b></p><br><p>Alfa Laval is a l...,,Not specified
4,Taste & Wellbeing Creative Marketing Associate...,Join us and celebrate the beauty of human expe...,,Not specified


In [15]:
jobs.reset_index(drop=True, inplace=True)
jobs.head()

Unnamed: 0,job_title,description,requirements,career_level
0,Senior Developer Relations Manager,<p>Senior Developer Relations Manager page is ...,,Not specified
1,Costing Manager - Cairo,"<ul>\n<li>Supervise, design and implement a co...",<ul>\n<li>Bachelor’s degree in Accounting</li>...,Manager
2,Banquet Supervisor,Mandarin Oriental Hotel GroupMandarin Oriental...,,Experienced (Non-Manager)
3,Trade Finance & Credit Collection,<p><b>About Us</b></p><br><p>Alfa Laval is a l...,,Not specified
4,Taste & Wellbeing Creative Marketing Associate...,Join us and celebrate the beauty of human expe...,,Not specified


In [16]:
# Calculate the number of NaN values in each column
nan_counts = jobs.isna().sum()

# Print the results
print(nan_counts)

# Find rows with any NaN values
rows_with_nan = jobs[jobs.isna().any(axis=1)]
print("Rows with NaN values:")
print(rows_with_nan)

job_title           0
description         0
requirements    25634
career_level        0
dtype: int64
Rows with NaN values:
                                               job_title  \
0                     Senior Developer Relations Manager   
2                                     Banquet Supervisor   
3                      Trade Finance & Credit Collection   
4      Taste & Wellbeing Creative Marketing Associate...   
6                                     BMC Remedy Analyst   
...                                                  ...   
39992                                  SAP SD Specialist   
39996                               Supply Chain Manager   
39997                                Reservation Manager   
39998                            Supply Planning Officer   
39999                                    Project Planner   

                                             description requirements  \
0      <p>Senior Developer Relations Manager page is ...          NaN   
2      Man

In [17]:
# Get summary statistics
summary = jobs.describe(include='all')
print("Summary statistics:")
print(summary)

Summary statistics:
         job_title                                        description  \
count        40000                                              40000   
unique       24968                                              34135   
top     Accountant  <p> </p><ul><li>Responding to customer queries...   
freq           208                                                134   

              requirements               career_level  
count                14366                      40000  
unique               11494                          6  
top     <ul><li></li></ul>  Experienced (Non-Manager)  
freq                   278                      14471  


# Data Preparation

In [18]:
jobs = jobs.drop(columns=['requirements'])

In [19]:
# Function to clean HTML tags
def clean_html(text):
    return BeautifulSoup(text, "html.parser").get_text()

# Apply cleaning to job descriptions and requirements
jobs['description'] = jobs['description'].apply(lambda x: clean_html(str(x)))

# Display the cleaned data
jobs.head()


  return BeautifulSoup(text, "html.parser").get_text()


Unnamed: 0,job_title,description,career_level
0,Senior Developer Relations Manager,Senior Developer Relations Manager page is loa...,Not specified
1,Costing Manager - Cairo,"\nSupervise, design and implement a consistent...",Manager
2,Banquet Supervisor,Mandarin Oriental Hotel GroupMandarin Oriental...,Experienced (Non-Manager)
3,Trade Finance & Credit Collection,About UsAlfa Laval is a leading global provide...,Not specified
4,Taste & Wellbeing Creative Marketing Associate...,Join us and celebrate the beauty of human expe...,Not specified


In [21]:
# Combine job description and Job_title
jobs['combined_text'] = jobs['description'] + ' ' + jobs['job_title']

# Create embeddings for job descriptions in chunks
chunk_size = 1000  # Define chunk size
chunks = [jobs[i:i + chunk_size] for i in range(0, jobs.shape[0], chunk_size)]

# Create Embeddings from chunks

In [22]:
# Initialize the Sentence Transformer model
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')




In [23]:
def create_embeddings(chunk):
    chunk['embeddings'] = chunk['combined_text'].apply(lambda x: embedding_model.encode(x))
    return chunk

In [24]:
# Apply embedding creation to each chunk
chunks = [create_embeddings(chunk) for chunk in chunks]

# Concatenate chunks back into a single dataframe
data = pd.concat(chunks)
data


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['embeddings'] = chunk['combined_text'].apply(lambda x: embedding_model.encode(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['embeddings'] = chunk['combined_text'].apply(lambda x: embedding_model.encode(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['embeddings'] = chunk['

Unnamed: 0,job_title,description,career_level,combined_text,embeddings
0,Senior Developer Relations Manager,Senior Developer Relations Manager page is loa...,Not specified,Senior Developer Relations Manager page is loa...,"[-0.047061484, 0.0002702253, 0.0033784653, -0...."
1,Costing Manager - Cairo,"\nSupervise, design and implement a consistent...",Manager,"\nSupervise, design and implement a consistent...","[-0.0063196504, 0.084171005, -0.07820158, 0.06..."
2,Banquet Supervisor,Mandarin Oriental Hotel GroupMandarin Oriental...,Experienced (Non-Manager),Mandarin Oriental Hotel GroupMandarin Oriental...,"[0.029390838, 0.04998568, 0.045057967, 0.06801..."
3,Trade Finance & Credit Collection,About UsAlfa Laval is a leading global provide...,Not specified,About UsAlfa Laval is a leading global provide...,"[-0.08184936, -0.048034273, -0.047277275, 0.01..."
4,Taste & Wellbeing Creative Marketing Associate...,Join us and celebrate the beauty of human expe...,Not specified,Join us and celebrate the beauty of human expe...,"[0.020883054, -0.025496112, 0.044499386, 0.028..."
...,...,...,...,...,...
39995,Credit & Banking Accountant,Opening bank facilities for contracting compan...,Experienced (Non-Manager),Opening bank facilities for contracting compan...,"[0.03521658, -0.053588018, -0.025778897, 0.017..."
39996,Supply Chain Manager,Internal Job Title: Supply Chain ManagerBusine...,Experienced (Non-Manager),Internal Job Title: Supply Chain ManagerBusine...,"[-0.050753713, -0.020308822, 0.023562765, 0.01..."
39997,Reservation Manager,"Job DescriptionCompany Description""Why work fo...",Experienced (Non-Manager),"Job DescriptionCompany Description""Why work fo...","[-0.068315364, 0.03444628, 0.024130749, 0.0296..."
39998,Supply Planning Officer,Duties & Responsibilities: Monitors status an...,Entry Level,Duties & Responsibilities: Monitors status an...,"[-0.027419504, 0.025750343, 0.014524775, 0.027..."


# Faiss Indexing

In [25]:
# Combine all embeddings into a single matrix
all_embeddings = np.vstack(data['embeddings'].values)

# Initialize FAISS index
dimension = all_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)

# Add embeddings to the index
index.add(all_embeddings)

# Save IDs for retrieval
data['id'] = range(len(data))
id_mapping = data.set_index('id').to_dict('index')

# Setup LLM Model

In [27]:
# Load the tokenizer and model in half-precision
model_name = "microsoft/Phi-3-mini-4k-instruct"
tokenizer = LlamaTokenizer.from_pretrained(model_name)
model = LlamaForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
You are using a model of type phi3 to instantiate a model of type llama. This is not supported for all configurations of models and can yield errors.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForCausalLM were not initialized from the model checkpoint at microsoft/Phi-3-mini-4k-instruct and are newly initialized: ['model.layers.0.mlp.gate_proj.weight', 'model.layers.0.mlp.up_proj.weight', 'model.layers.0.self_attn.k_proj.weight', 'model.layers.0.self_attn.q_proj.weight', 'model.layers.0.self_attn.v_proj.weight', 'model.layers.1.mlp.gate_proj.weight', 'model.layers.1.mlp.up_proj.weight', 'model.layers.1.self_attn.k_proj.weight', 'model.layers.1.self_attn.q_proj.weight', 'model.layers.1.self_attn.v_proj.weight', 'model.layers.10.mlp.gate_proj.weight', 'model.layers.10.mlp.up_proj.weight', 'model.layers.10.self_attn.k_proj.weight', 'model.layers.10.self_attn.q_proj.weight', 'model.layers.10.self_attn.v_proj.weight', 'model.layers.11.mlp.gate_proj.weight', 'model.layers.11.mlp.up_proj.weight', 'model.layers.11.self_attn.k_proj.weight', 'model.layers.11.self_attn.q_proj.weight', 'model.layers.11.self_attn.v_proj.weight', 'model.layers.12.mlp.gate_proj.weight'

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=3072, out_features=3072, bias=False)
          (k_proj): Linear(in_features=3072, out_features=3072, bias=False)
          (v_proj): Linear(in_features=3072, out_features=3072, bias=False)
          (o_proj): Linear(in_features=3072, out_features=3072, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=3072, out_features=8192, bias=False)
          (up_proj): Linear(in_features=3072, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=3072, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()


# Model Retrival

In [28]:
def retrieve_jobs_faiss(query, top_k=4):
    query_embedding = embedding_model.encode(query).reshape(1, -1)
    distances, indices = index.search(query_embedding, top_k)
    results = [id_mapping[idx] for idx in indices[0]]
    return results

def generate_response_faiss(retrieved_jobs, query):
    job_details = [job['combined_text'] for job in retrieved_jobs]
    prompt = f"User query: {query}\nHere are some job recommendations:\n"
    for i, job in enumerate(job_details):
        prompt += f"{i+1}. {job}\n"

    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = model.generate(**inputs, max_new_tokens=150)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

def process_query_faiss(user_query):
    retrieved_jobs = retrieve_jobs_faiss(user_query)
    response = generate_response_faiss(retrieved_jobs, user_query)
    return response

In [29]:
# Example usage
user_query = "I want to find a job in data science"
retriJobs = retrieve_jobs_faiss(user_query)
print(retriJobs)

[{'job_title': 'Remote Data Science Analyst (Python)', 'description': 'A renowned US-based client is seeking a committed Data Scientist to join their dynamic team.As part of the team, your responsibilities as a Data Scientist will include conducting detailed peer code reviews and providing valuable feedback for ongoing enhancement. You will also collaborate closely with researchers and stakeholders to ensure alignment with project specifications.Job Responsibilities:Craft Python code to explore public datasets and derive meaningful insights.Collaborate effectively with researchers to ensure projects are aligned with company goals.Produce clear and well-documented code using Jupyter notebooks for easy understanding and maintenance.Utilize public data sources like Kaggle to tackle business-related inquiries.Job Requirements:Bachelor’s/Master’s degree in Engineering, Computer Science, or equivalent practical experience.Data scientist with a minimum of 2 years of industry experience.Demons

# Generate & fine tune the Model

In [37]:
top_k = 5
max_length_per_job = 200
job_details = [job['combined_text'][:max_length_per_job] for job in retriJobs[:top_k]]
job_details

['A renowned US-based client is seeking a committed Data Scientist to join their dynamic team.As part of the team, your responsibilities as a Data Scientist will include conducting detailed peer code re',
 '\xa0Job Description:\nOur client, a vanguard in the retail tech realm based in Riyadh with a global influence, is at the cusp of data-driven innovation. As they harness the power of data to drive decision',
 '\xa0Job Description:\nOur client, a vanguard in the retail tech realm based in Riyadh with a global influence, is at the cusp of data-driven innovation. As they harness the power of data to drive decision',
 'A prominent US client is seeking Data Scientists.As a Data Scientist on our team, your daily responsibilities will involve conducting thorough peer code reviews and offering insightful feedback for im']

In [38]:
# Limit the number of job details and truncate their length
prompt = f"User query: {user_query}\nHere are some job recommendations:\n"
for i, job in enumerate(job_details):
    prompt += f"{i+1}. {job}\n"

In [42]:
# Tokenize the prompt and generate response
inputs = tokenizer(prompt, return_tensors="pt", max_length=200, truncation=True).to(device)
outputs = model.generate(**inputs, max_new_tokens=30, do_sample=True, temperature=0.7,num_beams=2, early_stopping=True)
#outputs = model.generate(**inputs, max_new_tokens=50)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

In [43]:
response

'User query: I want to find a job in data science\nHere are some job recommendations:\n1. A renowned US-based client is seeking a committed Data Scientist to join their dynamic team.As part of the team, your responsibilities as a Data Scientist will include conducting detailed peer code re\n2. \xa0Job Description:\nOur client, a vanguard in the retail tech realm based in Riyadh with a global influence, is at the cusp of data-driven innovation. As they harness the power of data to drive decision\n3. \xa0Job Description:\nOur client, a vanguard in the retail tech realm based in Riyadh with a global influence, is at the cusp of data-driven innovation. As they harness the power of data to drive decision\n4. A prominent US client is seeking Data Scientists.As left left left left left left left left left left left left left left left left left left left left left left left left left left left left left left'

# Evaluation

In [63]:
if not isinstance(all_embeddings, torch.Tensor):
        data_embeddings = torch.tensor(all_embeddings)
data_embeddings


tensor([[-0.0471,  0.0003,  0.0034,  ..., -0.0909, -0.0383,  0.0591],
        [-0.0063,  0.0842, -0.0782,  ..., -0.0299,  0.0153,  0.0152],
        [ 0.0294,  0.0500,  0.0451,  ..., -0.0083, -0.0714,  0.0066],
        ...,
        [-0.0683,  0.0344,  0.0241,  ...,  0.0743, -0.0818, -0.0826],
        [-0.0274,  0.0258,  0.0145,  ..., -0.0334,  0.0040, -0.0289],
        [ 0.0062, -0.0121,  0.0055,  ..., -0.0561, -0.1086,  0.0105]])

In [67]:
if not isinstance(retriJobs, torch.Tensor):
    embeddings_list = [job['embeddings'] for job in retriJobs]  # Extract embeddings
    response_embeddings = torch.tensor(embeddings_list)  # Convert to tensor
response_embeddings


tensor([[-0.0550, -0.0291,  0.0340,  ..., -0.0160, -0.0253, -0.0335],
        [-0.0467, -0.0226,  0.0602,  ..., -0.0756, -0.0191,  0.0092],
        [-0.0467, -0.0226,  0.0602,  ..., -0.0756, -0.0191,  0.0092],
        [-0.0446, -0.0124,  0.0364,  ..., -0.0079, -0.0333, -0.0178]])

In [70]:
from sentence_transformers import util

# Calculate cosine similarities
cosine_similarities = util.pytorch_cos_sim(data_embeddings,response_embeddings)

# Convert to numpy array for easy handling
cosine_similarities = cosine_similarities.numpy()

# Average similarity score
avg_similarity = np.mean(cosine_similarities)

In [44]:
def precision_at_k(retrieved_docs, relevant_docs, k):
    retrieved_at_k = retrieved_docs[:k]
    relevant_at_k = [doc for doc in retrieved_at_k if doc in relevant_docs]
    return len(relevant_at_k) / k


In [45]:
def evaluate_system(all_retrieved_docs, all_relevant_docs, reference_responses, generated_responses, k=10):
    relevance_scores = {
        "Precision@k": [precision_at_k(retrieved, relevant, k) for retrieved, relevant in zip(all_retrieved_docs, all_relevant_docs)],

    }

    return {
        "Relevance Scores": relevance_scores,
    }
