In [2]:
import re
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

In [3]:
import os

job_file = '/Users/garv/PROJECTS/rsa1/data/job_title_des.csv'
resume_file = '/Users/garv/PROJECTS/rsa1/data/UpdatedResumeDataSet.csv'

if not os.path.exists(job_file):
	print(f"File not found: {job_file}")
else:
	job_df = pd.read_csv(job_file)

if not os.path.exists(resume_file):
	print(f"File not found: {resume_file}")
else:
	resume_df = pd.read_csv(resume_file)

In [4]:
display(job_df.head())
display(job_df.info())
display(resume_df.head())
display(resume_df.info())

Unnamed: 0.1,Unnamed: 0,Job Title,Job Description
0,0,Flutter Developer,We are looking for hire experts flutter develo...
1,1,Django Developer,PYTHON/DJANGO (Developer/Lead) - Job Code(PDJ ...
2,2,Machine Learning,"Data Scientist (Contractor)\n\nBangalore, IN\n..."
3,3,iOS Developer,JOB DESCRIPTION:\n\nStrong framework outside o...
4,4,Full Stack Developer,job responsibility full stack engineer – react...


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2277 entries, 0 to 2276
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Unnamed: 0       2277 non-null   int64 
 1   Job Title        2277 non-null   object
 2   Job Description  2277 non-null   object
dtypes: int64(1), object(2)
memory usage: 53.5+ KB


None

Unnamed: 0,Category,Resume
0,Data Science,Skills * Programming Languages: Python (pandas...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...
2,Data Science,"Areas of Interest Deep Learning, Control Syste..."
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab..."


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 962 entries, 0 to 961
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  962 non-null    object
 1   Resume    962 non-null    object
dtypes: object(2)
memory usage: 15.2+ KB


None

In [5]:
job_df.drop(columns=["Unnamed: 0"],axis=1,inplace=True)
job_df.drop_duplicates(inplace=True)
job_df.dropna(subset=["Job Description"], inplace=True)
job_df.reset_index(drop=True, inplace=True)
resume_df.drop_duplicates(inplace=True)
resume_df.dropna(subset=["Resume"], inplace=True)
resume_df.reset_index(drop=True, inplace=True)

In [6]:
def clean_text(df,column):
    """Cleans text by converting to lowercase, removing special characters,
    and handling whitespace."""
    try:
        df[column] = df[column].astype(str)
    except Exception as e:
        print(f"Error converting column {column} to string: {e}")
        return df[column]
    df[column] = df[column].apply(lambda row: row.lower())
    df[column] = df[column].apply(lambda row: re.sub(r"[^\x00-\x7f]", r"", row))
    df[column] = df[column].apply(lambda row: re.sub(r"\t", r"", row).strip())
    df[column] = df[column].apply(lambda row: re.sub(r"(\n|\r)+", r"\n", row).strip())
    df[column] = df[column].apply(lambda row: re.sub(r" +", r" ", row).strip())
    return df[column]

job_df['cleaned_description'] = clean_text(job_df, 'Job Description')
resume_df['cleaned_resume'] = clean_text(resume_df, 'Resume')

display(job_df[['Job Description', 'cleaned_description']].head())
display(resume_df[['Resume', 'cleaned_resume']].head())

Unnamed: 0,Job Description,cleaned_description
0,we are looking for hire experts flutter develo...,we are looking for hire experts flutter develo...
1,python/django (developer/lead) - job code(pdj ...,python/django (developer/lead) - job code(pdj ...
2,"data scientist (contractor)\nbangalore, in\nre...","data scientist (contractor)\nbangalore, in\nre..."
3,job description:\nstrong framework outside of ...,job description:\nstrong framework outside of ...
4,job responsibility full stack engineer react r...,job responsibility full stack engineer react r...


Unnamed: 0,Resume,cleaned_resume
0,skills * programming languages: python (pandas...,skills * programming languages: python (pandas...
1,education details \nmay 2013 to may 2017 b.e u...,education details \nmay 2013 to may 2017 b.e u...
2,"areas of interest deep learning, control syste...","areas of interest deep learning, control syste..."
3,skills r python sap hana tableau sap hana sql ...,skills r python sap hana tableau sap hana sql ...
4,"education details \n mca ymcaust, faridabad, h...","education details \n mca ymcaust, faridabad, h..."


In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
    is_separator_regex=False,
)

In [10]:
from langchain.schema import Document

job_descriptions = job_df['cleaned_description'].tolist()
job_titles = job_df['Job Title'].tolist() # Get job titles

# Create Document objects with text and metadata
job_documents = [Document(page_content=desc, metadata={"title": title}) for desc, title in zip(job_descriptions, job_titles)]

resumes = resume_df['cleaned_resume'].tolist()

# For resumes, we can include the category as metadata
resume_categories = resume_df['Category'].tolist()
resume_documents = [Document(page_content=resume_text, metadata={"category": category}) for resume_text, category in zip(resumes, resume_categories)]


job_chunks = text_splitter.split_documents(job_documents)
resume_chunks = text_splitter.split_documents(resume_documents)

print(f"Created {len(job_documents)} job documents and {len(job_chunks)} job chunks.")
print(f"Created {len(resume_documents)} resume documents and {len(resume_chunks)} resume chunks.")

Created 2277 job documents and 6318 job chunks.
Created 166 resume documents and 651 resume chunks.


In [12]:
from langchain_huggingface import HuggingFaceEmbeddings

# Instantiate HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Create embeddings for job and resume chunks
job_embeddings = embeddings.embed_documents([chunk.page_content for chunk in job_chunks])
resume_embeddings = embeddings.embed_documents([chunk.page_content for chunk in resume_chunks])

print(f"Generated {len(job_embeddings)} embeddings for job descriptions.")
print(f"Generated {len(resume_embeddings)} embeddings for resumes.")

Generated 6318 embeddings for job descriptions.
Generated 651 embeddings for resumes.


In [13]:
from langchain_community.vectorstores import FAISS

# Create a FAISS vector store for job chunks
job_vector_store = FAISS.from_documents(job_chunks, embeddings)

# Create a FAISS vector store for resume chunks
resume_vector_store = FAISS.from_documents(resume_chunks, embeddings)

print("FAISS vector stores created for job descriptions and resumes.")

FAISS vector stores created for job descriptions and resumes.


In [15]:
import os

# Define the directory to save the vector stores
vector_store_dir = "/Users/garv/PROJECTS/rsa1/vector_store"

# Create the directory if it doesn't exist
if not os.path.exists(vector_store_dir):
    os.makedirs(vector_store_dir)

# Save the job vector store
job_vector_store_path = os.path.join(vector_store_dir, "job_faiss")
job_vector_store.save_local(job_vector_store_path)

# Save the resume vector store
resume_vector_store_path = os.path.join(vector_store_dir, "resume_faiss")
resume_vector_store.save_local(resume_vector_store_path)

print(f"Job vector store saved to: {job_vector_store_path}")
print(f"Resume vector store saved to: {resume_vector_store_path}")

Job vector store saved to: /Users/garv/PROJECTS/rsa1/vector_store/job_faiss
Resume vector store saved to: /Users/garv/PROJECTS/rsa1/vector_store/resume_faiss
