In [1]:
!pip install pinecone

Collecting pinecone
  Using cached pinecone-6.0.2-py3-none-any.whl.metadata (9.0 kB)
Collecting pinecone-plugin-interface<0.0.8,>=0.0.7 (from pinecone)
  Using cached pinecone_plugin_interface-0.0.7-py3-none-any.whl.metadata (1.2 kB)
Using cached pinecone-6.0.2-py3-none-any.whl (421 kB)
Using cached pinecone_plugin_interface-0.0.7-py3-none-any.whl (6.2 kB)
Installing collected packages: pinecone-plugin-interface, pinecone
Successfully installed pinecone-6.0.2 pinecone-plugin-interface-0.0.7


In [2]:
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv
import os

In [5]:
load_dotenv()

True

In [8]:
import pinecone

In [11]:
pinecone_api_key = os.getenv("PINECONE_API_KEY")
pinecone_cloud = os.getenv("PINECONE_CLOUD")  # usually "aws" or "gcp"
pinecone_region = os.getenv("PINECONE_REGION")  # like "us-west-2"

In [12]:
pc = Pinecone(
    api_key=pinecone_api_key,
)

In [14]:
pc.create_index(
    name="career-paths",
    dimension=1536,  # depends on your embeddings
    metric="cosine",
    spec=ServerlessSpec(
        cloud=pinecone_cloud,   # 'aws' or 'gcp'
        region=pinecone_region  # like 'us-west-2'
    )
)

{
    "name": "career-paths",
    "metric": "cosine",
    "host": "career-paths-ex6oxxq.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 1536,
    "deletion_protection": "disabled",
    "tags": null
}

In [15]:
!pip install langchain-pinecone

Collecting langchain-pinecone
  Downloading langchain_pinecone-0.2.5-py3-none-any.whl.metadata (1.3 kB)
Collecting aiohttp<3.11,>=3.10 (from langchain-pinecone)
  Downloading aiohttp-3.10.11-cp311-cp311-win_amd64.whl.metadata (8.0 kB)
Collecting langchain-tests<1.0.0,>=0.3.7 (from langchain-pinecone)
  Downloading langchain_tests-0.3.19-py3-none-any.whl.metadata (3.2 kB)
Collecting langchain-core<1.0.0,>=0.3.34 (from langchain-pinecone)
  Using cached langchain_core-0.3.56-py3-none-any.whl.metadata (5.9 kB)
Collecting pytest<9,>=7 (from langchain-tests<1.0.0,>=0.3.7->langchain-pinecone)
  Downloading pytest-8.3.5-py3-none-any.whl.metadata (7.6 kB)
Collecting pytest-asyncio<1,>=0.20 (from langchain-tests<1.0.0,>=0.3.7->langchain-pinecone)
  Downloading pytest_asyncio-0.26.0-py3-none-any.whl.metadata (4.0 kB)
Collecting syrupy<5,>=4 (from langchain-tests<1.0.0,>=0.3.7->langchain-pinecone)
  Downloading syrupy-4.9.1-py3-none-any.whl.metadata (38 kB)
Collecting pytest-socket<1,>=0.6.0 (fro



In [16]:
# using openai embedding to convert job roles into vector and store in pinecone
from langchain_openai import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore

In [17]:
embeddings = OpenAIEmbeddings()

In [18]:
import pandas as pd

In [19]:
# loading the dataset
df = pd.read_csv("cleaned_job_skills.csv")

In [20]:
df.head()

Unnamed: 0,Job_Role,Company,Location,Job Experience,Skills/Description
0,Senior Data Scientist,UPL,"Bangalore/Bengaluru, Mumbai (All Areas)",3-6,"python, MLT, statistical modeling, machine lea..."
1,Senior Data Scientist,Walmart,Bangalore/Bengaluru,5-9,"Data Science, Machine learning, Python, Azure,..."
2,Applied Data Scientist / ML Senior Engineer (P...,SAP India Pvt.Ltd,Bangalore/Bengaluru,5-10,"Python, IT Skills, Testing, Cloud, Product Man..."
3,Data Scientist,UPL,"Bangalore/Bengaluru, Mumbai (All Areas)",1-4,"python, machine learning, Data Science, data a..."
4,Data Scientist,Walmart,Bangalore/Bengaluru,4-8,"IT Skills, Python, Data Science, Machine Learn..."


In [24]:
documents = df["Job_Role"]+": "+ df["Skills/Description"]+": "+df["Company"]

Our documents payload is too large for Pinecone's input size limits.So, we will do chunking to break the size of
document in order to fit it in pinecone

In [28]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200
)

# chunk each document
chunked_documents = []
for doc in documents:
    chunks = text_splitter.split_text(doc)
    chunked_documents.extend(chunks)


In [32]:
from tqdm import tqdm
import time

# Create the vector store object from the existing index.
vector_store = PineconeVectorStore.from_existing_index(
    index_name="career-paths",
    embedding=embeddings
)

batch_size = 50  # Adjust as needed
for i in tqdm(range(0, len(chunked_documents), batch_size)):
    batch = chunked_documents[i:i + batch_size]
    for attempt in range(3):  # Retry up to 3 times
        try:
            vector_store.add_texts(batch)
            break  # Success; exit the retry loop for this batch.
        except Exception as e:
            print(f"⚠️ Batch {i}-{i+batch_size} failed on attempt {attempt + 1}: {e}")
            time.sleep(5)  # Wait before retrying.
    else:
        print(f"❌ Batch {i}-{i+batch_size} failed after 3 retries.")


100%|██████████| 240/240 [13:26<00:00,  3.36s/it]


In [35]:
!pip install nbformat

Collecting nbformat
  Using cached nbformat-5.10.4-py3-none-any.whl.metadata (3.6 kB)
Collecting fastjsonschema>=2.15 (from nbformat)
  Downloading fastjsonschema-2.21.1-py3-none-any.whl.metadata (2.2 kB)
Using cached nbformat-5.10.4-py3-none-any.whl (78 kB)
Downloading fastjsonschema-2.21.1-py3-none-any.whl (23 kB)
Installing collected packages: fastjsonschema, nbformat
Successfully installed fastjsonschema-2.21.1 nbformat-5.10.4


In [36]:
%run resume_parse.ipynb

  validate(nb)


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     -- ------------------------------------- 0.8/12.8 MB 6.7 MB/s eta 0:00:02
     ------------- -------------------------- 4.2/12.8 MB 14.0 MB/s eta 0:00:01
     --------------------------- ------------ 8.9/12.8 MB 16.3 MB/s eta 0:00:01
     --------------------------------------  12.6/12.8 MB 16.8 MB/s eta 0:00:01
     --------------------------------------- 12.8/12.8 MB 16.1 MB/s eta 0:00:00
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
Collecting en-core-web-sm==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[38;5;2m✔ Download and installation successful[0m
You can no

CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox



Parsed Resume Data:
name: Kushagra Srivastava
email: kushagra843srivastava@gmail.com
phone: 9555009525
skills: ['Python', 'Java', 'JavaScript', 'SQL', 'Machine Learning', 'NLP', 'Data Science', 'Excel', 'Power BI', 'Pandas', 'Git', 'GitHub']
experience: [{'position': 'Data Science Intern, Agrix y 2024', 'company': '', 'date': 'May 2024 – Jul', 'description': '\uf0b7 Revolutionized agriculture with predictive Python models, boosting yields and reducing losses by up to 75 %.\n\uf0b7 Developed interactive dashboards using Power BI, Excel, and Python to visualize KPIs, resulting in a 30% reduction in\nreport preparation time and empowering stakeholders with real-time insights for strategic decisions.\n\uf0b7 Analyzed agricultural reports and datasets with Seaborn and Matplotlib, transforming complex data into actionable insights\non crop health, soil conditions, and market trends; increased yield prediction accuracy by 25% and stakeholder understanding\nby 40%.\n\uf0b7 Utilized data prepr

In [37]:
query = resume_text

In [38]:
results = vector_store.similarity_search(query, k=3)  # Get top 3 matches

In [39]:
# Extract job roles
job_roles = [result.page_content.split(":")[0] for result in results]
print("Recommended Job Roles:", job_roles)

Recommended Job Roles: ['Data Scientist V', 'Data Scientist V', 'Data Scientist Intern']


#### Now we will set up our llm in order to generate roadmap to qualify for these jobs

In [40]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate

In [41]:
# Initialize LLM
llm = ChatOpenAI(openai_api_key=os.getenv("OPENAI_API_KEY"), model="gpt-3.5-turbo")

In [42]:
# Define prompt
prompt_template = PromptTemplate(
    template="""
    Based on the following resume:
    {resume}

    And the recommended job roles: {job_roles}

    Generate a personalized learning roadmap to help the user transition to one of these roles.
    Include specific courses, certifications, and projects they should pursue.
    """,
    input_variables=["resume", "job_roles"]
)

In [43]:
# Create chain
chain = prompt_template | llm

In [44]:
# Run the chain
roadmap = chain.invoke({"resume": resume_text, "job_roles": ", ".join(job_roles)})
print("Learning Roadmap:\n", roadmap.content)

Learning Roadmap:
 Personalized Learning Roadmap to Transition to a Data Scientist Role:

1. Enhance Python Skills:
   - Take online courses such as "Python for Data Science and Machine Learning Bootcamp" on Udemy
   - Complete certification in Python programming on platforms like Coursera or edX

2. Improve Data Science Skills:
   - Enroll in a course on "Data Science and Machine Learning Bootcamp with R" to expand your data science knowledge
   - Obtain a certification in Data Science from reputable organizations like IBM or Harvard University

3. Master Machine Learning and NLP:
   - Take courses on Machine Learning and Natural Language Processing on platforms like Coursera or DataCamp
   - Work on projects related to Machine Learning and NLP to showcase your skills

4. Further SQL Proficiency:
   - Complete advanced SQL courses on platforms like Codecademy or Khan Academy
   - Obtain certifications in SQL querying and database management

5. Gain Experience in Excel and Power BI:
 

In [None]:
indexes = pc.list_indexes()
print(indexes)


[
    {
        "name": "your-index-name",
        "metric": "cosine",
        "host": "your-index-name-ex6oxxq.svc.aped-4627-b74a.pinecone.io",
        "spec": {
            "serverless": {
                "cloud": "aws",
                "region": "us-east-1"
            }
        },
        "status": {
            "ready": true,
            "state": "Ready"
        },
        "vector_type": "dense",
        "dimension": 1536,
        "deletion_protection": "disabled",
        "tags": null
    },
    {
        "name": "career-paths",
        "metric": "cosine",
        "host": "career-paths-ex6oxxq.svc.aped-4627-b74a.pinecone.io",
        "spec": {
            "serverless": {
                "cloud": "aws",
                "region": "us-east-1"
            }
        },
        "status": {
            "ready": true,
            "state": "Ready"
        },
        "vector_type": "dense",
        "dimension": 1536,
        "deletion_protection": "disabled",
        "tags": null
    }
]