<a href="https://colab.research.google.com/github/HimaVarshini-Pasupuleti/MLproject/blob/main/_ML_FinalProject_EDUNUTSHELL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:


import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def clean_text(text):
    text = text.lower()
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()


resume_text = """
Experienced data analyst skilled in Python, Machine Learning,
SQL, and Tableau. Worked on fraud detection and data visualization projects.
"""

job_description = """
We are hiring a Data Analyst proficient in Python, SQL, and Machine Learning.
Experience with data visualization tools like Tableau is preferred.
"""


resume_clean = clean_text(resume_text)
jd_clean = clean_text(job_description)

vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform([resume_clean, jd_clean])


similarity = cosine_similarity(vectors[0:1], vectors[1:2])[0][0]
score = round(similarity * 100, 2)


print("Resume - JD Match Score:", score, "%")

if score >= 70:
    print(" Candidate shortlisted")
else:
    print(" Candidate not shortlisted")


Resume - JD Match Score: 44.79 %
❌ Candidate not shortlisted


In [2]:
import spacy


nlp = spacy.load("en_core_web_sm")


job_description = """
We are looking for a Data Analyst who is proficient in Python, SQL, and data visualization tools.
Responsibilities include analyzing business data, building dashboards, and preparing reports.
The candidate should have experience with Machine Learning, Pandas, and Power BI.
Good communication skills and teamwork are essential.
"""


doc = nlp(job_description)

# EXTRACT KEYWORDS / SKILLS
skills = set()
for chunk in doc.noun_chunks:
    if len(chunk.text.split()) <= 3:   # short useful phrases
        skills.add(chunk.text.lower())

# EXTRACT RESPONSIBILITIES
responsibilities = []
for sent in doc.sents:
    if sent.text.strip().lower().startswith(("responsible", "responsibilities", "develop", "analyze", "manage", "create", "prepare")):
        responsibilities.append(sent.text.strip())


print("=== Extracted Keywords / Skills ===")
print(skills)
print("\n=== Responsibilities ===")
for r in responsibilities:
    print("-", r)


=== Extracted Keywords / Skills ===
{'the candidate', 'teamwork', 'reports', 'experience', 'building dashboards', 'pandas', 'responsibilities', 'python', 'good communication skills', 'business data', 'data visualization tools', 'a data analyst', 'sql', '\nwe', 'who', 'power bi', 'machine learning'}

=== Responsibilities ===
- Responsibilities include analyzing business data, building dashboards, and preparing reports.


In [7]:


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()


resume_text = """
Experienced Data Analyst skilled in Python, SQL, Power BI, and Machine Learning.
Worked on fraud detection and data visualization projects using pandas and matplotlib.
"""

job_description = """
Looking for a Data Analyst with experience in Python, SQL, Power BI, and data visualization.
Should understand machine learning concepts and have strong analytical skills.
"""

resume_clean = clean_text(resume_text)
jd_clean = clean_text(job_description)
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform([resume_clean, jd_clean])


similarity = cosine_similarity(vectors[0:1], vectors[1:2])[0][0]
score = round(similarity * 100, 2)


print("Resume Match Score:", score, "%")
if score >= 70:
    print(" Shortlisted Resume")
else:
    print(" Not Shortlisted Resume")



Resume Match Score: 47.25 %
 Not Shortlisted Resume


In [6]:
!pip install streamlit

Collecting streamlit
  Downloading streamlit-1.50.0-py3-none-any.whl.metadata (9.5 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.50.0-py3-none-any.whl (10.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m47.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m91.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pydeck, streamlit
Successfully installed pydeck-0.9.1 streamlit-1.50.0


In [11]:
import streamlit as st
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\\s]', '', text)
    text = re.sub(r'\\s+', ' ', text)
    return text.strip()

def extract_skills(text):
    skills_list = ['python', 'java', 'sql', 'excel', 'machine learning',
                   'data analysis', 'power bi', 'tableau', 'communication',
                   'pandas', 'numpy', 'deep learning']
    found = [skill for skill in skills_list if skill in text.lower()]
    return found


st.title(" Simple Resume Matcher Dashboard")

jd_text = st.text_area("Paste Job Description Here", height=150)

uploaded_files = st.file_uploader("Upload Multiple Resumes (TXT files)", type=["txt"], accept_multiple_files=True)

threshold = st.slider("Shortlist Threshold (%)", 0, 100, 70)

if st.button("Match Resumes"):
    if not jd_text or not uploaded_files:
        st.warning("Please upload resumes and paste a job description.")
    else:
        jd_clean = clean_text(jd_text)
        resumes = []
        names = []
        skills_found = []

        for f in uploaded_files:
            text = f.read().decode("utf-8", errors="ignore")
            clean_resume = clean_text(text)
            skills = extract_skills(clean_resume)
            resumes.append(clean_resume)
            names.append(f.name)
            skills_found.append(", ".join(skills))

        docs = [jd_clean] + resumes
        vectorizer = TfidfVectorizer()
        vectors = vectorizer.fit_transform(docs)


        jd_vec = vectors[0:1]
        resume_vecs = vectors[1:]
        similarities = cosine_similarity(resume_vecs, jd_vec).flatten()


        df = pd.DataFrame({
            "Candidate Name": names,
            "Extracted Skills": skills_found,
            "Score (%)": (similarities * 100).round(2),
            "Shortlisted": [" Yes" if s*100 >= threshold else " No" for s in similarities]
        })


        df = df.sort_values(by="Score (%)", ascending=False).reset_index(drop=True)

        st.subheader(" Matching Results")
        st.dataframe(df)

        st.subheader(" Top Candidates")
        top_n = st.number_input("Select how many top candidates to view", min_value=1, max_value=len(df), value=3)
        st.table(df.head(top_n))


        csv = df.to_csv(index=False).encode("utf-8")
        st.download_button("Download Results as CSV", csv, "results.csv", "text/csv")


