In [None]:
%pip install docx2txt

In [None]:
!pip install textract

In [None]:
#%pip install pdfminer.six
#pip show pdfminer
pip show pdfminer.six

In [None]:
%pip install docx2txt

In [None]:
%pip install openpyxl

In [None]:
!pip install nltk

In [None]:
!pip install --upgrade --force-reinstall pdfminer.six==20231228

!pip show pdfminer.six

In [1]:

import re
import nltk
import pandas as pd
import docx2txt
from pdfminer.high_level import extract_text
#import pdfminer.high_level
from nltk.corpus import stopwords
import os
import joblib
from joblib import load
import pickle
import numpy as np

MODEL_PATH = "ats_model.pkl"
VECTORIZER_PATH = "tfidf_vectorizer.pkl"

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('maxent_ne_chunker_tab')
nltk.download('words')

if os.path.exists(MODEL_PATH) and os.path.exists(VECTORIZER_PATH):
    model = joblib.load(MODEL_PATH)
    vectorizer = joblib.load(VECTORIZER_PATH)
    MODEL_AVAILABLE = True
else:
    print("ATS model not found!")
    MODEL_AVAILABLE = False


skills_df = pd.read_excel("skills.xlsx")
SKILLS_DB = skills_df.iloc[:, 0].dropna().str.lower().tolist()

RESERVED_EDU_WORDS = ['university', 'college', 'institute', 'school', 'academy', 'faculty']


def extract_text_from_docx(docx_path):
    txt = docx2txt.process(docx_path)
    if txt:
        return txt.replace('\t', ' ')
    return ""

def extract_text_from_pdf(pdf_path):
    return extract_text(pdf_path) or ""



#  Extract Email & Phone

def extract_contact_info(text):
    email = None
    phone = None

    # Email
    email_match = re.search(r'[\w\.-]+@[\w\.-]+\.\w+', text)
    if email_match:
        email = email_match.group(0)

    # Phone
    phone_match = re.search(r'(\+?\d{1,3}[\s\-]?)?\(?\d{2,4}\)?[\s\-]?\d{3,5}[\s\-]?\d{3,5}', text)
    if phone_match:
        phone = phone_match.group(0)

    return email, phone



#  Extract Name 

def extract_name(text):
    lines = [line.strip() for line in text.split('\n') if line.strip()]
    first_line = lines[0]
    words = nltk.word_tokenize(first_line)
    tagged = nltk.pos_tag(words)

    proper_nouns = [word for word, pos in tagged if pos == 'NNP']
    if len(proper_nouns) >= 2:
        return f"{proper_nouns[0]} {proper_nouns[1]}"
    elif proper_nouns:
        return proper_nouns[0]
    return first_line

#  Extract Education

def extract_education(text):
    RESERVED_EDU_WORDS = ['university', 'college', 'institute', 'school', 'academy', 'faculty']
    DEGREE_KEYWORDS = [
        'b.tech', 'b.e', 'bsc', 'b.s', 'bachelor',
        'm.tech', 'm.e', 'msc', 'm.s', 'master',
        'mba', 'phd', 'diploma', 'degree', 'b.ed'
    ]

    lines = [line.strip() for line in text.split('\n') if line.strip()]
    education_entries = set()

    i = 0
    while i < len(lines):
        lower_line = lines[i].lower()

        # Start only if we explicitly hit "Education" heading
        if re.match(r'education', lower_line):
            j = i + 1
            entry = ""

            while j < len(lines):
                next_line = lines[j].strip()
                # Stop if new section begins
                if re.search(r'about\s*me|technical skills|certifications|projects|experience|interests|extracurricular|email|phone', next_line, re.I):
                    break
                entry += " " + next_line
                j += 1

            education_entries.add(entry.strip())
            i = j
        else:
            i += 1

    return education_entries



def extract_skills(text):


    stop_words = set(nltk.corpus.stopwords.words('english'))
    word_tokens = nltk.tokenize.word_tokenize(text)


    filtered_tokens = [w for w in word_tokens if w not in stop_words]


    filtered_tokens = [w for w in word_tokens if w.isalpha()]


    bigrams_trigrams = list(map(' '.join, nltk.everygrams(filtered_tokens, 2, 3)))


    found_skills = set()


    for token in filtered_tokens:
        if token.lower() in SKILLS_DB:
            found_skills.add(token)


    for ngram in bigrams_trigrams:
        if ngram.lower() in SKILLS_DB:
            found_skills.add(ngram)

    return found_skills

# extract About me

def extract_about_me(text):
    lines = [line.strip() for line in text.split('\n') if line.strip()]
    about_me = [] 

   
    ABOUT_SECTION_HEADERS = [
        r'about\s*me',
        r'career\s*objective',
        r'career\s*objectives',
        r'objective',
        r'summary',
        r'professional\s*summary',
        r'profile',
        r'personal\s*statement',
        r'future\s*objectives',
        r'future\s*goals'
    ]

  
    SECTION_ENDERS = [
        r'education', r'technical\s*skills', r'certifications',
        r'projects', r'experience', r'work\s*experience',
        r'interests', r'extracurricular', r'skills'
    ]

    about_start_regex = re.compile('|'.join(ABOUT_SECTION_HEADERS), re.I)
    
   
    about_end_regex = re.compile(r'^\s*(' + '|'.join(SECTION_ENDERS) + r')\b', re.I)

    for i, line in enumerate(lines):
        # If this line indicates the start of the About/Objective/Summary section
        if about_start_regex.search(line):
            j = i + 1
            
            while j < len(lines):
                next_line = lines[j].strip()

              
                is_header_match = about_end_regex.search(next_line)
                is_short_line = len(next_line.split()) <= 5
                
                # Stop only if it looks like a distinct header
                if is_header_match and is_short_line:
                    break

                about_me.append(next_line)
                j += 1
            
            break # stop after extracting the section

    return " ".join(about_me).strip()

    
# clean the text (remove punctuation and digits).
def clean_text(text):
    if not isinstance(text, str):
        text = str(text)

   
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    # lowercase
    text = text.lower()
    # remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()

    
    stop = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word not in stop])

    return text


# ATS score prediction
def predict_ats_score(text):
    if not MODEL_AVAILABLE:
        return None
    cleaned = clean_text(text)
    vector = vectorizer.transform([cleaned])
    score = model.predict(vector)[0]
    return round(float(score), 2)



if __name__ == '__main__':
    print("ðŸ“„ Enter the path to your resume (PDF or DOCX):")
    resume_path = input("> ").strip()

    if resume_path.lower().endswith('.pdf'):
        text = extract_text_from_pdf(resume_path)
    elif resume_path.lower().endswith('.docx'):
        text = extract_text_from_docx(resume_path)
    else:
        print("Unsupported file format. Please use PDF or DOCX.")
        exit()

    name = extract_name(text)
    email, phone = extract_contact_info(text)
    skills = extract_skills(text)
    education = extract_education(text)
    about_me = extract_about_me(text)
    cleaned_text = clean_text(text)

    print("\n===============================")
    print("ðŸ“‹ Resume Summary")
    print("===============================")
    print(f"Name: {name}")
   
    print(f"Email: {email}")
    print(f"Phone: {phone}")
    print(f"Education: {', '.join(education) if education else 'Not found'}")
    print(f"Skills ({len(skills)}): {', '.join(skills)}")
    print(f"About Me: {about_me if about_me else 'Not found'}")
    print("===============================")

    if MODEL_AVAILABLE:
            vector = vectorizer.transform([cleaned_text])
            raw_pred = model.predict(vector)[0]

        
            pred = max(0, min(100, raw_pred))

            print(f"ATS Score: {pred:.2f}/100")

            #print(f"ATS Score: {pred}/100")
    else:
            print("ðŸ’¡ ATS model unavailable. Train it first to enable scoring.")


[nltk_data] Error loading punkt: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>
[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>
[nltk_data] Error loading averaged_perceptron_tagger_eng: <urlopen
[nltk_data]     error [Errno 11001] getaddrinfo failed>
[nltk_data] Error loading maxent_ne_chunker_tab: <urlopen error [Errno
[nltk_data]     11001] getaddrinfo failed>
[nltk_data] Error loading words: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


ðŸ“„ Enter the path to your resume (PDF or DOCX):


>  Divi_Kaustubh_Abhinand_Resume_Black.pdf



ðŸ“‹ Resume Summary
Name: Divi Kaustubh
Email: kaustabhinanddivi@gmail.com
Phone: None
Education: Vasavi College of Engineering B.Tech in Computer Science (Expected Graduation: 2027) Relevant Coursework: Data Structures & Algorithms, Operating Systems, Database Management Systems
Skills (11): Machine Learning, Java, Operating Systems, classification, SQL, Data Structures, MATLAB, C, Artificial Intelligence, scheduling, Python
About Me: Motivated and passionate Computer Science student with a strong foundation in programming, problem-solving, and software development. Eager to explore opportunities in AI & ML, software engineering, and database management. Enthusiastic about continuous learning and leveraging technical skills to solve real-world challenges.
ATS Score: 63.29/100


In [None]:
nltk.download('punkt_tab')

In [None]:
!pip install pdfminer.six



In [None]:
import pandas as pd

df = pd.read_json("master_resumes.jsonl", lines=True)
df.shape        # shows number of rows and columns
df.head()       # displays first 5 rows
df.columns      # shows available columns
print(df.columns)
df.iloc[0]

In [None]:
#convert dict object into raw text
def flatten_field(field):
    
    if isinstance(field, dict):
        parts = []
        for k, v in field.items():
            parts.append(flatten_field(v))
        return ' '.join(parts)
    elif isinstance(field, list):
        return ' '.join(flatten_field(item) for item in field)
    elif isinstance(field, str):
        return field
    elif pd.isna(field):
        return ''
    else:
        return str(field)


In [None]:
df["text"] = (
    df["personal_info"].apply(flatten_field) + " " +
    df["experience"].apply(flatten_field) + " " +
    df["education"].apply(flatten_field) + " " +
    df["skills"].apply(flatten_field) + " " +
    df["projects"].apply(flatten_field) + " " +
    df["certifications"].apply(flatten_field) + " " +
    df["achievements"].apply(flatten_field) + " " +
    df["workshops"].apply(flatten_field) + " " +
    df["publications"].apply(flatten_field) + " " +
    df["teaching_experience"].apply(flatten_field) + " " +
    df["internships"].apply(flatten_field)
)


In [None]:
print(df["text"].iloc[0][:1000])

In [None]:
# generate a synthetic set of scores for training purpose. Act as true values
def synthetic_score(text):
    text = str(text).lower()
    score = 0

    # core sections
    if "education" in text or "university" in text or "b.tech" in text:
        score += 10
    if "experience" in text or "developer" in text or "engineer" in text:
        score += 10
    if "project" in text:
        score += 10
    if "skill" in text or "programming" in text:
        score += 10

    # tech stack keywords
    if any(k in text for k in ["python", "java", "c++", "sql", "machine learning", "ai", "data"]):
        score += 10

    # extras
    if "intern" in text:
        score += 5
    if "certificate" in text or "certification" in text:
        score += 5
    if "achievement" in text or "award" in text:
        score += 5

    # word count (resume length)
    wc = len(text.split())
    if 200 < wc < 600:
        score += 10
    elif wc >= 600:
        score += 5

    # leadership/initiative
    if any(k in text for k in ["lead", "manage", "developed", "designed", "built"]):
        score += 10

    return min(score, 100)

    
df["score"] = df["text"].apply(synthetic_score)
print("Successful!")

In [None]:
df[["text", "score"]].head(5)


In [None]:
#!pip install matplotlib

import matplotlib.pyplot as plt

plt.hist(df["score"], bins=10)
plt.title("Synthetic ATS Score Distribution")
plt.xlabel("Score")
plt.ylabel("Number of Resumes")
plt.show()


In [None]:
# cleaning the data
import nltk
from nltk.corpus import stopwords
import regex as re
def clean_text(text):
    if not isinstance(text, str):
        text = str(text)

    # remove punctuation and digits
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    # lowercase
    text = text.lower()
    # remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()

    # remove stopwords
    stop = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word not in stop])

    return text

df["clean_text"] = df["text"].apply(clean_text)
print("Successfully cleaned!")
print(df["clean_text"].iloc[:300])

In [None]:
# training the ATS model
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge   
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import joblib




vectorizer = TfidfVectorizer(max_features=3000)
X = vectorizer.fit_transform(df["clean_text"])
y = df["score"].values 


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


model = Ridge(alpha=1.0)   
model.fit(X_train, y_train)


y_pred = model.predict(X_test)


y_pred_clipped = np.clip(y_pred, 0, 100)

print("MAE:", mean_absolute_error(y_test, y_pred_clipped))


joblib.dump(model, "ats_model.pkl")
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")

print("Model & vectorizer saved.")

In [None]:
import matplotlib.pyplot as plt
import numpy as np


plt.figure(figsize=(12,5))
plt.plot(y_test, label="True Scores")
plt.plot(y_pred, label="Predicted Scores")
plt.xlabel("Resume Index")
plt.ylabel("Score")
plt.title("True vs Predicted Scores")
plt.legend()
plt.grid(True)
plt.show()



In [None]:
# save the model
import joblib

joblib.dump(model, "ats_model.pkl")
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")

print(" Model and vectorizer saved successfully!")


In [None]:
import os
print(os.listdir())

In [None]:

import re
import nltk
import pandas as pd
import docx2txt
from pdfminer.high_level import extract_text
#import pdfminer.high_level
from nltk.corpus import stopwords
import os
import joblib
from joblib import load
import pickle
import numpy as np

MODEL_PATH = "ats_model.pkl"
VECTORIZER_PATH = "tfidf_vectorizer.pkl"

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('maxent_ne_chunker_tab')
nltk.download('words')

if os.path.exists(MODEL_PATH) and os.path.exists(VECTORIZER_PATH):
    model = joblib.load(MODEL_PATH)
    vectorizer = joblib.load(VECTORIZER_PATH)
    MODEL_AVAILABLE = True
else:
    print("ATS model not found!")
    MODEL_AVAILABLE = False


skills_df = pd.read_excel("skills.xlsx")
SKILLS_DB = skills_df.iloc[:, 0].dropna().str.lower().tolist()

RESERVED_EDU_WORDS = ['university', 'college', 'institute', 'school', 'academy', 'faculty']


def extract_text_from_docx(docx_path):
    txt = docx2txt.process(docx_path)
    if txt:
        return txt.replace('\t', ' ')
    return ""

def extract_text_from_pdf(pdf_path):
    return extract_text(pdf_path) or ""



#  Extract Email & Phone

def extract_contact_info(text):
    email = None
    phone = None

    # Email
    email_match = re.search(r'[\w\.-]+@[\w\.-]+\.\w+', text)
    if email_match:
        email = email_match.group(0)

    # Phone
    phone_match = re.search(r'(\+?\d{1,3}[\s\-]?)?\(?\d{2,4}\)?[\s\-]?\d{3,5}[\s\-]?\d{3,5}', text)
    if phone_match:
        phone = phone_match.group(0)

    return email, phone



#  Extract Name (first line or first proper noun pair)

def extract_name(text):
    lines = [line.strip() for line in text.split('\n') if line.strip()]
    first_line = lines[0]
    words = nltk.word_tokenize(first_line)
    tagged = nltk.pos_tag(words)

    proper_nouns = [word for word, pos in tagged if pos == 'NNP']
    if len(proper_nouns) >= 2:
        return f"{proper_nouns[0]} {proper_nouns[1]}"
    elif proper_nouns:
        return proper_nouns[0]
    return first_line

#  Extract Education

def extract_education(text):
    RESERVED_EDU_WORDS = ['university', 'college', 'institute', 'school', 'academy', 'faculty']
    DEGREE_KEYWORDS = [
        'b.tech', 'b.e', 'bsc', 'b.s', 'bachelor',
        'm.tech', 'm.e', 'msc', 'm.s', 'master',
        'mba', 'phd', 'diploma', 'degree', 'b.ed'
    ]

    lines = [line.strip() for line in text.split('\n') if line.strip()]
    education_entries = set()

    i = 0
    while i < len(lines):
        lower_line = lines[i].lower()

        # Start only if we explicitly hit "Education" heading
        if re.match(r'education', lower_line):
            j = i + 1
            entry = ""

            while j < len(lines):
                next_line = lines[j].strip()
                # Stop if new section begins
                if re.search(r'about\s*me|technical skills|certifications|projects|experience|interests|extracurricular|email|phone', next_line, re.I):
                    break
                entry += " " + next_line
                j += 1

            education_entries.add(entry.strip())
            i = j
        else:
            i += 1

    return education_entries



def extract_skills(text):


    stop_words = set(nltk.corpus.stopwords.words('english'))
    word_tokens = nltk.tokenize.word_tokenize(text)


    filtered_tokens = [w for w in word_tokens if w not in stop_words]


    filtered_tokens = [w for w in word_tokens if w.isalpha()]


    bigrams_trigrams = list(map(' '.join, nltk.everygrams(filtered_tokens, 2, 3)))


    found_skills = set()


    for token in filtered_tokens:
        if token.lower() in SKILLS_DB:
            found_skills.add(token)


    for ngram in bigrams_trigrams:
        if ngram.lower() in SKILLS_DB:
            found_skills.add(ngram)

    return found_skills

# extract About me

def extract_about_me(text):
    lines = [line.strip() for line in text.split('\n') if line.strip()]
    about_me = ""
    
    for i, line in enumerate(lines):
        if re.search(r'about\s*me', line, re.I):
            j = i + 1
            while j < len(lines):
                next_line = lines[j].strip()
                # Stop if we reach a new section
                if re.search(r'education|technical skills|certifications|projects|experience|interests|extracurricular', next_line, re.I):
                    break
                about_me += " " + next_line
                j += 1
            break
    return about_me.strip()

def clean_text(text):
    if not isinstance(text, str):
        text = str(text)

    # remove punctuation and digits
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    # lowercase
    text = text.lower()
    # remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()

    # remove stopwords
    stop = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word not in stop])

    return text

# generate a synthetic set of scores for training purpose. Act as true values
def synthetic_score(text):
    text = str(text).lower()
    score = 0

    # core sections
    if "education" in text or "university" in text or "b.tech" in text:
        score += 10
    if "experience" in text or "developer" in text or "engineer" in text:
        score += 10
    if "project" in text:
        score += 10
    if "skill" in text or "programming" in text:
        score += 10

    # tech stack keywords
    if any(k in text for k in ["python", "java", "c++", "sql", "machine learning", "ai", "data"]):
        score += 10

    # extras
    if "intern" in text:
        score += 5
    if "certificate" in text or "certification" in text:
        score += 5
    if "achievement" in text or "award" in text:
        score += 5

    # word count (resume length)
    wc = len(text.split())
    if 200 < wc < 600:
        score += 10
    elif wc >= 600:
        score += 5

    # leadership/initiative
    if any(k in text for k in ["lead", "manage", "developed", "designed", "built"]):
        score += 10

    return min(score, 100)

    
#df["score"] = df["text"].apply(synthetic_score)
#print("Successful!")


if __name__ == '__main__':
    print("ðŸ“„ Enter the path to your resume (PDF or DOCX):")
    resume_path = input("> ").strip()

    if resume_path.lower().endswith('.pdf'):
        text = extract_text_from_pdf(resume_path)
    elif resume_path.lower().endswith('.docx'):
        text = extract_text_from_docx(resume_path)
    else:
        print("Unsupported file format. Please use PDF or DOCX.")
        exit()

    name = extract_name(text)
    email, phone = extract_contact_info(text)
    skills = extract_skills(text)
    education = extract_education(text)
    about_me = extract_about_me(text)

    print("\n===============================")
    print("ðŸ“‹ Resume Summary")
    print("===============================")
    print(f"Name: {name}")
    #print(f"About Me: {about_me if about_me else 'Not found'}")
    print(f"Email: {email}")
    print(f"Phone: {phone}")
    print(f"Education: {', '.join(education) if education else 'Not found'}")
    print(f"Skills ({len(skills)}): {', '.join(skills)}")
    print(f"About Me: {about_me if about_me else 'Not found'}")
    print("===============================")

    ats_score = synthetic_score(text)
    print(f"ATS score: {ats_score}/100")
    

In [2]:
!git --version


git version 2.52.0.windows.1


In [4]:
!git config --global user.name "KaustAbhinand"
!git config --global user.email "kaustubhabhinan@gmail.com"


In [6]:
!pwd


'pwd' is not recognized as an internal or external command,
operable program or batch file.


In [7]:
!git init


Initialized empty Git repository in C:/Users/RAGUNATH/.git/


In [9]:
!git remote add origin https://github.com/KaustAbhinand/resume_parser-rating.git

error: remote origin already exists.
