In [3]:
import os
import fitz  # PyMuPDF
import re
import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import joblib
from sklearn.linear_model import LinearRegression



In [4]:
# Fetch the NLTK resources that the cleaning step relies on.
# 'punkt' is the tokenizer model used by older NLTK releases; newer releases
# look up 'punkt_tab' instead when nltk.word_tokenize is called, so both are
# requested to keep the notebook runnable on a fresh machine with either.
nltk.download('punkt')
nltk.download('punkt_tab')
# 'wordnet' backs WordNetLemmatizer. Kept as the final expression so the cell
# still displays the download status (True on success).
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\guruk\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\guruk\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        A single string made of each page's ``get_text()`` output joined
        with no separator (empty string for a document with no pages).

    Raises:
        Whatever ``fitz.open`` / ``page.get_text`` raise for an unreadable
        or corrupt file; the document is closed before the error propagates.
    """
    doc = fitz.open(pdf_path)
    try:
        # join() avoids rebuilding the accumulated string once per page.
        return "".join(page.get_text() for page in doc)
    finally:
        # Always release the file handle, even if a page fails to parse.
        doc.close()


def extract_multiline_field(start, text):
    """Extract the body that follows a ``<start>:`` heading in ``text``.

    The heading is matched literally and case-insensitively. The captured
    body runs from just after the colon up to (but not including) the next
    newline that is immediately followed by a run of word characters and a
    colon, or to the end of ``text`` if no such line exists. Only the first
    occurrence of the heading is used.

    Args:
        start: Heading text without the trailing colon (e.g. ``"Skills"``).
        text: Full text to search.

    Returns:
        The captured body with surrounding whitespace stripped and every
        newline replaced by a single space, or ``""`` when the heading is
        not present.
    """
    # re.escape keeps regex metacharacters in the heading ("C++",
    # "Skills (Technical)") from being interpreted as pattern syntax.
    pattern = re.compile(
        rf"{re.escape(start)}:(.*?)(?=\n\w+:|\Z)",
        re.DOTALL | re.IGNORECASE,
    )
    match = pattern.search(text)
    if match is None:
        return ""
    return match.group(1).strip().replace('\n', ' ')


In [6]:
folder_path = r"D:\PrectisePython\ai_interview_app\resume_module\dataset\resume_detail"

# Headings extracted from each resume; the order here is the key order of
# every row dict.
section_names = (
    "Education",
    "Skills",
    "Projects",
    "Certifications",
    "Experience",
    "Achievements",
)

# One dict per PDF found directly inside folder_path.
# NOTE(review): os.listdir gives no ordering guarantee, and later cells pair
# these rows with resume_score.csv purely by position — confirm the two line up.
all_data = []
for file_name in os.listdir(folder_path):
    if not file_name.endswith(".pdf"):
        continue

    resume_text = extract_text_from_pdf(os.path.join(folder_path, file_name))
    all_data.append(
        {
            section: extract_multiline_field(section, resume_text)
            for section in section_names
        }
    )


In [7]:
# Spot-check the extraction: print the "Education" text captured for the
# third collected resume (index 2). Raises IndexError if fewer than three
# PDFs were read.
print(all_data[2]['Education'])

B.Tech in Computer Science, Delhi Technological University, 2022 — 9.0 CGPA


In [8]:
# Shared lemmatizer instance, reused across every clean_text call.
lemmatizer = WordNetLemmatizer()


def clean_text(text):
    """Normalise a piece of resume text for vectorisation.

    Steps, in order: lowercase the input, delete every character that is
    not a-z or whitespace (digits and punctuation are removed outright, not
    replaced by a space), tokenize with ``nltk.word_tokenize``, lemmatize
    each token with the module-level ``lemmatizer``, and join the results
    with single spaces.

    Args:
        text: The string to clean.

    Returns:
        The cleaned, space-separated string.
    """
    lowered = text.lower()
    letters_only = re.sub(r'[^a-z\s]', '', lowered)
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in nltk.word_tokenize(letters_only)
    )


In [9]:
# Turn the list of per-resume dicts into a table (one column per heading).
df = pd.DataFrame(all_data)

print("Before Cleaning:\n", df.head())

# Clean every column in place; missing values become empty strings first so
# clean_text always receives a str.
for col in df.columns:
    non_null = df[col].fillna('')
    df[col] = non_null.apply(clean_text)

print("After Cleaning:\n", df.head())


Before Cleaning:
                                            Education  \
0  B.Tech in Computer Science, NIT Trichy, 2023 —...   
1           B.Tech, unknown college, 2021 — 6.2 CGPA   
2  B.Tech in Computer Science, Delhi Technologica...   
3  B.E. in Information Science, PES University, 2...   
4  B.Tech in Data Science, SRM Institute, 2022 — ...   

                                              Skills  \
0  - Python, Scikit-Learn, Pandas, Numpy    - Mac...   
1                           - ML (basic)    - Python   
2  - Python, Scikit-learn, NumPy, Pandas    - Ten...   
3  - PyTorch, TensorFlow    - FastAPI, GitHub, Do...   
4  - Python, Pandas, NLTK, SpaCy    - BERT, SVM, ...   

                                            Projects  \
0  1. House Price Prediction (Linear Regression) ...   
1            - College project: Car price prediction   
2  1. Traffic Sign Recognition using CNN and Open...   
3  1. Real-time Face Mask Detection using YOLOv5 ...   
4  1. Resume Screening Autom

In [10]:
# Build one space-separated document per resume by appending the section
# columns left to right, starting from "Education".
combined = df["Education"]
for section in ("Skills", "Projects", "Certifications", "Experience", "Achievements"):
    combined = combined + " " + df[section]
df["combined_text"] = combined


In [11]:
# TF-IDF over the combined resume text: English stop words are dropped and
# the vocabulary is capped at 1000 terms.
vectorizer = TfidfVectorizer(
    max_features=1000,
    stop_words='english',
)

# Learn the vocabulary and produce the feature matrix in one pass.
X = vectorizer.fit_transform(df['combined_text'])


In [13]:
# Load the target scores for the resumes.
resume_outputs = pd.read_csv(
    r"D:\PrectisePython\ai_interview_app\resume_module\dataset\resume_score.csv"
)
y = resume_outputs['score']

# NOTE(review): X and y are matched row-by-row by position only; nothing here
# checks that CSV row i belongs to the i-th PDF that was read — confirm.
model = LinearRegression()

# Left as the final expression so the cell displays the fitted estimator.
model.fit(X, y)


0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [14]:
# Persist the fitted regressor and the fitted TF-IDF vectorizer. Both paths
# are relative, so the files land in the current working directory.
# Presumably the two are loaded together when scoring new resumes — the
# vectorizer must be the same one the model was trained with.
joblib.dump(model, "resume_model.pkl")
# Final expression: the cell displays the list of file names joblib wrote.
joblib.dump(vectorizer, 'vectorizer.pkl')


['vectorizer.pkl']