In [None]:
!pip install -q pandas scikit-learn PyMuPDF


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m85.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import pandas as pd
import fitz  # PyMuPDF
import re
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


In [None]:
# Load the scored resumes, keeping only the text and label columns and
# dropping rows where either one is missing.
df = pd.read_csv("ResumeWithScores.csv")[['Resume_str', 'Score']].dropna()

# Collapse every run of whitespace into a single space and trim both ends.
df['Resume_str'] = df['Resume_str'].map(lambda raw: re.sub(r'\s+', ' ', str(raw)).strip())


In [None]:
# Hold out 20% of the resumes for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(df['Resume_str'], df['Score'], test_size=0.2, random_state=42)

# Baseline model: TF-IDF features (at most 3000 terms, English stop words
# removed) fed into an ordinary least-squares regression.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=3000, stop_words='english')),
    ('reg', LinearRegression())
])

pipeline.fit(X_train, y_train)
predictions = pipeline.predict(X_test)
# mean_squared_error returns the MSE; take the square root so the value
# printed under the "RMSE" label really is the root mean squared error.
print(f"Model RMSE: {mean_squared_error(y_test, predictions) ** 0.5:.2f}")


Model RMSE: 179.36


In [None]:
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as one whitespace-normalised string.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF (``fitz``).

    Returns:
        The page texts concatenated in page order, with each run of
        whitespace collapsed to a single space and both ends stripped.
    """
    # The parameter used to be misspelled (`pdf_pah`), which made the body read
    # a global `pdf_path` instead of the argument the caller passed in.
    # The context manager closes the document once its pages have been read.
    with fitz.open(pdf_path) as doc:
        text = "".join(page.get_text() for page in doc)
    return re.sub(r'\s+', ' ', text).strip()


In [None]:
def generate_suggestions(resume_text):
    """Score resume text with the TF-IDF pipeline and return feedback as JSON.

    Args:
        resume_text: Plain text of the resume.

    Returns:
        A JSON string (indent of 2) with the predicted ``score`` rounded to two
        decimals and a ``suggestions`` object holding ``word_replacements``,
        ``missing_sections`` and ``style_feedback``.
    """
    predicted = pipeline.predict([resume_text])[0]
    lowered = resume_text.lower()

    # Weak phrase -> stronger alternative; only phrases found in the text are reported.
    phrase_upgrades = (
        ('handled', 'managed'),
        ('worked on', 'developed'),
        ('responsible for', 'led'),
        ('helped', 'contributed to'),
        ('tasked with', 'spearheaded'),
    )
    word_replacements = {}
    for weak, strong in phrase_upgrades:
        if weak in lowered:
            word_replacements[weak] = strong

    # A section counts as present when its keyword appears anywhere in the text.
    section_keywords = (
        ('certification', 'Certifications'),
        ('project', 'Projects'),
        ('summary', 'Summary'),
    )
    missing_sections = [label for keyword, label in section_keywords if keyword not in lowered]

    style_feedback = []
    if len(resume_text.split()) < 150:
        style_feedback.append("Add more detail to expand resume content")
    if len(word_replacements) == 0:
        style_feedback.append("Use more powerful action verbs")

    return json.dumps(
        {
            "score": round(predicted, 2),
            "suggestions": {
                "word_replacements": word_replacements,
                "missing_sections": missing_sections,
                "style_feedback": style_feedback,
            },
        },
        indent=2,
    )


In [None]:
# Replace with the actual filename of the resume to analyse
pdf_path = "SuryaResume.pdf"

# Extract the PDF text, then print the JSON feedback from the TF-IDF pipeline.
resume_text = extract_text_from_pdf(pdf_path)
suggestions_json = generate_suggestions(resume_text)
print(suggestions_json)


{
  "score": 34.01,
  "suggestions": {
    "word_replacements": {},
    "missing_sections": [
      "Certifications",
      "Summary"
    ],
    "style_feedback": [
      "Use more powerful action verbs"
    ]
  }
}


In [None]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Score the held-out split with the TF-IDF + linear regression pipeline
predictions = pipeline.predict(X_test)

# Error metrics; RMSE is the square root of the mean squared error
mae = mean_absolute_error(y_test, predictions)
rmse = mean_squared_error(y_test, predictions) ** 0.5
r2 = r2_score(y_test, predictions)

# Assemble the report and print it in one go
report_lines = [
    "Model Evaluation:",
    f"  MAE : {mae:.2f}",
    f"  RMSE: {rmse:.2f}",
    f"  R² Score: {r2:.2f}",
]
print("\n".join(report_lines))


Model Evaluation:
  MAE : 10.48
  RMSE: 13.39
  R² Score: -2.21


In [None]:
import numpy as np

def extract_features(text):
    """Turn resume text into a flat dict of hand-crafted numeric features.

    The text is lowercased first, so every measurement is case-insensitive.
    Keyword counts use ``str.count`` and presence flags use ``in``; both are
    substring matches, so for example 'led' is also counted inside 'knowledge'.

    Args:
        text: Resume text as a string.

    Returns:
        A dict whose keys, in insertion order, are ``length``, ``word_count``,
        ``num_skills``, ``num_projects``, ``num_certifications``,
        ``has_summary``, ``has_experience``, ``has_education``, ``has_awards``
        and ``num_action_words``. Counts are ints; ``has_*`` flags are 0 or 1.
    """
    lowered = text.lower()

    def occurrences(fragment):
        # Number of non-overlapping occurrences of the fragment in the text.
        return lowered.count(fragment)

    def flag(*fragments):
        # 1 when at least one fragment appears in the text, otherwise 0.
        return int(any(fragment in lowered for fragment in fragments))

    action_total = 0
    for verb in ('led', 'developed', 'managed', 'spearheaded'):
        action_total += occurrences(verb)

    # Key order matters: it becomes the column order of the feature matrix.
    return {
        'length': len(lowered),
        'word_count': len(lowered.split()),
        'num_skills': occurrences('skill'),
        'num_projects': occurrences('project'),
        'num_certifications': occurrences('certification'),
        'has_summary': flag('summary'),
        'has_experience': flag('experience'),
        'has_education': flag('education'),
        'has_awards': flag('award', 'achievement'),
        'num_action_words': action_total,
    }

# Build the feature matrix with a single DataFrame construction instead of
# `.apply(pd.Series)`, which creates a separate Series object for every row.
# Reusing df.index keeps each feature row aligned with its entry in df['Score'].
feature_df = pd.DataFrame(df['Resume_str'].map(extract_features).tolist(), index=df.index)
X = feature_df
y = df['Score']


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Re-split on the hand-crafted features: 20% held out, fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 100-tree random forest; the seed makes the fitted model reproducible.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Predictions on the held-out rows, evaluated in the next cell.
predictions = rf.predict(X_test)


In [None]:
# Error metrics for the random forest on the held-out split;
# RMSE is the square root of the mean squared error.
mae = mean_absolute_error(y_test, predictions)
rmse = mean_squared_error(y_test, predictions) ** 0.5
r2 = r2_score(y_test, predictions)

report_lines = [
    "Random Forest Regressor Evaluation:",
    f"  MAE : {mae:.2f}",
    f"  RMSE: {rmse:.2f}",
    f"  R² Score: {r2:.2f}",
]
print("\n".join(report_lines))


Random Forest Regressor Evaluation:
  MAE : 2.50
  RMSE: 3.07
  R² Score: 0.83


In [None]:
import json

def generate_resume_feedback(pdf_path):
    """Score a resume PDF with the random forest model and return feedback as JSON.

    Args:
        pdf_path: Path of the resume PDF to analyse.

    Returns:
        A JSON string (indent of 2) with the predicted ``score`` rounded to two
        decimals and a ``suggestions`` object holding ``word_replacements``,
        ``missing_sections`` and ``style_feedback``.
    """
    # Extract text from PDF
    text = extract_text_from_pdf(pdf_path)
    lowered = text.lower()

    # One-row frame so the model receives the same columns it was trained on
    features = pd.DataFrame([extract_features(text)])
    # Cast to a built-in float so the value is always JSON serialisable
    score = float(rf.predict(features)[0])

    # Weak phrase -> stronger alternative; only phrases found in the text are reported
    improvements = {
        'handled': 'managed',
        'worked on': 'developed',
        'responsible for': 'led',
        'helped': 'contributed to',
        'tasked with': 'spearheaded'
    }
    word_replacements = {k: v for k, v in improvements.items() if k in lowered}

    # A section counts as present when any of its keywords appears in the text.
    # Certifications accepts both 'certification' and 'certificate': the old
    # check looked only for 'certificates', so a resume headed "Certifications"
    # was wrongly reported as missing the section.
    section_keywords = (
        ('Certifications', ('certification', 'certificate')),
        ('Projects', ('project',)),
        ('Education', ('education',)),
        ('Experience', ('experience',)),
        ('Skills', ('skills',)),
    )
    missing_sections = [
        label
        for label, keywords in section_keywords
        if not any(keyword in lowered for keyword in keywords)
    ]

    feedback = []
    if len(text.split()) < 150:
        feedback.append("Add more detail to expand resume content")
    if not word_replacements:
        feedback.append("Use more powerful action verbs")

    result = {
        "score": round(score, 2),
        "suggestions": {
            "word_replacements": word_replacements,
            "missing_sections": missing_sections,
            "style_feedback": feedback
        }
    }

    return json.dumps(result, indent=2)


In [None]:
# Change this to match the uploaded file name
resume_path = "SuryaResume.pdf"

# Score the resume with the random forest model and print the JSON feedback.
feedback_json = generate_resume_feedback(resume_path)
print(feedback_json)


{
  "score": 53.3,
  "suggestions": {
    "word_replacements": {},
    "missing_sections": [
      "Summary"
    ],
    "style_feedback": [
      "Use more powerful action verbs"
    ]
  }
}


In [None]:
import joblib

# Save the trained random forest (`rf`) to resume_score_model.pkl, a path
# relative to the current working directory. The value returned by
# joblib.dump is the last expression of the cell, so it is shown as the
# cell output. `rf` was fitted on the columns produced by extract_features,
# so anything that loads this file must supply features in that same layout.
joblib.dump(rf, "resume_score_model.pkl")


['resume_score_model.pkl']

In [None]:
import joblib

# Load the saved model back from resume_score_model.pkl into `model`.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code, so only load model files from a trusted source.
# NOTE(review): `model` is not used by any later cell visible here; the
# feedback function above calls `rf` directly.
model = joblib.load("resume_score_model.pkl")
