In [None]:
!pip install --upgrade nltk scikit-learn pandas matplotlib PyPDF2

# Force clean install of nltk data
import nltk
nltk.download('punkt', force=True)
nltk.download('stopwords', force=True)


In [None]:
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
import PyPDF2

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords


In [None]:
# Colab-only helper module; this import is not available outside Google Colab.
from google.colab import files

# Upload your resume (PDF format)
# The result is kept in `uploaded`; a later cell reads its keys to find the
# name of the uploaded file.
uploaded = files.upload()


In [None]:
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as a single string.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages that yielded any text, joined with newlines.
        An empty string if no page produced text.
    """
    page_texts = []
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        for page in reader.pages:
            # Extract once per page: extraction re-parses the page content,
            # so calling it twice doubles the work.
            page_text = page.extract_text()
            if page_text:
                page_texts.append(page_text)
    # Separate pages with a newline so the last word of one page does not
    # fuse with the first word of the next.
    return "\n".join(page_texts)

# Auto-detect uploaded file
if not uploaded:
    # Fail with a clear message instead of an IndexError when nothing was
    # uploaded (e.g. the upload dialog was dismissed).
    raise ValueError("No file was uploaded. Re-run the upload cell and choose a PDF resume.")

# Use the first uploaded file; any additional uploads are ignored.
pdf_filename = next(iter(uploaded))
cv_text = extract_text_from_pdf(pdf_filename)

if not cv_text.strip():
    # Image-only (scanned) PDFs carry no text layer, so later cells would
    # have nothing to analyse.
    print("⚠️ No text could be extracted from the uploaded PDF.")

print("📄 Extracted CV Text Preview:\n")
print(cv_text[:1000])


In [None]:
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

def preprocess_text(text):
    """Normalise raw text into a space-separated string of content words.

    Steps: lowercase the text, turn every character that is not an ASCII
    letter or whitespace into a space, split on whitespace, and drop
    scikit-learn's English stop words.

    Args:
        text: Raw input string.

    Returns:
        The remaining tokens joined by single spaces ("" if none remain).
    """
    # Convert to lowercase
    text = text.lower()

    # Replace special characters and digits with a space (not the empty
    # string) so that words separated only by punctuation stay separate,
    # e.g. "python/java" -> "python java" rather than "pythonjava".
    text = re.sub(r'[^a-z\s]', ' ', text)

    # Tokenize on runs of whitespace (plain str.split, no nltk)
    tokens = text.split()

    # Remove stopwords using scikit-learn's stopwords
    tokens = [word for word in tokens if word not in ENGLISH_STOP_WORDS]

    return " ".join(tokens)

# Clean the extracted CV text once; later cells read `processed_cv_text`.
processed_cv_text = preprocess_text(cv_text)

# Print a heading followed by the first 500 characters of the cleaned text.
print("🧹 Preprocessed CV Text:\n", processed_cv_text[:500], sep="\n")


In [None]:
# The corpus is a single document: the preprocessed CV.
cv_corpus = [processed_cv_text]

# Fit a TF-IDF vocabulary capped at 100 features and convert the resulting
# matrix to a dense array.
vectorizer = TfidfVectorizer(max_features=100)
X_cv = vectorizer.fit_transform(cv_corpus).toarray()

print("✅ TF-IDF Feature Vector Shape:", X_cv.shape)


In [None]:
# Keywords associated with each personality trait. Keywords must be lowercase
# single words, because they are compared against the lowercased,
# whitespace-separated tokens of the preprocessed CV.
traits_keywords = {
    "Openness": ["creative", "curious", "imaginative", "innovation", "design"],
    "Conscientiousness": ["organized", "responsible", "dependable", "punctual", "plan"],
    "Extraversion": ["outgoing", "energetic", "team", "communication", "public"],
    "Agreeableness": ["cooperative", "friendly", "kind", "empathetic", "collaboration"],
    "Neuroticism": ["anxious", "moody", "tense", "insecure", "nervous"]
}

# Score on whole tokens rather than substrings: str.count on the full text
# would also count "plan" inside "planet", "kind" inside "mankind" or
# "tense" inside "intense". Inflected forms ("teams", "planning") are not
# matched; add them to the lists above if they should count.
cv_tokens = processed_cv_text.split()

trait_scores = {
    trait: sum(cv_tokens.count(word) for word in keywords)
    for trait, keywords in traits_keywords.items()
}

trait_scores


In [None]:
# Bar chart of keyword counts per trait, drawn on an explicit Axes object.
# NOTE(review): the emoji in the title may not render with matplotlib's
# default font — confirm in the target environment.
fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(trait_scores.keys(), trait_scores.values(), color='teal')
ax.set_title("🧠 Predicted Personality Traits from CV")
ax.set_xlabel("Personality Trait")
ax.set_ylabel("Keyword Frequency")
ax.grid(True)
plt.show()
