# CV Analytics Dashboard

This notebook provides comprehensive CV analysis including:
- Word Cloud generation
- Top words analysis
- Year mentions tracking
- Keyword frequency analysis
- Summary statistics

In [None]:
# Install required packages
!pip install python-docx pdfplumber PyPDF2 wordcloud nltk unidecode matplotlib pandas plotly

In [None]:
# Import libraries and setup
import os, re, io
import pdfplumber, PyPDF2, docx
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from collections import Counter
from datetime import datetime
from unidecode import unidecode
import nltk

# Download NLTK data
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

from nltk.corpus import stopwords

print("✅ All libraries loaded successfully!")

In [None]:
# File reading functions
def read_txt(path: str) -> str:
    """Return the full contents of a text file decoded as UTF-8.

    Bytes that cannot be decoded are dropped (``errors='ignore'``) instead
    of raising, so a partially corrupt file still yields its readable text.
    """
    with open(path, mode='r', encoding='utf-8', errors='ignore') as handle:
        contents = handle.read()
    return contents

def read_docx(path: str) -> str:
    """Return the text of a .docx file, one document paragraph per line."""
    document = docx.Document(path)
    paragraph_texts = [para.text for para in document.paragraphs]
    return "\n".join(paragraph_texts)

def read_pdf(path: str) -> str:
    """Extract the text of a PDF, one page per line-joined chunk.

    pdfplumber is tried first. If it raises, or yields only whitespace,
    the file is re-read with PyPDF2 and that result is returned instead.
    Errors raised by the PyPDF2 pass are not caught here.
    """
    # First attempt: pdfplumber.
    try:
        with pdfplumber.open(path) as pdf:
            primary = "\n".join((page.extract_text() or "") for page in pdf.pages)
        if primary.strip():
            return primary
    except Exception as e:
        print(f"pdfplumber failed: {e}, trying PyPDF2...")

    # Second attempt: PyPDF2 (reached on failure or on empty extraction).
    with open(path, 'rb') as f:
        reader = PyPDF2.PdfReader(f)
        page_texts = [(p.extract_text() or "") for p in reader.pages]
    return "\n".join(page_texts)

def read_any(path: str) -> str:
    """Read a CV file, choosing the reader from the file extension.

    ``.pdf``, ``.docx`` and ``.txt`` (case-insensitive) are dispatched to
    ``read_pdf``, ``read_docx`` and ``read_txt``; errors from those readers
    propagate to the caller. Any other extension is read as plain text on a
    best-effort basis.

    Args:
        path: Path of the file to read.

    Returns:
        The extracted text, or ``""`` when a file with an unrecognised
        extension cannot be opened or read as text.
    """
    ext = os.path.splitext(path.lower())[1]
    if ext == ".pdf":
        return read_pdf(path)
    if ext == ".docx":
        return read_docx(path)
    if ext == ".txt":
        return read_txt(path)

    # Unknown extension: fall back to plain-text reading. Only I/O-level
    # failures (missing file, permissions, invalid path) are treated as
    # "no text"; a bare ``except`` would also hide KeyboardInterrupt and
    # genuine programming errors.
    try:
        return read_txt(path)
    except (OSError, ValueError):
        return ""

print("✅ File reading functions defined!")

In [None]:
# Tokenization and text processing
# A token is a run of ASCII letters/digits, optionally continued by groups
# that are joined with a single '-' or '_' (e.g. "full-stack", "scikit_learn").
_TOKEN_RE = re.compile(r"[A-Za-z0-9]+(?:[-_][A-Za-z0-9]+)*")
# NLTK's English stopword list, held as a set for fast membership tests.
EN_STOP = set(stopwords.words("english"))

# Custom stopwords for CV analysis
# (lower-case words filtered out in addition to EN_STOP by tokenize()).
CUSTOM_STOP = {
    'cv', 'resume', 'responsible', 'worked', 'using', 'project', 'projects', 
    'tool', 'performed', 'objective', 'summary', 'include', 'data', 
    'management', 'experience', 'senior', 'lead', 'team', 'develop', 
    'developed', 'design', 'designed', 'implement', 'implemented', 'build', 
    'built', 'created', 'skills', 'skill', 'abilities', 'ability', 
    'proficient', 'knowledge', 'strong', 'understanding', 'etc'
}

def tokenize(text: str, extra_stop: set = None):
    """Split text into lower-case tokens with stopwords removed.

    The text is transliterated with ``unidecode`` and whitespace runs are
    collapsed to single spaces before matching ``_TOKEN_RE``.

    Args:
        text: Raw text; ``None`` or empty is treated as "".
        extra_stop: Optional additional stopwords (compared lower-cased).

    Returns:
        A ``(tokens, normalized_text)`` tuple. ``tokens`` excludes
        all-digit tokens, tokens of two characters or fewer, and anything
        in ``EN_STOP``, ``CUSTOM_STOP`` or ``extra_stop``.
    """
    normalized = re.sub(r"\s+", " ", unidecode(text or ""))

    # Build a fresh set per call so the module-level sets are never mutated.
    blocked = EN_STOP | CUSTOM_STOP
    if extra_stop:
        blocked |= {word.lower() for word in extra_stop}

    kept = []
    for match in _TOKEN_RE.findall(normalized):
        token = match.lower()
        if token.isdigit() or len(token) <= 2 or token in blocked:
            continue
        kept.append(token)
    return kept, normalized

print("✅ Tokenization functions defined!")

In [None]:
# Year extraction functions
# Matches a standalone 19xx/20xx year, or a "start - end" range whose end may
# be a year or the words "present"/"now". Each match yields three groups:
# (standalone year, range start, range end); unused groups are empty strings.
YEAR_RE = re.compile(
    r"\b(19\d{2}|20\d{2})\b|\b(19\d{2}|20\d{2})\s*(?:-|–|to)\s*(19\d{2}|20\d{2}|present|now)\b",
    re.IGNORECASE,
)

def extract_year_mentions(raw_text: str):
    """Return every plausible year mentioned in ``raw_text``, in match order.

    A year is kept when it lies between 1950 and next calendar year
    (inclusive). The range-end placeholders "present" and "now" are skipped.
    Duplicates are preserved so callers can count mentions.
    """
    placeholders = ("present", "now")
    collected = []
    for groups in YEAR_RE.findall(raw_text):
        for piece in groups:
            # Empty strings are groups that did not take part in the match.
            if not piece or piece.lower() in placeholders:
                continue
            try:
                value = int(piece)
            except ValueError:
                continue
            if 1950 <= value <= datetime.now().year + 1:
                collected.append(value)
    return collected

print("✅ Year extraction functions defined!")

In [None]:
# Main CV Analysis Class
class CVAnalyzer:
    """Analyze a single CV file: word frequencies, year mentions and keywords.

    On construction the file is read with ``read_any``, tokenized with
    ``tokenize`` and scanned with ``extract_year_mentions``. The results are
    stored on the instance and reused by the reporting and plotting methods.

    Attributes:
        file_path: Path passed to the constructor.
        file_name: Base name of ``file_path`` (used in chart titles).
        raw_text: Text returned by ``read_any``.
        tokens: Token list returned by ``tokenize`` (stopwords removed).
        normalized_text: Normalized text returned by ``tokenize``.
        word_counts: ``Counter`` over ``tokens``.
        year_mentions: Years returned by ``extract_year_mentions``.
    """

    def __init__(self, file_path: str):
        """Read and pre-process the CV located at ``file_path``."""
        self.file_path = file_path
        self.file_name = os.path.basename(file_path)
        self.raw_text = read_any(file_path)
        self.tokens, self.normalized_text = tokenize(self.raw_text)
        self.word_counts = Counter(self.tokens)
        self.year_mentions = extract_year_mentions(self.raw_text)

    def get_summary(self):
        """Return headline statistics as a dict.

        ``total_words`` and ``unique_words`` are computed from ``tokens``
        (i.e. after stopword filtering); ``years_mentioned`` is the total
        number of year mentions, duplicates included.
        """
        return {
            'file_name': self.file_name,
            'total_words': len(self.tokens),
            'unique_words': len(set(self.tokens)),
            'raw_text_length': len(self.raw_text),
            'years_mentioned': len(self.year_mentions)
        }

    def generate_word_cloud(self, max_words=300, width=1600, height=900):
        """Render and show a word cloud of the CV's tokens.

        Args:
            max_words: Maximum number of words drawn.
            width: Cloud canvas width in pixels.
            height: Cloud canvas height in pixels.

        Returns:
            The generated ``WordCloud``, or ``None`` when the CV produced no
            tokens (WordCloud raises ``ValueError`` on empty input, so the
            case is reported instead of crashing).
        """
        if not self.tokens:
            print("No words found in the CV; cannot generate a word cloud.")
            return None

        # Merge all stopwords
        wc_stop = STOPWORDS.union(EN_STOP).union({w.lower() for w in CUSTOM_STOP})

        cloud = WordCloud(
            width=width, height=height,
            background_color="white",
            max_words=max_words,
            collocations=False,
            stopwords=wc_stop
        ).generate(" ".join(self.tokens))

        plt.figure(figsize=(16, 9))
        plt.imshow(cloud, interpolation="bilinear")
        plt.axis("off")
        plt.title(f"Word Cloud — {self.file_name}", fontsize=16, pad=20)
        plt.tight_layout()
        plt.show()

        return cloud

    def plot_top_words(self, n=20):
        """Show a bar chart of the ``n`` most frequent tokens.

        Returns:
            The list of ``(word, count)`` pairs that were plotted; an empty
            list (and no chart) when the CV produced no tokens.
        """
        top_words = self.word_counts.most_common(n)
        if not top_words:
            print("No words found in the CV.")
            return top_words

        words = [word for word, _ in top_words]
        counts = [count for _, count in top_words]

        plt.figure(figsize=(12, 8))
        bars = plt.bar(range(len(words)), counts, color='skyblue')
        plt.xlabel("Words", fontsize=12)
        plt.ylabel("Frequency", fontsize=12)
        plt.title(f"Top {n} Words in {self.file_name}", fontsize=14)
        plt.xticks(range(len(words)), words, rotation=45, ha="right")

        # Add value labels on bars
        for bar, count in zip(bars, counts):
            plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.1,
                    str(count), ha='center', va='bottom')

        plt.tight_layout()
        plt.show()

        return top_words

    def plot_year_mentions(self):
        """Show a line chart of how often each year is mentioned.

        Returns:
            A ``Counter`` mapping year to number of mentions, or ``None``
            (after printing a notice) when no years were found.
        """
        if not self.year_mentions:
            print("No year mentions found in the CV.")
            return

        year_counts = Counter(self.year_mentions)
        years = sorted(year_counts.keys())
        counts = [year_counts[year] for year in years]

        plt.figure(figsize=(12, 6))
        plt.plot(years, counts, marker='o', linewidth=2, markersize=8, color='orange')
        plt.xlabel("Year", fontsize=12)
        plt.ylabel("Number of Mentions", fontsize=12)
        plt.title(f"Year Mentions in {self.file_name}", fontsize=14)
        plt.grid(True, alpha=0.3)

        # Add value labels on points
        for year, count in zip(years, counts):
            plt.text(year, count + 0.1, str(count), ha='center', va='bottom')

        plt.tight_layout()
        plt.show()

        return year_counts

    @staticmethod
    def _count_keywords(text: str, keywords) -> dict:
        """Count case-insensitive whole-term occurrences of each keyword.

        A word boundary is required only on the sides of the keyword that
        start/end with a word character. Requiring ``\\b`` after a keyword
        such as "c++" or "c#" would demand a word character right after the
        symbol, so "C++ and" or "C#," would never be counted.

        Returns:
            Dict mapping each keyword (as given) to its occurrence count;
            an empty keyword counts as 0.
        """
        text_lower = text.lower()
        counts = {}
        for keyword in keywords:
            needle = keyword.lower()
            if not needle:
                counts[keyword] = 0
                continue
            prefix = r"\b" if re.match(r"\w", needle) else ""
            suffix = r"\b" if re.search(r"\w$", needle) else ""
            pattern = prefix + re.escape(needle) + suffix
            counts[keyword] = len(re.findall(pattern, text_lower))
        return counts

    def analyze_keywords(self, keyword_lists):
        """Analyze keyword frequency for different categories.

        Args:
            keyword_lists: Dict mapping a category name to a list of
                keywords to look for in the raw CV text.

        Returns:
            Dict mapping each category to a ``{keyword: count}`` dict. A
            horizontal bar chart of all keywords, sorted by count, is shown
            as a side effect.
        """
        results = {}
        all_keywords = []
        all_counts = []

        for category, keywords in keyword_lists.items():
            results[category] = self._count_keywords(self.raw_text, keywords)
            for keyword, count in results[category].items():
                all_keywords.append(f"{category}: {keyword}")
                all_counts.append(count)

        # Sort by count
        sorted_data = sorted(zip(all_keywords, all_counts), key=lambda x: x[1], reverse=True)
        sorted_keywords, sorted_counts = zip(*sorted_data) if sorted_data else ([], [])

        # Plot (figure grows taller with the number of keywords)
        plt.figure(figsize=(12, max(8, len(sorted_keywords) * 0.4)))
        plt.barh(range(len(sorted_keywords)), sorted_counts, color='lightcoral')
        plt.xlabel("Frequency", fontsize=12)
        plt.ylabel("Keywords", fontsize=12)
        plt.title(f"Keyword Analysis for {self.file_name}", fontsize=14)
        plt.yticks(range(len(sorted_keywords)), sorted_keywords)
        # Highest count at the top of the chart
        plt.gca().invert_yaxis()

        # Add value labels
        for i, count in enumerate(sorted_counts):
            plt.text(count + 0.1, i, str(count), va='center')

        plt.tight_layout()
        plt.show()

        return results

    def create_summary_table(self):
        """Print the summary statistics and the ten most frequent tokens."""
        summary = self.get_summary()
        top_10 = self.word_counts.most_common(10)

        print(f"📄 CV Analysis Summary for: {summary['file_name']}")
        print(f"📊 Total Words: {summary['total_words']:,}")
        print(f"🔤 Unique Words: {summary['unique_words']:,}")
        print(f"📝 Raw Text Length: {summary['raw_text_length']:,} characters")
        print(f"📅 Years Mentioned: {summary['years_mentioned']}")
        print("\n🔝 Top 10 Words:")
        for i, (word, count) in enumerate(top_10, 1):
            print(f"{i:2d}. {word:<15} ({count:>3} times)")

print("✅ CVAnalyzer class defined!")

In [None]:
# File upload and analysis
# Replace this path with your CV file path
CV_FILE_PATH = r"c:\Users\Hussain\Downloads\Sattam Alotaibi - Resume  2025.pdf"

# Optional: pick the file interactively with a dialog instead of hard-coding
# the path. Uncomment to use.
# from tkinter import filedialog
# import tkinter as tk
# root = tk.Tk()
# root.withdraw()
# CV_FILE_PATH = filedialog.askopenfilename(
#     title="Select CV file",
#     filetypes=[("All supported", "*.pdf *.docx *.txt"), ("PDF files", "*.pdf"), ("Word files", "*.docx"), ("Text files", "*.txt")]
# )

# Build the analyzer only when the file exists; later cells check for the
# `analyzer` variable before running.
if not os.path.exists(CV_FILE_PATH):
    print(f"❌ File not found: {CV_FILE_PATH}")
    print("Please update the CV_FILE_PATH variable with the correct path to your CV file.")
else:
    print(f"✅ Loading CV: {os.path.basename(CV_FILE_PATH)}")
    analyzer = CVAnalyzer(CV_FILE_PATH)
    print(f"✅ CV loaded successfully! Extracted {len(analyzer.raw_text)} characters of text.")

In [None]:
# Generate comprehensive analysis
if 'analyzer' not in locals():
    print("❌ Analyzer not loaded. Please run the previous cell first.")
else:
    # 1. Summary statistics, framed by separator rules
    print("=" * 60)
    analyzer.create_summary_table()
    print("=" * 60)

In [None]:
# 2. Word Cloud Generation
if 'analyzer' not in locals():
    print("❌ Analyzer not loaded. Please run the file loading cell first.")
else:
    # Render the word cloud and keep the result for later inspection
    print("🎨 Generating Word Cloud...")
    word_cloud = analyzer.generate_word_cloud(max_words=300)

In [None]:
# 3. Top Words Analysis
if 'analyzer' not in locals():
    print("❌ Analyzer not loaded. Please run the file loading cell first.")
else:
    # Chart the 20 most frequent words and keep the (word, count) pairs
    print("📊 Analyzing Top Words...")
    top_words = analyzer.plot_top_words(n=20)

In [None]:
# 4. Year Mentions Analysis
if 'analyzer' not in locals():
    print("❌ Analyzer not loaded. Please run the file loading cell first.")
else:
    # Chart mentions per year; year_data is None when no years were found
    print("📅 Analyzing Year Mentions...")
    year_data = analyzer.plot_year_mentions()

In [None]:
# 5. Keyword Analysis
if 'analyzer' not in locals():
    print("❌ Analyzer not loaded. Please run the file loading cell first.")
else:
    # Keyword categories to look for; edit freely to suit the role
    keyword_lists = {
        "Programming Languages": ["python", "sql", "java", "javascript", "r", "scala", "c++", "c#"],
        "Data & Analytics": ["data", "analytics", "machine learning", "ai", "statistics", "visualization", "tableau", "power bi"],
        "Cloud Platforms": ["aws", "azure", "gcp", "google cloud", "cloud"],
        "Databases": ["mysql", "postgresql", "mongodb", "oracle", "sql server", "redis"],
        "Management Skills": ["leadership", "project management", "team", "strategy", "planning"],
        "Technical Skills": ["api", "microservices", "docker", "kubernetes", "git", "ci/cd"]
    }

    print("🔍 Analyzing Keywords by Category...")
    keyword_results = analyzer.analyze_keywords(keyword_lists)

    # Per-category listing, most frequent first; keywords never seen are omitted
    print("\n📋 Detailed Keyword Results:")
    for category, results in keyword_results.items():
        print(f"\n{category}:")
        for keyword, count in sorted(results.items(), key=lambda pair: pair[1], reverse=True):
            if count <= 0:
                continue
            print(f"  • {keyword}: {count}")

In [None]:
# 6. Export Results to CSV
if 'analyzer' in locals():
    print("💾 Exporting results to CSV...")

    # Track what was actually written so the final report never lists a
    # file whose export step was skipped.
    exported_files = []

    # Export top words (always written, possibly with zero rows)
    top_words_df = pd.DataFrame(analyzer.word_counts.most_common(50), columns=['Word', 'Frequency'])
    top_words_df.to_csv('cv_top_words.csv', index=False)
    exported_files.append('cv_top_words.csv')

    # Export year mentions (only when the CV mentions any year)
    if analyzer.year_mentions:
        year_counts = Counter(analyzer.year_mentions)
        years_df = pd.DataFrame(list(year_counts.items()), columns=['Year', 'Mentions'])
        years_df = years_df.sort_values('Year')
        years_df.to_csv('cv_year_mentions.csv', index=False)
        exported_files.append('cv_year_mentions.csv')

    # Export keyword analysis (only when the keyword cell has been run)
    if 'keyword_results' in locals():
        keyword_data = []
        for category, results in keyword_results.items():
            for keyword, count in results.items():
                keyword_data.append({'Category': category, 'Keyword': keyword, 'Count': count})

        keywords_df = pd.DataFrame(keyword_data)
        keywords_df.to_csv('cv_keyword_analysis.csv', index=False)
        exported_files.append('cv_keyword_analysis.csv')

    print("✅ Results exported to CSV files:")
    for exported_name in exported_files:
        print(f"  • {exported_name}")
else:
    print("❌ Analyzer not loaded. Please run the file loading cell first.")

## 🎯 Analysis Complete!

This notebook has analyzed your CV and provided:

1. **📊 Summary Statistics** - Total and unique words (counted after stopword filtering), raw character count, and number of year mentions
2. **🎨 Word Cloud** - Visual representation of most frequent words
3. **📈 Top Words Chart** - Bar chart of most common terms
4. **📅 Year Mentions** - Timeline of years mentioned in the CV
5. **🔍 Keyword Analysis** - Frequency of technical and professional terms
6. **💾 CSV Exports** - Data exported for further analysis

### 🔧 Customization Options:

- **Add more keywords**: Modify the `keyword_lists` dictionary
- **Change word cloud settings**: Adjust the `max_words`, `width`, and `height` parameters of `generate_word_cloud`
- **Filter stopwords**: Add custom words to the `CUSTOM_STOP` set
- **Adjust chart parameters**: Change the `n` parameter of `plot_top_words` to control how many top words are charted

### 📝 Next Steps:

1. Review the generated visualizations
2. Use the CSV exports for additional analysis
3. Customize the keyword categories for your specific needs
4. Run the analysis on multiple CVs for comparison