In [24]:
import sys
import subprocess

def install_packages():
    """Install packages with proper dependency handling"""
    print("=" * 80)
    print("üöÄ INSTALLING DEPENDENCIES")
    print("=" * 80)

    packages = [
        'numpy==2.0.2',
        'pandas==2.1.4',
        'torch',
        'torchvision',
        'scikit-learn',
        'matplotlib',
        'seaborn',
        'tqdm',
        'fuzzywuzzy',
        'python-Levenshtein',
        'pdfplumber',
        'python-docx',
        'pillow',
        'requests',
        'beautifulsoup4',
        'python-jobspy',
        'ipywidgets'
    ]

    for package in packages:
        try:
            subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', package])
            print(f"{package.split('==')[0]}")
        except:
            print(f"{package.split('==')[0]} (may need runtime restart)")

    print("\nInstallation complete!\n")

# Install packages
install_packages()



üöÄ INSTALLING DEPENDENCIES
numpy
pandas
torch
torchvision
scikit-learn
matplotlib
seaborn
tqdm
fuzzywuzzy
python-Levenshtein
pdfplumber
python-docx
pillow
requests
beautifulsoup4
python-jobspy
ipywidgets

Installation complete!



In [2]:
# ============================================================================
# IMPORTS
# ============================================================================

import sys
import subprocess
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
import json
import re
import os
from typing import List, Dict, Tuple, Optional
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from datetime import datetime
import warnings



# Real job scraping
!pip install jobspy
from jobspy import scrape_jobs


# Resume parsing
import pdfplumber
import docx
PDF_SUPPORT = True

# File upload for Colab
try:
    from google.colab import files
    from IPython.display import display, HTML, clear_output
    COLAB_ENV = True
except:
    COLAB_ENV = False
    print("Not in Colab - file upload will use file paths")

print("‚úÖ All libraries imported successfully!")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"üíª Device: {device}\n")


‚úÖ All libraries imported successfully!
üíª Device: cpu



In [3]:
# ============================================================================
# SKILLS DATABASE
# ============================================================================

SKILLS_DATABASE = [
    # Programming Languages
    'Python', 'Java', 'JavaScript', 'TypeScript', 'C++', 'C#', 'C', 'PHP',
    'Ruby', 'Swift', 'Kotlin', 'Go', 'Rust', 'Scala', 'R', 'MATLAB',

    # Web Frontend
    'HTML', 'CSS', 'React', 'Angular', 'Vue.js', 'jQuery', 'Bootstrap',
    'Tailwind CSS', 'SASS', 'LESS', 'Redux', 'Next.js',

    # Web Backend
    'Node.js', 'Express.js', 'Django', 'Flask', 'FastAPI', 'Spring Boot',
    'ASP.NET', 'Ruby on Rails', 'Laravel', 'NestJS',

    # Databases
    'MySQL', 'PostgreSQL', 'MongoDB', 'Redis', 'Oracle', 'SQLite',
    'Cassandra', 'DynamoDB', 'Elasticsearch', 'Neo4j', 'MariaDB',

    # Data Science & ML
    'TensorFlow', 'PyTorch', 'Keras', 'Scikit-learn', 'Pandas', 'NumPy',
    'Matplotlib', 'Seaborn', 'OpenCV', 'NLTK', 'SpaCy', 'Transformers',
    'Machine Learning', 'Deep Learning', 'Neural Networks', 'NLP',
    'Computer Vision', 'Data Analysis',

    # Big Data
    'Spark', 'Hadoop', 'Kafka', 'Airflow', 'Databricks', 'Hive',

    # Cloud Platforms
    'AWS', 'Azure', 'GCP', 'Heroku', 'DigitalOcean', 'Firebase',

    # DevOps & Tools
    'Docker', 'Kubernetes', 'Jenkins', 'Git', 'GitHub', 'GitLab',
    'CI/CD', 'Terraform', 'Ansible', 'Linux', 'Unix', 'Nginx',

    # Mobile Development
    'Android', 'iOS', 'React Native', 'Flutter', 'Xamarin', 'Ionic',

    # Other Technologies
    'REST API', 'GraphQL', 'Microservices', 'WebSocket', 'gRPC',
    'Agile', 'Scrum', 'JIRA', 'Tableau', 'Power BI'
]


In [4]:
# ============================================================================
# RESUME PARSER
# ============================================================================

class ResumeParser:
    """Parse resume and extract skills"""

    def __init__(self):
        self.skills_db = SKILLS_DATABASE
        self.email_pattern = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b')
        self.phone_pattern = re.compile(r'[\+]?[(]?[0-9]{1,4}[)]?[-\s\.]?[(]?[0-9]{1,4}[)]?[-\s\.]?[0-9]{1,9}')

    def extract_text_from_pdf(self, pdf_path):
        """Extract text from PDF"""
        text = ""
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text + "\n"
        return text

    def extract_text_from_docx(self, docx_path):
        """Extract text from DOCX"""
        doc = docx.Document(docx_path)
        return "\n".join([para.text for para in doc.paragraphs])

    def extract_skills(self, text):
        """Extract skills using word boundary matching"""
        text_lower = ' ' + text.lower() + ' '
        found_skills = set()

        for skill in self.skills_db:
            pattern = r'\b' + re.escape(skill.lower()) + r'\b'
            if re.search(pattern, text_lower):
                found_skills.add(skill)

        return sorted(list(found_skills))

    def extract_experience(self, text):
        """Extract years of experience"""
        patterns = [
            r'(\d+)\+?\s*(?:years?|yrs?)(?:\s+of)?\s+(?:experience|exp)',
            r'(?:experience|exp)(?:\s+of)?\s+(\d+)\+?\s*(?:years?|yrs?)',
        ]

        for pattern in patterns:
            match = re.search(pattern, text.lower())
            if match:
                return int(match.group(1))
        return 0

    def extract_education(self, text):
        """Extract education level"""
        text_lower = text.lower()

        if any(kw in text_lower for kw in ['ph.d', 'phd', 'doctorate']):
            return 'PhD'
        elif any(kw in text_lower for kw in ['master', 'm.s', 'm.tech', 'mba', 'mca']):
            return 'Masters'
        elif any(kw in text_lower for kw in ['bachelor', 'b.e', 'b.tech', 'b.s', 'bca']):
            return 'Bachelors'
        return 'Bachelors'

    def extract_contact(self, text):
        """Extract email and phone"""
        emails = re.findall(self.email_pattern, text)
        phones = re.findall(self.phone_pattern, text)

        return {
            'email': emails[0] if emails else 'not_found@example.com',
            'phone': phones[0] if phones else 'Not provided'
        }

    def extract_name(self, text):
        """Extract candidate name"""
        lines = [l.strip() for l in text.split('\n') if l.strip()]
        for line in lines[:5]:
            if 2 <= len(line.split()) <= 4 and len(line) < 50:
                return line
        return "Candidate"

    def parse_resume(self, file_path, file_type='pdf'):
        """Main parsing function"""
        print(f"\nüìÑ Parsing resume: {os.path.basename(file_path)}")
        print("=" * 60)

        # Extract text
        if file_type.lower() == 'pdf':
            text = self.extract_text_from_pdf(file_path)
        elif file_type.lower() in ['docx', 'doc']:
            text = self.extract_text_from_docx(file_path)
        else:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                text = f.read()

        if not text or len(text) < 50:
            raise ValueError("Could not extract meaningful text from resume")

        # Extract all information
        contact = self.extract_contact(text)
        skills = self.extract_skills(text)
        experience = self.extract_experience(text)
        education = self.extract_education(text)
        name = self.extract_name(text)

        resume_data = {
            'name': name,
            'email': contact['email'],
            'phone': contact['phone'],
            'skills': skills,
            'experience_years': experience,
            'education_level': education,
            'resume_text': text[:1000],
            'full_text': text,
            'parsed_date': datetime.now().isoformat()
        }

        print(f"‚úÖ Successfully parsed resume!\n")
        print(f"Name: {name}")
        print(f"Email: {contact['email']}")
        print(f"Phone: {contact['phone']}")
        print(f"Skills Found: {len(skills)}")
        print(f"Experience: {experience} years")
        print(f"Education: {education}")

        if skills:
            print(f"\nüîß Top Skills: {', '.join(skills[:10])}")
            if len(skills) > 10:
                print(f"       ... and {len(skills) - 10} more")

        return resume_data


In [5]:
# ============================================================================
# REAL-TIME JOB SCRAPER
# ============================================================================

class RealTimeJobScraper:
    """Real-time job scraping using JobSpy"""

    def scrape_jobs(self, location="India", results_wanted=100, site_names=['indeed', 'linkedin'], is_remote=False):
        """Scrape jobs by location only"""
        print(f"\nüîç SCRAPING REAL-TIME JOBS")
        print("=" * 60)
        print(f"   Location: {location}")
        print(f"   Sites: {', '.join(site_names)}")
        print(f"   Target: {results_wanted} jobs")

        try:
            # Search with generic terms to get broad results
            search_terms = ["Software Developer", "Engineer", "Developer"]
            all_jobs = []

            for term in search_terms:
                jobs_df = scrape_jobs(
                    site_name=site_names,
                    search_term=term,
                    location=location,
                    results_wanted=results_wanted // len(search_terms),
                    hours_old=168,  # Last week
                    country_indeed='india',
                    is_remote=is_remote,
                    description_format='markdown'
                )

                if jobs_df is not None and len(jobs_df) > 0:
                    all_jobs.append(jobs_df)

            if not all_jobs:
                print("\n‚ùå No jobs found!")
                return []

            # Combine all results
            jobs_df = pd.concat(all_jobs, ignore_index=True)
            jobs_df = jobs_df.drop_duplicates(subset=['title', 'company'], keep='first')

            jobs = jobs_df.to_dict('records')

            print(f"\n‚úÖ Successfully scraped {len(jobs)} REAL jobs!")
            print(f"üìä Breakdown:")

            site_counts = jobs_df['site'].value_counts()
            for site, count in site_counts.items():
                print(f"      - {site}: {count} jobs")

            return jobs

        except Exception as e:
            print(f"\n‚ùå Scraping failed: {str(e)}")
            return []


In [6]:
# ============================================================================
# JOB PREPROCESSOR
# ============================================================================

class JobPreprocessor:
    """Process scraped jobs and extract features"""

    def __init__(self):
        self.skills_db = SKILLS_DATABASE

    def extract_skills_from_description(self, description):
        """Extract skills from job description"""
        if not description:
            return []

        text_lower = ' ' + str(description).lower() + ' '
        found_skills = []

        for skill in self.skills_db:
            pattern = r'\b' + re.escape(skill.lower()) + r'\b'
            if re.search(pattern, text_lower):
                found_skills.append(skill)

        return sorted(list(set(found_skills)))

    def process_jobs(self, jobs_list):
        """Process and standardize job listings"""
        print(f"\nüîß PROCESSING {len(jobs_list)} JOBS")
        print("=" * 60)

        processed = []

        for i, job in enumerate(jobs_list):
            try:
                desc = job.get('description', '')
                if not desc:
                    continue

                skills = self.extract_skills_from_description(desc)

                processed_job = {
                    'job_id': f"job_{i}_{job.get('site', 'unknown')}",
                    'title': job.get('title', 'Unknown Position'),
                    'company': job.get('company', 'Unknown Company'),
                    'location': job.get('location', 'Not specified'),
                    'job_type': job.get('job_type', 'Full-time'),
                    'description': str(desc)[:500],
                    'requirements': str(desc)[:300],
                    'skills': skills,
                    'salary': self._format_salary(job),
                    'date_posted': job.get('date_posted', 'Recently'),
                    'job_url': job.get('job_url', '#'),
                    'site': job.get('site', 'unknown')
                }

                processed.append(processed_job)

            except Exception as e:
                continue

        print(f"‚úÖ Processed {len(processed)} jobs successfully")

        if processed:
            skill_counts = [len(j['skills']) for j in processed]
            print(f"üìä Skills per job: avg {np.mean(skill_counts):.1f}, max {max(skill_counts)}")

        return processed

    def _format_salary(self, job):
        """Format salary information"""
        if job.get('min_amount') and job.get('max_amount'):
            interval = job.get('interval', 'yearly')
            return f"${job['min_amount']:,} - ${job['max_amount']:,} ({interval})"
        return "Not specified"

In [7]:
# ============================================================================
# ConvFM MODEL COMPONENTS (From Untitled34.ipynb)
# ============================================================================

class TextCNN(nn.Module):
    """CNN for text feature extraction"""

    def __init__(self, vocab_size, embedding_dim=128, num_filters=64, filter_sizes=[3, 4, 5], dropout=0.5):
        super(TextCNN, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

        self.convs = nn.ModuleList([
            nn.Conv2d(1, num_filters, (fs, embedding_dim))
            for fs in filter_sizes
        ])

        self.dropout = nn.Dropout(dropout)
        self.output_dim = num_filters * len(filter_sizes)

    def forward(self, text):
        embedded = self.embedding(text).unsqueeze(1)

        conv_outputs = []
        for conv in self.convs:
            conv_out = F.relu(conv(embedded))
            pooled = F.max_pool2d(conv_out, (conv_out.size(2), 1))
            conv_outputs.append(pooled.squeeze(3).squeeze(2))

        features = torch.cat(conv_outputs, dim=1)
        return self.dropout(features)

class SkillsEncoder(nn.Module):
    """Encode skills into dense representation"""

    def __init__(self, num_skills, embedding_dim=32, dropout=0.3):
        super(SkillsEncoder, self).__init__()

        self.encoder = nn.Sequential(
            nn.Linear(num_skills, 128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, embedding_dim),
            nn.ReLU()
        )

    def forward(self, skill_vector):
        return self.encoder(skill_vector)

class FactorizationMachine(nn.Module):
    """Factorization Machine for modeling interactions"""

    def __init__(self, input_dim, factor_dim=32):
        super(FactorizationMachine, self).__init__()

        self.linear = nn.Linear(input_dim, 1, bias=True)
        self.factor_embeddings = nn.Parameter(torch.randn(input_dim, factor_dim) * 0.01)

        nn.init.xavier_uniform_(self.factor_embeddings)
        nn.init.xavier_uniform_(self.linear.weight)

    def forward(self, x):
        linear_term = self.linear(x)

        x_expanded = x.unsqueeze(2)
        factor_mul = x_expanded * self.factor_embeddings.unsqueeze(0)

        sum_square = torch.sum(factor_mul, dim=1) ** 2
        square_sum = torch.sum(factor_mul ** 2, dim=1)

        interaction_term = 0.5 * torch.sum(sum_square - square_sum, dim=1, keepdim=True)

        output = linear_term + interaction_term
        return output

class ConvFM(nn.Module):
    """Complete ConvFM Model"""

    def __init__(self, vocab_size, num_skills, embedding_dim=128, cnn_num_filters=64,
                 cnn_filter_sizes=[3, 4, 5], fm_factor_dim=32, skills_embed_dim=32, dropout=0.5):
        super(ConvFM, self).__init__()

        self.text_cnn = TextCNN(vocab_size, embedding_dim, cnn_num_filters, cnn_filter_sizes, dropout)
        self.skills_encoder = SkillsEncoder(num_skills, skills_embed_dim, dropout * 0.6)

        self.cat_embeddings = nn.ModuleDict({
            'job_type': nn.Embedding(20, 16),
            'location': nn.Embedding(500, 32),
            'education_level': nn.Embedding(10, 8)
        })

        self.cnn_dim = self.text_cnn.output_dim
        self.skills_dim = skills_embed_dim
        self.cat_dim = sum(emb.embedding_dim for emb in self.cat_embeddings.values()) # Calculate cat_dim dynamically
        self.total_dim = (self.cnn_dim + self.skills_dim + self.cat_dim) * 2

        self.fm = FactorizationMachine(self.total_dim, fm_factor_dim)
        self.batch_norm = nn.BatchNorm1d(self.total_dim)

    def extract_features(self, text, skills, categorical):
        text_features = self.text_cnn(text)
        skills_features = self.skills_encoder(skills)

        cat_features = []
        # Ensure all categorical embeddings are processed, even if input is None
        for name, emb_layer in self.cat_embeddings.items():
             if name in categorical and categorical[name] is not None:
                cat_features.append(emb_layer(categorical[name]))
             else:
                 # Create a zero tensor with the correct embedding dimension if category is missing
                 cat_features.append(torch.zeros(text.size(0), emb_layer.embedding_dim, device=text.device))

        cat_features = torch.cat(cat_features, dim=1)
        combined = torch.cat([text_features, skills_features, cat_features], dim=1)
        return combined


    def forward(self, user_text, job_text, user_skills, job_skills, user_categorical, job_categorical):
        user_features = self.extract_features(user_text, user_skills, user_categorical)
        job_features = self.extract_features(job_text, job_skills, job_categorical)

        combined_features = torch.cat([user_features, job_features], dim=1)
        combined_features = self.batch_norm(combined_features)

        scores = self.fm(combined_features)
        scores = torch.sigmoid(scores)

        return scores

In [10]:
# ============================================================================
# VOCABULARY BUILDER
# ============================================================================

class Vocabulary:
    """Build and manage vocabulary"""

    def __init__(self, max_vocab_size=5000):
        self.max_vocab_size = max_vocab_size
        self.word2idx = {'<PAD>': 0, '<UNK>': 1}
        self.idx2word = {0: '<PAD>', 1: '<UNK>'}
        self.word_freq = Counter()

    def build_from_texts(self, texts):
        for text in texts:
            tokens = self._tokenize(text)
            self.word_freq.update(tokens)

        most_common = self.word_freq.most_common(self.max_vocab_size - 2)

        for idx, (word, freq) in enumerate(most_common, start=2):
            self.word2idx[word] = idx
            self.idx2word[idx] = word

    def _tokenize(self, text):
        text = text.lower()
        text = re.sub(r'[^a-z0-9\s\+\#\.]', ' ', text)
        return text.split()

    def text_to_sequence(self, text, max_length=200):
        tokens = self._tokenize(text)
        indices = [self.word2idx.get(token, 1) for token in tokens]

        if len(indices) < max_length:
            indices += [0] * (max_length - len(indices))
        else:
            indices = indices[:max_length]

        return indices

In [17]:
# ============================================================================
# ConvFM-BASED JOB RECOMMENDER
# ============================================================================

class ConvFMJobRecommender:
    """Job recommender using trained ConvFM model"""

    def __init__(self):
        self.parser = ResumeParser()
        self.scraper = RealTimeJobScraper()
        self.processor = JobPreprocessor()
        self.model = None
        self.vocabulary = None
        self.label_encoders = {}
        self.device = device

    def initialize_model(self, user_profile, jobs, location):
        """Initialize and train ConvFM model"""
        print("\n" + "=" * 80)
        print("üß† INITIALIZING ConvFM MODEL")
        print("=" * 80)

        # Build vocabulary
        print("\nüìö Building vocabulary...")
        self.vocabulary = Vocabulary(max_vocab_size=5000)
        all_texts = [user_profile['resume_text']]
        all_texts.extend([j['description'] + ' ' + j['requirements'] for j in jobs])
        self.vocabulary.build_from_texts(all_texts)
        print(f"‚úÖ Vocabulary size: {len(self.vocabulary.word2idx)}")

        # Initialize label encoders
        self.label_encoders['job_type'] = LabelEncoder()
        self.label_encoders['location'] = LabelEncoder()
        self.label_encoders['education'] = LabelEncoder()

        all_job_types = [j.get('job_type', 'Full-time') for j in jobs] + ['Full-time']
        all_locations = [j.get('location', 'Remote') for j in jobs] + ['Remote', location] # Include specified location
        all_education = ['Bachelors', 'Masters', 'PhD']

        self.label_encoders['job_type'].fit(all_job_types)
        self.label_encoders['location'].fit(all_locations)
        self.label_encoders['education'].fit(all_education)

        # Initialize ConvFM model
        print("\nüèóÔ∏è Building ConvFM model...")
        self.model = ConvFM(
            vocab_size=len(self.vocabulary.word2idx),
            num_skills=len(SKILLS_DATABASE),
            embedding_dim=128,
            cnn_num_filters=64,
            cnn_filter_sizes=[3, 4, 5],
            fm_factor_dim=32,
            skills_embed_dim=32,
            dropout=0.5
        ).to(self.device)

        print(f"‚úÖ Model initialized with {sum(p.numel() for p in self.model.parameters()):,} parameters")

        # Train model with synthetic interactions
        self._train_model_quick(user_profile, jobs)

    def _train_model_quick(self, user_profile, jobs):
        """Quick training with synthetic data"""
        print("\nüéØ Quick training ConvFM...")

        # Generate synthetic training data
        user_skills_set = set(user_profile['skills'])
        train_data = []

        for job in jobs[:min(len(jobs), 50)]:  # Use subset for quick training
            job_skills_set = set(job['skills'])
            skill_match = len(user_skills_set & job_skills_set) / max(len(job_skills_set), 1)

            # Add more noise and scale the label to provide more variation
            label = min(1.0, max(0.0, skill_match * 0.7 + np.random.normal(0, 0.3))) # Increased noise and scaling
            train_data.append((job, label))

        if not train_data:
            print("‚ö†Ô∏è No training data generated, skipping training")
            return

        # Create DataLoader for batching
        class TrainingDataset(Dataset):
            def __init__(self, user_profile, jobs, vocabulary, label_encoders):
                self.user_profile = user_profile
                self.jobs = jobs
                self.vocabulary = vocabulary
                self.label_encoders = label_encoders
                self.user_skills_vec = self._get_skills_vector(user_profile['skills'])
                self.user_cat = {
                    'job_type': label_encoders['job_type'].transform(['Full-time'])[0],
                    'location': label_encoders['location'].transform([user_profile.get('location', 'Remote')])[0], # Use user profile location or default
                    'education_level': label_encoders['education'].transform([user_profile['education_level']])[0]
                }

            def __len__(self):
                return len(self.jobs)

            def __getitem__(self, idx):
                job = self.jobs[idx]
                job_skills_set = set(job['skills'])
                skill_match = len(set(self.user_profile['skills']) & job_skills_set) / max(len(job_skills_set), 1)
                label = min(1.0, max(0.0, skill_match * 0.7 + np.random.normal(0, 0.3))) # Consistent label generation

                user_text = self.vocabulary.text_to_sequence(self.user_profile['resume_text'])
                job_text = self.vocabulary.text_to_sequence(job['description'] + ' ' + job['requirements'])

                job_skills_vec = self._get_skills_vector(job['skills'])

                job_cat = {
                    'job_type': self.label_encoders['job_type'].transform([job.get('job_type', 'Full-time')])[0],
                    'location': self.label_encoders['location'].transform([job.get('location', 'Remote')])[0],
                    'education_level': 0 # Default for job education
                }

                return (torch.LongTensor(user_text), torch.LongTensor(job_text),
                        torch.FloatTensor(self.user_skills_vec), torch.FloatTensor(job_skills_vec),
                        self.user_cat, job_cat, torch.FloatTensor([label]))

            def _get_skills_vector(self, skills):
                vector = np.zeros(len(SKILLS_DATABASE))
                for skill in skills:
                    if skill in SKILLS_DATABASE:
                        idx = SKILLS_DATABASE.index(skill)
                        vector[idx] = 1
                return vector


        train_dataset = TrainingDataset(user_profile, [job for job, _ in train_data], self.vocabulary, self.label_encoders)
        train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True) # Use batch_size > 1

        # Training loop
        optimizer = torch.optim.Adam(self.model.parameters(), lr=0.001)
        criterion = nn.BCELoss()

        self.model.train()
        num_epochs = 15 # Increased epochs

        for epoch in range(num_epochs):
            total_loss = 0
            for user_text, job_text, user_skills_vec, job_skills_vec, user_cat, job_cat, labels in train_dataloader:

                user_text = user_text.to(self.device)
                job_text = job_text.to(self.device)
                user_skills_vec = user_skills_vec.to(self.device)
                job_skills_vec = job_skills_vec.to(self.device)
                labels = labels.to(self.device)

                user_cat_processed = {k: v.to(self.device) for k, v in user_cat.items()}
                job_cat_processed = {k: v.to(self.device) for k, v in job_cat.items()}


                # Forward pass
                pred = self.model(user_text, job_text, user_skills_vec, job_skills_vec, user_cat_processed, job_cat_processed)
                loss = criterion(pred, labels)

                # Backward pass
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                total_loss += loss.item()

            avg_loss = total_loss / len(train_dataloader)
            print(f"   Epoch {epoch + 1}/{num_epochs} - Loss: {avg_loss:.4f}")

        print("‚úÖ Model training complete!")

    def _get_skills_vector(self, skills):
        """Convert skills to one-hot vector"""
        vector = np.zeros(len(SKILLS_DATABASE))
        for skill in skills:
            if skill in SKILLS_DATABASE:
                idx = SKILLS_DATABASE.index(skill)
                vector[idx] = 1
        return vector

    def recommend_from_resume(self, resume_path, resume_type='pdf', location="India",
                            num_jobs=100, top_k=15, min_skill_match=0.3, site_names=['indeed', 'linkedin']):
        """Complete recommendation pipeline using ConvFM"""

        print("\n" + "=" * 80)
        print("üöÄ STARTING ConvFM JOB RECOMMENDATION")
        print("=" * 80)

        # STEP 1: Parse Resume
        user_profile = self.parser.parse_resume(resume_path, resume_type)

        if not user_profile['skills']:
            print("\n‚ùå ERROR: No skills found in resume!")
            print("Please ensure your resume contains technical skills.")
            return None

        # STEP 2: Scrape Real Jobs
        raw_jobs = self.scraper.scrape_jobs(
            location=location,
            results_wanted=num_jobs,
            site_names=site_names,
            is_remote=False
        )

        if not raw_jobs:
            print("\n‚ùå ERROR: No jobs found!")
            return None

        # STEP 3: Process Jobs
        jobs = self.processor.process_jobs(raw_jobs)

        if not jobs:
            print("\n‚ùå ERROR: No valid jobs after processing!")
            return None

        # STEP 4: Initialize and Train ConvFM Model
        self.initialize_model(user_profile, jobs, location) # Pass location to initialize_model

        # STEP 5: Get ConvFM Predictions and Match Jobs
        print("\n" + "=" * 80)
        print("üéØ GENERATING RECOMMENDATIONS WITH ConvFM")
        print("=" * 80)

        recommendations = []
        user_skills_set = set(user_profile['skills'])

        self.model.eval()
        with torch.no_grad():
            # Prepare data for batch processing
            job_texts = []
            job_skills_vectors = []
            job_categorical_data = {k: [] for k in self.label_encoders.keys()}
            job_list_filtered = [] # Store jobs that pass skill match threshold

            for job in jobs:
                job_skills_set = set(job['skills'])
                if not job_skills_set:
                    skill_match = 0.0
                else:
                    skill_match = len(user_skills_set & job_skills_set) / len(job_skills_set)

                if skill_match < min_skill_match:
                    continue

                job_texts.append(self.vocabulary.text_to_sequence(job['description'] + ' ' + job['requirements']))
                job_skills_vectors.append(self._get_skills_vector(job['skills']))
                job_categorical_data['job_type'].append(self.label_encoders['job_type'].transform([job.get('job_type', 'Full-time')])[0])
                job_categorical_data['location'].append(self.label_encoders['location'].transform([job.get('location', 'Remote')])[0])
                job_categorical_data['education'].append(0) # Default for job education
                job_list_filtered.append(job)


            if not job_list_filtered:
                print(f"\n‚ùå No jobs found with at least {min_skill_match*100:.0f}% skill match!")
                print("Try lowering the minimum skill match threshold.")
                return None

            # Convert lists to tensors
            job_texts_tensor = torch.LongTensor(job_texts).to(self.device)
            job_skills_vectors_tensor = torch.FloatTensor(job_skills_vectors).to(self.device)
            job_categorical_tensors = {k: torch.LongTensor(v).to(self.device) for k, v in job_categorical_data.items()}

            user_text_tensor = torch.LongTensor(
                self.vocabulary.text_to_sequence(user_profile['resume_text'])
            ).unsqueeze(0).repeat(len(job_list_filtered), 1).to(self.device) # Repeat for batch

            user_skills_vec_tensor = torch.FloatTensor(
                self._get_skills_vector(user_profile['skills'])
            ).unsqueeze(0).repeat(len(job_list_filtered), 1).to(self.device) # Repeat for batch

            user_cat_tensor = {
                'job_type': torch.LongTensor([self.label_encoders['job_type'].transform(['Full-time'])[0]]).repeat(len(job_list_filtered)).to(self.device),
                'location': torch.LongTensor([self.label_encoders['location'].transform([location])[0]]).repeat(len(job_list_filtered)).to(self.device),
                'education_level': torch.LongTensor([self.label_encoders['education'].transform([user_profile['education_level']])[0]]).repeat(len(job_list_filtered)).to(self.device)
            }


            # Process in batches
            batch_size = 16 # Choose a suitable batch size
            all_model_scores = []

            for i in tqdm(range(0, len(job_list_filtered), batch_size), desc="Scoring jobs"):
                batch_job_texts = job_texts_tensor[i:i+batch_size]
                batch_job_skills_vectors = job_skills_vectors_tensor[i:i+batch_size]
                batch_job_categorical = {k: v[i:i+batch_size] for k, v in job_categorical_tensors.items()}

                batch_user_text = user_text_tensor[i:i+batch_size]
                batch_user_skills_vec = user_skills_vec_tensor[i:i+batch_size]
                batch_user_cat = {k: v[i:i+batch_size] for k, v in user_cat_tensor.items()}


                model_scores = self.model(
                    batch_user_text, batch_job_texts,
                    batch_user_skills_vec, batch_job_skills_vectors,
                    batch_user_cat, batch_job_categorical
                ).squeeze(1).tolist()
                all_model_scores.extend(model_scores)

            # Combine scores and create recommendations
            for i, job in enumerate(job_list_filtered):
                job_skills_set = set(job['skills'])
                user_skills_set = set(user_profile['skills'])
                matching_skills = list(user_skills_set & job_skills_set)
                missing_skills = list(job_skills_set - user_skills_set)

                skill_match = len(matching_skills) / max(len(job_skills_set), 1)

                model_score = all_model_scores[i]
                final_score = 0.7 * model_score + 0.3 * skill_match

                recommendations.append({
                    'job': job,
                    'convfm_score': model_score,
                    'skill_match_score': skill_match,
                    'final_score': final_score,
                    'matching_skills': matching_skills,
                    'missing_skills': missing_skills
                })


        # Sort by final score
        recommendations.sort(key=lambda x: x['final_score'], reverse=True)
        recommendations = recommendations[:top_k]


        # Calculate evaluation metrics
        self._display_evaluation_metrics(recommendations)

        # Display results
        result = {
            'user_profile': user_profile,
            'recommendations': recommendations,
            'total_jobs_analyzed': len(jobs),
            'location': location,
            'min_skill_match': min_skill_match,
            'timestamp': datetime.now().isoformat()
        }

        self._display_results(result)
        self._display_skill_gap_analysis(result)

        return result

    def _display_evaluation_metrics(self, recommendations):
        """Display ConvFM model evaluation metrics"""
        print("\n" + "=" * 80)
        print("üìä ConvFM MODEL EVALUATION METRICS")
        print("=" * 80)

        if not recommendations:
            print("No recommendations to evaluate")
            return

        convfm_scores = [r['convfm_score'] for r in recommendations]
        skill_scores = [r['skill_match_score'] for r in recommendations]
        final_scores = [r['final_score'] for r in recommendations]

        print(f"\nüéØ ConvFM Model Scores:")
        print(f"   Mean Score: {np.mean(convfm_scores):.4f}")
        print(f"   Std Dev: {np.std(convfm_scores):.4f}")
        print(f"   Min Score: {np.min(convfm_scores):.4f}")
        print(f"   Max Score: {np.max(convfm_scores):.4f}")

        print(f"\nüîß Skill Match Scores:")
        print(f"   Mean Match: {np.mean(skill_scores):.1%}")
        print(f"   Std Dev: {np.std(skill_scores):.4f}")
        print(f"   Min Match: {np.min(skill_scores):.1%}")
        print(f"   Max Match: {np.max(skill_scores):.1%}")

        print(f"\nüìà Final Combined Scores (70% ConvFM + 30% Skill Match):")
        print(f"   Mean Score: {np.mean(final_scores):.4f}")
        print(f"   Std Dev: {np.std(final_scores):.4f}")
        print(f"   Min Score: {np.min(final_scores):.4f}")
        print(f"   Max Score: {np.max(final_scores):.4f}")

        # Precision metrics
        high_quality = sum(1 for s in final_scores if s >= 0.7)
        medium_quality = sum(1 for s in final_scores if 0.5 <= s < 0.7)
        low_quality = sum(1 for s in final_scores if s < 0.5)

        print(f"\n‚ú® Recommendation Quality Distribution:")
        print(f"   High Quality (‚â•0.7): {high_quality} jobs ({high_quality/len(recommendations)*100:.1f}%)")
        print(f"   Medium Quality (0.5-0.7): {medium_quality} jobs ({medium_quality/len(recommendations)*100:.1f}%)")
        print(f"   Lower Quality (<0.5): {low_quality} jobs ({low_quality/len(recommendations)*100:.1f}%)")

    def _display_results(self, result):
        """Display recommendations"""
        user = result['user_profile']
        recs = result['recommendations']

        print("\n" + "=" * 80)
        print("üéØ TOP JOB RECOMMENDATIONS")
        print("=" * 80)

        print(f"\nüë§ CANDIDATE: {user['name']}")
        print(f"üìß {user['email']}")
        print(f"üì± {user['phone']}")
        print(f"üíº {user['experience_years']} years experience")
        print(f"üéì {user['education_level']}")
        print(f"üîß {len(user['skills'])} skills")

        print(f"\nüìä ANALYSIS:")
        print(f"   ‚úì Jobs Analyzed: {result['total_jobs_analyzed']}")
        print(f"   ‚úì Top Matches: {len(recs)}")
        print(f"   ‚úì Location: {result['location']}")
        print(f"   ‚úì Min Skill Match: {result['min_skill_match']*100:.0f}%")

        print(f"\n" + "=" * 80)
        print(f"TOP {len(recs)} JOB RECOMMENDATIONS (Ranked by ConvFM)")
        print("=" * 80)

        for i, rec in enumerate(recs, 1):
            job = rec['job']

            print(f"\n{i}. {job['title']}")
            print(f"üè¢ {job['company']}")
            print(f"üìç {job['location']} | {job['job_type']}")
            print(f"üí∞ {job['salary']}")
            print(f"üìÖ Posted: {job['date_posted']}")
            print(f"üåê Source: {job['site']}")
            print(f"\nüìä Scores:")
            print(f"   ‚Ä¢ Final Score: {rec['final_score']:.1%}")
            print(f"   ‚Ä¢ ConvFM Model: {rec['convfm_score']:.1%}")
            print(f"   ‚Ä¢ Skill Match: {rec['skill_match_score']:.1%}")

            if rec['matching_skills']:
                print(f"\n‚úÖ Matching Skills ({len(rec['matching_skills'])}): {', '.join(rec['matching_skills'][:8])}")
                if len(rec['matching_skills']) > 8:
                    print(f"       ... and {len(rec['matching_skills']) - 8} more")

            if rec['missing_skills'][:3]:
                print(f"üìö Skills to Learn: {', '.join(rec['missing_skills'][:3])}")

            print(f"\nüîó Apply: {job['job_url'][:80]}...")
            print("-" * 80)

        print("\n" + "=" * 80)

    def _display_skill_gap_analysis(self, result):
        """Display detailed skill gap analysis"""
        user = result['user_profile']
        recs = result['recommendations']

        print("\n" + "=" * 80)
        print("üìö SKILL GAP ANALYSIS")
        print("=" * 80)

        # Collect all required skills
        all_required_skills = set()
        skill_frequency = Counter()

        for rec in recs:
            for skill in rec['missing_skills']:
                all_required_skills.add(skill)
                skill_frequency[skill] += 1

        user_skills = set(user['skills'])

        print(f"\nüìä Overview:")
        print(f"   Your Skills: {len(user_skills)}")
        print(f"   Skills in Demand: {len(all_required_skills)}")
        print(f"   Average Match Rate: {np.mean([r['skill_match_score'] for r in recs]):.1%}")

        if skill_frequency:
            print(f"\nüéØ TOP 10 SKILLS TO LEARN (by frequency in job postings):")
            print("-" * 80)

            for i, (skill, count) in enumerate(skill_frequency.most_common(10), 1):
                percentage = (count / len(recs)) * 100
                bar = "‚ñà" * int(percentage / 5)
                print(f"   {i:2d}. {skill:20s} | {bar:20s} {count:2d}/{len(recs)} jobs ({percentage:5.1f}%)")
        else:
            print("\n‚úÖ Great! You have all the skills required for these positions!")

        # Skills you have that are valuable
        valuable_skills = Counter()
        for rec in recs:
            for skill in rec['matching_skills']:
                valuable_skills[skill] += 1

        if valuable_skills:
            print(f"\nüíé YOUR MOST VALUABLE SKILLS (appearing in recommendations):")
            print("-" * 80)

            for i, (skill, count) in enumerate(valuable_skills.most_common(10), 1):
                percentage = (count / len(recs)) * 100
                bar = "‚ñà" * int(percentage / 5)
                print(f"   {i:2d}. {skill:20s} | {bar:20s} {count:2d}/{len(recs)} jobs ({percentage:5.1f}%)")

        # Learning path recommendation
        print(f"\nüéì RECOMMENDED LEARNING PATH:")
        print("-" * 80)

        top_skills = [skill for skill, _ in skill_frequency.most_common(5)]

        if top_skills:
            print("Based on your job matches, focus on learning these skills in order:")
            for i, skill in enumerate(top_skills, 1):
                count = skill_frequency[skill]
                print(f"   {i}. {skill} - Required by {count} of your top matches")
        else:
            print("   ‚úÖ You're well-prepared for these positions!")

        print("\n" + "=" * 80)

In [18]:
# ============================================================================
# FILE UPLOAD INTERFACE
# ============================================================================

def upload_resume():
    """Interactive resume upload in Colab"""
    if not COLAB_ENV:
        print("‚ö†Ô∏è  Not in Colab. Use: recommend_from_resume('path/to/resume.pdf', 'pdf')")
        return None, None

    print("=" * 80)
    print("üì§ UPLOAD YOUR RESUME")
    print("=" * 80)
    print("Supported formats: PDF, DOCX, TXT")
    print("Click 'Choose Files' button below...\n")

    uploaded = files.upload()

    if not uploaded:
        print("‚ùå No file uploaded!")
        return None, None

    filename = list(uploaded.keys())[0]
    file_ext = filename.split('.')[-1].lower()

    print(f"\n‚úÖ Uploaded: {filename}")
    print(f"üìã File type: {file_ext}")
    print(f"üì¶ Size: {len(uploaded[filename]) / 1024:.1f} KB")

    return filename, file_ext

# ============================================================================
# MAIN EXECUTION
# ============================================================================

print("\n" + "=" * 80)
print("‚úÖ ConvFM JOB RECOMMENDATION SYSTEM READY!")
print("=" * 80)

print("\nüöÄ USAGE:")

# Step 1: Upload your resume
filename, file_type = upload_resume()

# Step 2: Get recommendations
engine = ConvFMJobRecommender()

result = engine.recommend_from_resume(
    resume_path=filename,
    resume_type=file_type,
    location='India',          # Specify location
    num_jobs=100,              # Number of jobs to analyze
    top_k=15,                  # Number of recommendations
    min_skill_match=0.3,       # Minimum 30% skill match
    site_names=['indeed', 'linkedin']
)

# Step 3: Save results
if result:
    with open('job_recommendations.json', 'w') as f:
        json.dump(result, f, indent=2, default=str)
    print("‚úÖ Results saved to job_recommendations.json")


print("\n" + "=" * 80)
print("Key Features:")
print("  ‚úì Uses ConvFM neural network for intelligent matching")
print("  ‚úì Real-time job scraping from Indeed & LinkedIn")
print("  ‚úì Skills-based matching (minimum 30% threshold)")
print("  ‚úì Location-based filtering")
print("  ‚úì Detailed evaluation metrics")
print("  ‚úì Comprehensive skill gap analysis")
print("=" * 80)


‚úÖ ConvFM JOB RECOMMENDATION SYSTEM READY!

üöÄ USAGE:
üì§ UPLOAD YOUR RESUME
Supported formats: PDF, DOCX, TXT
Click 'Choose Files' button below...



Saving KiranDhanvate-resume.pdf to KiranDhanvate-resume (3).pdf

‚úÖ Uploaded: KiranDhanvate-resume (3).pdf
üìã File type: pdf
üì¶ Size: 4042.3 KB

üöÄ STARTING ConvFM JOB RECOMMENDATION

üìÑ Parsing resume: KiranDhanvate-resume (3).pdf
‚úÖ Successfully parsed resume!

Name: Kiran Dhanvate
Email: kirandhanvate735@gmail.com
Phone: +91 9373791110
Skills Found: 28
Experience: 0 years
Education: Bachelors

üîß Top Skills: Azure, C, CI/CD, Computer Vision, Deep Learning, Django, Docker, FastAPI, Flask, Git
       ... and 18 more

üîç SCRAPING REAL-TIME JOBS
   Location: India
   Sites: indeed, linkedin
   Target: 100 jobs

‚úÖ Successfully scraped 113 REAL jobs!
üìä Breakdown:
      - indeed: 59 jobs
      - linkedin: 54 jobs

üîß PROCESSING 113 JOBS
‚úÖ Processed 113 jobs successfully
üìä Skills per job: avg 2.4, max 23

üß† INITIALIZING ConvFM MODEL

üìö Building vocabulary...
‚úÖ Vocabulary size: 1341

üèóÔ∏è Building ConvFM model...
‚úÖ Model initialized with 324,097 paramet

Scoring jobs: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:00<00:00, 16.42it/s]


üìä ConvFM MODEL EVALUATION METRICS

üéØ ConvFM Model Scores:
   Mean Score: 1.0000
   Std Dev: 0.0000
   Min Score: 1.0000
   Max Score: 1.0000

üîß Skill Match Scores:
   Mean Match: 63.2%
   Std Dev: 0.0919
   Min Match: 50.0%
   Max Match: 83.3%

üìà Final Combined Scores (70% ConvFM + 30% Skill Match):
   Mean Score: 0.8897
   Std Dev: 0.0276
   Min Score: 0.8500
   Max Score: 0.9500

‚ú® Recommendation Quality Distribution:
   High Quality (‚â•0.7): 15 jobs (100.0%)
   Medium Quality (0.5-0.7): 0 jobs (0.0%)
   Lower Quality (<0.5): 0 jobs (0.0%)

üéØ TOP JOB RECOMMENDATIONS

üë§ CANDIDATE: Kiran Dhanvate
üìß kirandhanvate735@gmail.com
üì± +91 9373791110
üíº 0 years experience
üéì Bachelors
üîß 28 skills

üìä ANALYSIS:
   ‚úì Jobs Analyzed: 113
   ‚úì Top Matches: 15
   ‚úì Location: India
   ‚úì Min Skill Match: 30%

TOP 15 JOB RECOMMENDATIONS (Ranked by ConvFM)

1. Sr. Dot Net Developer
üè¢ Whitelotus Corporation Pvt Ltd
üìç Remote, IN | fulltime
üí∞ Not specifi




In [20]:
# Transform data and run predictions
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Use preprocessor to transform features
features_np, _ = preprocessor.transform(df_eval)
features = torch.LongTensor(features_np).to(device)

model.eval()
preds = []
with torch.no_grad():
    for i in range(0, len(features), 1024):
        batch = features[i:i+1024]
        out = model(batch)
        preds.append(out.detach().cpu().numpy())

preds = np.concatenate(preds).reshape(-1)

# Attach predictions to dataframe
output_df = df_eval.copy()
output_df['prediction'] = preds

# Print same-style output + metrics (if label available)
print("\nSAMPLE OUTPUT (first 5 rows):")
print(output_df.head()[FEATURE_COLUMNS + ([TARGET_COLUMN] if TARGET_COLUMN else []) + ['prediction']])

if TARGET_COLUMN is not None:
    y_true = df_eval[TARGET_COLUMN].values
    # If target is binary {0,1}, compute classification metrics
    if set(np.unique(y_true)).issubset({0,1}):
        prob = 1 / (1 + np.exp(-preds))  # if logits; if already probs, it's okay
        y_pred = (prob > 0.5).astype(int)
        try:
            auc = roc_auc_score(y_true, prob)
        except:
            auc = float('nan')
        acc = accuracy_score(y_true, y_pred)
        prec = precision_score(y_true, y_pred, zero_division=0)
        rec = recall_score(y_true, y_pred, zero_division=0)
        f1 = f1_score(y_true, y_pred, zero_division=0)
        print("\nMETRICS (Binary Classification):")
        print(f"  AUC:        {auc:.6f}")
        print(f"  Accuracy:   {acc:.6f}")
        print(f"  Precision:  {prec:.6f}")
        print(f"  Recall:     {rec:.6f}")
        print(f"  F1-Score:   {f1:.6f}")
    else:
        # Regression metrics
        from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
        mse = mean_squared_error(y_true, preds)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(y_true, preds)
        r2 = r2_score(y_true, preds)
        print("\nMETRICS (Regression):")
        print(f"  MSE:   {mse:.6f}")
        print(f"  RMSE:  {rmse:.6f}")
        print(f"  MAE:   {mae:.6f}")
        print(f"  R^2:   {r2:.6f}")

# Save predictions
output_path = 'convfm_predictions.csv'
output_df.to_csv(output_path, index=False)
print(f"\n‚úì Predictions saved to: {output_path}")


NameError: name 'preprocessor' is not defined