In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re
import pickle
import time
from scipy.sparse import save_npz, load_npz
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm
import logging
import os
import warnings
import argparse
warnings.filterwarnings("ignore")

# Configure logging
# Root logger settings: INFO and above, timestamped "time - level - message" records
_LOG_SETTINGS = {
    'level': logging.INFO,
    'format': '%(asctime)s - %(levelname)s - %(message)s',
    'datefmt': '%Y-%m-%d %H:%M:%S',
}
logging.basicConfig(**_LOG_SETTINGS)

# Module-level logger shared by every function in this file
logger = logging.getLogger(__name__)

# Download required NLTK resources
# Each entry pairs the lookup path understood by nltk.data.find() with the
# package name understood by nltk.download().
# 'punkt_tab' is the tokenizer data word_tokenize loads on recent NLTK releases;
# 'punkt' is kept so older NLTK releases keep working.
_NLTK_RESOURCES = (
    ('tokenizers/punkt', 'punkt'),
    ('tokenizers/punkt_tab', 'punkt_tab'),
    ('corpora/stopwords', 'stopwords'),
    ('corpora/wordnet', 'wordnet'),
)

# Check every resource on its own so that only the missing ones are fetched
for _resource_path, _package_name in _NLTK_RESOURCES:
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        logger.info(f"Downloading required NLTK resource: {_package_name}")
        nltk.download(_package_name)

# Timer decorator for performance monitoring
def timer_decorator(func):
    """
    Wrap a callable so that the duration of every successful call is logged

    Parameters:
    -----------
    func : callable
        The function to time

    Returns:
    --------
    callable
        A wrapper that forwards all positional and keyword arguments to
        ``func``, returns its result unchanged and logs the elapsed time at
        INFO level. If ``func`` raises, the exception propagates and nothing
        is logged.
    """
    # Local import: keeps this helper independent of the file's import block
    from functools import wraps

    # wraps() copies __name__, __doc__, etc. so decorated functions remain
    # identifiable in logs, tracebacks and help()
    @wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so the measurement is immune to clock changes
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        elapsed = time.perf_counter() - start_time
        logger.info(f"{func.__name__} took {elapsed:.2f} seconds to execute")
        return result
    return wrapper

# Enhanced function to preprocess text
# Domain-specific words that appear in almost every job posting
_CUSTOM_STOPWORDS = frozenset({
    'job', 'work', 'company', 'position', 'required', 'requirements',
    'experience', 'skill', 'skills', 'candidate', 'opportunity', 'role',
})

# Lazily-built NLTK resources shared by every preprocess_text call
_PREPROCESS_CACHE = {}


def _get_stop_words():
    """Return the English + custom stopword set, reading the NLTK corpus only once."""
    if 'stop_words' not in _PREPROCESS_CACHE:
        _PREPROCESS_CACHE['stop_words'] = (
            frozenset(stopwords.words('english')) | _CUSTOM_STOPWORDS
        )
    return _PREPROCESS_CACHE['stop_words']


def _get_lemmatizer():
    """Return a shared WordNetLemmatizer instance, created on first use."""
    if 'lemmatizer' not in _PREPROCESS_CACHE:
        _PREPROCESS_CACHE['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_CACHE['lemmatizer']


def preprocess_text(text, remove_stopwords=True, lemmatize=True):
    """
    Advanced text preprocessing function

    The text is lowercased; URLs, email addresses, punctuation and digits are
    replaced by spaces; the remainder is tokenized, optionally stripped of
    stopwords, optionally lemmatized, and tokens shorter than 3 characters
    are dropped.

    Parameters:
    -----------
    text : str
        Text to preprocess. Any non-string value (None, NaN, numbers, ...)
        yields an empty string.
    remove_stopwords : bool
        Whether to remove stopwords (NLTK English list plus the custom
        job-posting words). Removal happens before lemmatization.
    lemmatize : bool
        Whether to lemmatize words

    Returns:
    --------
    str
        Preprocessed text: the surviving tokens joined by single spaces
    """
    # Non-strings (including NaN, which is a float) carry no text
    if not isinstance(text, str):
        return ''

    # Convert to lowercase
    text = text.lower()

    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', ' ', text)

    # Remove email addresses
    text = re.sub(r'\S+@\S+', ' ', text)

    # Remove special characters and numbers
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\d+', ' ', text)

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    # Nothing left to tokenize: skip the NLTK machinery entirely
    if not text:
        return ''

    # Tokenize
    tokens = word_tokenize(text)

    # Remove stopwords (set is built once and reused across calls)
    if remove_stopwords:
        stop_words = _get_stop_words()
        tokens = [word for word in tokens if word not in stop_words]

    # Lemmatize
    if lemmatize:
        lemmatizer = _get_lemmatizer()
        tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Remove short words (length < 3)
    tokens = [word for word in tokens if len(word) >= 3]

    # Join tokens back into text
    return ' '.join(tokens)

@timer_decorator
def preprocess_dataframe(df, text_columns):
    """
    Add a cleaned ``<column>_processed`` companion for every text column

    Parameters:
    -----------
    df : pandas DataFrame
        Source dataframe; it is copied, so the caller's object is not modified
    text_columns : list
        Names of the text columns to clean with ``preprocess_text``

    Returns:
    --------
    df : pandas DataFrame
        Copy of the input with missing text replaced by '' and one extra
        ``<column>_processed`` column per entry of ``text_columns``
    """
    logger.info("Preprocessing text columns...")

    # Work on a private copy of the caller's dataframe
    df = df.copy()

    # Replace missing text with empty strings before any cleaning starts
    for name in text_columns:
        df[name] = df[name].fillna('')

    # Rows are cleaned in fixed-size slices so the progress bar advances per slice
    batch_size = 10000
    row_count = len(df)
    num_batches = (row_count + batch_size - 1) // batch_size

    for name in text_columns:
        logger.info(f"Preprocessing column: {name}")

        # Pre-allocated holder for the cleaned text, aligned with df's index
        cleaned = pd.Series(index=df.index, dtype='object')

        for batch_no in tqdm(range(num_batches), desc=f"Processing {name}"):
            lo = batch_no * batch_size
            hi = min(lo + batch_size, row_count)
            chunk = df.iloc[lo:hi]
            cleaned.iloc[lo:hi] = chunk[name].apply(preprocess_text).values

        df[f"{name}_processed"] = cleaned

    return df

@timer_decorator
def create_feature_combinations(df, processed_columns, weights=None):
    """
    Build the ``combined_features`` text column from weighted text columns

    A column with weight above 1 contributes its text ``int(weight)`` times;
    any other weight contributes the text once.

    Parameters:
    -----------
    df : pandas DataFrame
        Dataframe holding the processed text columns; the
        ``combined_features`` column is written onto this same object
    processed_columns : list
        Processed text columns to combine; those without an explicit weight
        get weight 1.0
    weights : dict
        Column name -> weight. Entries for columns missing from ``df`` are
        ignored. When None, a built-in default mapping is used.

    Returns:
    --------
    df : pandas DataFrame
        The same dataframe, now with the ``combined_features`` column
    """
    logger.info("Creating feature combinations...")

    # Fall back to the built-in weighting scheme
    if weights is None:
        weights = {
            'job_title_processed': 3.0,
            'descriptions_processed': 2.0,
            'category_processed': 1.5,
            'subcategory_processed': 1.0,
            'role_processed': 2.0
        }

    # Keep only weights whose column exists (a new dict: the caller's is left alone)
    effective_weights = {}
    for name, value in weights.items():
        if name in df.columns:
            effective_weights[name] = value

    # Requested columns without a weight count once
    for name in processed_columns:
        effective_weights.setdefault(name, 1.0)

    # Start from an empty string and append each column's contribution
    df['combined_features'] = ''

    for name, value in effective_weights.items():
        if name not in df.columns:
            continue
        if value > 1:
            # Repeating the text raises its term frequency in the combined document
            piece = df[name].apply(lambda x: ' '.join([str(x)] * int(value)))
        else:
            piece = df[name].astype(str)
        df['combined_features'] += ' ' + piece

    # Drop the leading separator (and any trailing blanks)
    df['combined_features'] = df['combined_features'].str.strip()

    return df

class JobRecommendationSystem:
    """
    Job Recommendation System class that handles training, evaluation, and recommendations

    Index conventions used throughout the class:
    ``train_indices`` / ``test_indices`` hold *positional* row numbers of
    ``self.df`` (the values used with ``.iloc``), and row ``i`` of
    ``tfidf_matrix`` / ``reduced_tfidf_matrix`` / ``job_clusters`` describes
    the job at position ``train_indices[i]``.
    """
    def __init__(self):
        self.vectorizer = None            # fitted TfidfVectorizer
        self.tfidf_matrix = None          # TF-IDF rows of the training jobs
        self.train_indices = None         # positional rows of df used for training
        self.test_indices = None          # positional rows of df held out for evaluation
        self.df = None                    # copy of the training dataframe
        self.svd_model = None             # fitted TruncatedSVD (optional)
        self.reduced_tfidf_matrix = None  # SVD projection of tfidf_matrix (optional)
        self.kmeans_model = None          # fitted KMeans (optional)
        self.job_clusters = None          # cluster label per training row (optional)

    @timer_decorator
    def train_recommendation_model(self, df, feature_column='combined_features',
                                test_size=0.2, random_state=42,
                                max_features=10000, ngram_range=(1, 2),
                                min_df=2, max_df=0.85, use_idf=True):
        """
        Train a TF-IDF based recommendation model

        Parameters:
        -----------
        df : pandas DataFrame
            The data to train the model on
        feature_column : str
            The column to use for training
        test_size : float
            The proportion of the dataset to include in the test split
        random_state : int
            Random state for reproducibility
        max_features : int
            Maximum number of features for TF-IDF
        ngram_range : tuple
            Range of n-grams to consider
        min_df : int or float
            Minimum document frequency
        max_df : float
            Maximum document frequency
        use_idf : bool
            Whether to use inverse document frequency

        Returns:
        --------
        self : JobRecommendationSystem
            The trained model
        """
        logger.info(f"Training recommendation model with {len(df)} samples...")

        # Store the dataframe
        self.df = df.copy()

        # Split the row positions (not the index labels) into train and test sets
        self.train_indices, self.test_indices = train_test_split(
            range(len(df)),
            test_size=test_size,
            random_state=random_state
        )

        logger.info(f"Training set size: {len(self.train_indices)}, Test set size: {len(self.test_indices)}")

        # Get the training data
        train_corpus = df.iloc[self.train_indices][feature_column].tolist()

        # Create and fit TF-IDF Vectorizer
        self.vectorizer = TfidfVectorizer(
            max_features=max_features,
            ngram_range=ngram_range,
            min_df=min_df,
            max_df=max_df,
            use_idf=use_idf,
            strip_accents='unicode',
            analyzer='word',
            sublinear_tf=True  # Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf)
        )

        logger.info("Fitting TF-IDF vectorizer...")
        self.tfidf_matrix = self.vectorizer.fit_transform(train_corpus)
        logger.info(f"TF-IDF matrix shape: {self.tfidf_matrix.shape}")

        return self

    @timer_decorator
    def apply_dimensionality_reduction(self, n_components=300):
        """
        Apply dimensionality reduction to the TF-IDF matrix

        Parameters:
        -----------
        n_components : int
            Number of components for SVD

        Returns:
        --------
        self : JobRecommendationSystem
            The model with reduced dimensionality
        """
        logger.info(f"Applying SVD dimensionality reduction to {n_components} components...")

        self.svd_model = TruncatedSVD(n_components=n_components, random_state=42)
        self.reduced_tfidf_matrix = self.svd_model.fit_transform(self.tfidf_matrix)

        variance_ratio = self.svd_model.explained_variance_ratio_.sum()
        logger.info(f"Explained variance ratio: {variance_ratio:.4f}")

        return self

    @timer_decorator
    def cluster_jobs(self, n_clusters=15):
        """
        Cluster jobs using K-means

        Parameters:
        -----------
        n_clusters : int
            Number of clusters

        Returns:
        --------
        self : JobRecommendationSystem
            The model with job clusters
        """
        logger.info(f"Clustering jobs into {n_clusters} clusters...")

        # Use the reduced matrix if available, otherwise use the original
        matrix_to_cluster = self.reduced_tfidf_matrix if self.reduced_tfidf_matrix is not None else self.tfidf_matrix

        self.kmeans_model = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        self.job_clusters = self.kmeans_model.fit_predict(matrix_to_cluster)

        # Count jobs per cluster (for logging only)
        cluster_counts = pd.Series(self.job_clusters).value_counts().to_dict()
        logger.info(f"Jobs per cluster: {cluster_counts}")

        return self

    @timer_decorator
    def tune_hyperparameters(self, param_grid=None):
        """
        Tune the hyperparameters of the TF-IDF vectorizer

        Parameters:
        -----------
        param_grid : dict
            Dictionary with parameters names as keys and lists of parameter settings

        Returns:
        --------
        best_params : dict
            Best parameters as reported by the grid search (see the review
            note in the body before relying on them)
        """
        # Default parameter grid
        if param_grid is None:
            param_grid = {
                'max_features': [5000, 10000, 15000],
                'ngram_range': [(1, 1), (1, 2)],
                'min_df': [2, 5],
                'max_df': [0.8, 0.9],
                'use_idf': [True, False]
            }

        logger.info("Tuning hyperparameters...")

        # Use a smaller sample for tuning
        sample_size = min(5000, len(self.df))
        sample_df = self.df.sample(sample_size, random_state=42)

        # Prepare the data
        X = sample_df['combined_features']

        # Create a simple pipeline
        vectorizer = TfidfVectorizer()

        # NOTE(review): TfidfVectorizer is a transformer without predict(), so the
        # 'accuracy' scorer has nothing to score; the search is expected to yield
        # NaN scores (or fail, depending on scikit-learn's error handling) rather
        # than a meaningful ranking. Replace `scoring` with a task-specific
        # callable before trusting `best_params_`.
        grid_search = GridSearchCV(
            vectorizer,
            param_grid,
            cv=3,
            scoring='accuracy',  # Not ideal but works for unsupervised
            n_jobs=-1
        )

        # Since we don't have labels, we'll use the same X as both X and y
        # This is just to make GridSearchCV work
        grid_search.fit(X, X)

        best_params = grid_search.best_params_
        logger.info(f"Best parameters: {best_params}")

        return best_params

    def get_recommendations(self, job_id=None, job_text=None, top_n=5, use_clusters=False):
        """
        Get job recommendations based on job ID or job text

        Candidates are always jobs from the training split. When ``job_id``
        is given, that job itself is never returned.

        Parameters:
        -----------
        job_id : int or None
            The job ID to get recommendations for (the first row with this
            ``job_id`` is used). Takes precedence over ``job_text``.
        job_text : str or None
            The job text to get recommendations for
        top_n : int
            The number of recommendations to return
        use_clusters : bool
            Whether to restrict candidates to the K-means cluster predicted
            for the query (ignored when no K-means model is available)

        Returns:
        --------
        recommendations : pandas DataFrame or None
            Up to ``top_n`` recommended jobs, most similar first, with a
            ``similarity_score`` column. None when the job ID is unknown,
            the text is empty after preprocessing, or neither input is given.
        """
        if job_id is not None:
            # Locate the job by row *position*: train_indices and .iloc are
            # positional, so an index label would be wrong for any dataframe
            # whose index is not 0..n-1
            id_mask = (self.df['job_id'] == job_id).to_numpy()
            if not id_mask.any():
                logger.error(f"Job ID {job_id} not found in the dataset")
                return None
            job_idx = int(id_mask.argmax())  # position of the first match

            # Get the job features
            job_features = self.df.iloc[job_idx]['combined_features']

        elif job_text is not None:
            # Preprocess the job text
            job_features = preprocess_text(job_text)

            # If the job text is empty after preprocessing, return None
            if not job_features:
                logger.error("Job text is empty after preprocessing")
                return None

            # Set job_idx to None to indicate it's a new job
            job_idx = None

        else:
            logger.error("Either job_id or job_text must be provided")
            return None

        # Transform the job features into the same space as the training matrix
        job_vector = self.vectorizer.transform([job_features])
        if self.svd_model is not None:
            job_vector = self.svd_model.transform(job_vector)
            matrix_to_use = self.reduced_tfidf_matrix
        else:
            matrix_to_use = self.tfidf_matrix

        # train_positions[r] is the dataframe position of training-matrix row r
        train_positions = np.asarray(self.train_indices, dtype=int)
        candidate_rows = np.arange(len(train_positions))
        candidate_matrix = matrix_to_use

        # Optionally keep only the training rows in the query's cluster
        if use_clusters and self.kmeans_model is not None:
            job_cluster = self.kmeans_model.predict(job_vector)[0]
            in_cluster = np.asarray(self.job_clusters) == job_cluster
            candidate_rows = candidate_rows[in_cluster]
            candidate_matrix = matrix_to_use[candidate_rows]

        # Calculate cosine similarity against the candidates
        if candidate_rows.size > 0:
            similarity_scores = cosine_similarity(job_vector, candidate_matrix).flatten()
        else:
            similarity_scores = np.empty(0)

        # Drop the query job itself, removing its score together with its row
        # so that scores and rows stay aligned
        if job_idx is not None:
            not_self = train_positions[candidate_rows] != job_idx
            candidate_rows = candidate_rows[not_self]
            similarity_scores = similarity_scores[not_self]

        # Positions (within the candidates) of the top N scores, best first
        top_order = similarity_scores.argsort()[::-1][:max(top_n, 0)]

        # Map back to dataframe positions
        similar_job_indices = train_positions[candidate_rows[top_order]]

        recommendations = self.df.iloc[similar_job_indices][
            ['job_id', 'job_title', 'company', 'location', 'category', 'role', 'type', 'salary']
        ].copy()

        # Scores are taken with the same ordering as the rows above
        recommendations['similarity_score'] = similarity_scores[top_order]

        return recommendations

    @timer_decorator
    def evaluate_model(self, metric='cosine_similarity', sample_size=100, random_state=None):
        """
        Evaluate the model using various metrics

        Parameters:
        -----------
        metric : str
            'cosine_similarity': mean similarity between each sampled test job
            and its 5 most similar training jobs.
            'category_match': mean of 0.5 * category match rate + 0.5 * role
            match rate over the top-5 recommendations of each sampled test job.
        sample_size : int
            The number of test samples to evaluate
        random_state : int or None
            Seed for drawing the test sample. None (default) uses NumPy's
            global random state, as before.

        Returns:
        --------
        score : float
            The evaluation score; 0 for an unknown metric or when there are
            no test samples
        """
        logger.info(f"Evaluating model using {metric} metric...")

        if metric not in ('cosine_similarity', 'category_match'):
            logger.error(f"Unknown metric: {metric}")
            return 0

        # Use only sample_size jobs from the test set for evaluation
        test_sample = min(sample_size, len(self.test_indices))
        if test_sample <= 0:
            # Avoids dividing by zero below
            logger.warning("No test samples available for evaluation")
            return 0

        rng = np.random if random_state is None else np.random.RandomState(random_state)
        test_sample_indices = rng.choice(self.test_indices, test_sample, replace=False)

        if metric == 'cosine_similarity':
            # The comparison matrix does not change between test jobs
            matrix_to_use = self.reduced_tfidf_matrix if self.svd_model is not None else self.tfidf_matrix

            avg_similarity = 0
            for idx in tqdm(test_sample_indices, desc="Evaluating"):
                job_features = self.df.iloc[idx]['combined_features']
                job_vector = self.vectorizer.transform([job_features])

                # Apply dimensionality reduction if available
                if self.svd_model is not None:
                    job_vector = self.svd_model.transform(job_vector)

                # Calculate cosine similarity
                similarity_scores = cosine_similarity(job_vector, matrix_to_use).flatten()

                # Find the 5 most similar jobs in the training set
                top_indices = similarity_scores.argsort()[:-6:-1]

                # Calculate average similarity for this job
                avg_similarity += similarity_scores[top_indices].mean()

            avg_similarity /= test_sample
            return avg_similarity

        # metric == 'category_match': evaluate based on category/role match rate
        match_rate = 0
        for idx in tqdm(test_sample_indices, desc="Evaluating"):
            test_job = self.df.iloc[idx]

            # Get recommendations
            recommendations = self.get_recommendations(job_id=test_job['job_id'], top_n=5)

            if recommendations is not None and not recommendations.empty:
                # Calculate category match rate
                category_matches = (recommendations['category'] == test_job['category']).mean()
                role_matches = (recommendations['role'] == test_job['role']).mean()

                # Weighted average
                match_rate += 0.5 * category_matches + 0.5 * role_matches

        # Jobs without recommendations count as a 0 match
        match_rate /= test_sample
        return match_rate

    def save_model(self, model_dir="models"):
        """
        Save the model to disk

        The dataframe itself is NOT saved (only its column names and index go
        into the metadata); pass it again to ``load_model``.

        Parameters:
        -----------
        model_dir : str
            Directory to save the model
        """
        # Create directory if it doesn't exist
        os.makedirs(model_dir, exist_ok=True)

        # Save the vectorizer
        with open(os.path.join(model_dir, "vectorizer.pkl"), "wb") as f:
            pickle.dump(self.vectorizer, f)

        # Save the TF-IDF matrix as a sparse matrix
        save_npz(os.path.join(model_dir, "tfidf_matrix.npz"), self.tfidf_matrix)

        # Save the train indices
        with open(os.path.join(model_dir, "train_indices.pkl"), "wb") as f:
            pickle.dump(self.train_indices, f)

        # Save the test indices so a loaded model can still be evaluated
        with open(os.path.join(model_dir, "test_indices.pkl"), "wb") as f:
            pickle.dump(self.test_indices, f)

        # Save the SVD model if available
        if self.svd_model is not None:
            with open(os.path.join(model_dir, "svd_model.pkl"), "wb") as f:
                pickle.dump(self.svd_model, f)

            # Save the reduced TF-IDF matrix
            np.save(os.path.join(model_dir, "reduced_tfidf_matrix.npy"), self.reduced_tfidf_matrix)

        # Save the K-means model if available
        if self.kmeans_model is not None:
            with open(os.path.join(model_dir, "kmeans_model.pkl"), "wb") as f:
                pickle.dump(self.kmeans_model, f)

            # Save the job clusters
            np.save(os.path.join(model_dir, "job_clusters.npy"), self.job_clusters)

        # Save the model metadata
        metadata = {
            'df_columns': self.df.columns.tolist(),
            'df_index': self.df.index.tolist(),
            'has_svd': self.svd_model is not None,
            'has_kmeans': self.kmeans_model is not None
        }

        with open(os.path.join(model_dir, "metadata.pkl"), "wb") as f:
            pickle.dump(metadata, f)

        logger.info(f"Model saved to {model_dir}")

    @classmethod
    def load_model(cls, model_dir="models", df=None):
        """
        Load the model from disk

        NOTE: the files are read with ``pickle``, which can execute arbitrary
        code while loading; only load model directories from trusted sources.

        Parameters:
        -----------
        model_dir : str
            Directory to load the model from
        df : pandas DataFrame
            The dataframe the model was trained on, with rows in the same
            order. ``save_model`` does not store it, so recommendations and
            evaluation are unavailable until it is provided.

        Returns:
        --------
        model : JobRecommendationSystem
            The loaded model
        """
        model = cls()

        # Load the vectorizer
        with open(os.path.join(model_dir, "vectorizer.pkl"), "rb") as f:
            model.vectorizer = pickle.load(f)

        # Load the TF-IDF matrix
        model.tfidf_matrix = load_npz(os.path.join(model_dir, "tfidf_matrix.npz"))

        # Load the train indices
        with open(os.path.join(model_dir, "train_indices.pkl"), "rb") as f:
            model.train_indices = pickle.load(f)

        # Load the test indices; absent in directories written by older versions
        test_indices_path = os.path.join(model_dir, "test_indices.pkl")
        if os.path.exists(test_indices_path):
            with open(test_indices_path, "rb") as f:
                model.test_indices = pickle.load(f)

        # Load the model metadata
        with open(os.path.join(model_dir, "metadata.pkl"), "rb") as f:
            metadata = pickle.load(f)

        # Load the SVD model if available
        svd_path = os.path.join(model_dir, "svd_model.pkl")
        if metadata['has_svd'] and os.path.exists(svd_path):
            with open(svd_path, "rb") as f:
                model.svd_model = pickle.load(f)

            # Load the reduced TF-IDF matrix
            model.reduced_tfidf_matrix = np.load(os.path.join(model_dir, "reduced_tfidf_matrix.npy"))

        # Load the K-means model if available
        kmeans_path = os.path.join(model_dir, "kmeans_model.pkl")
        if metadata['has_kmeans'] and os.path.exists(kmeans_path):
            with open(kmeans_path, "rb") as f:
                model.kmeans_model = pickle.load(f)

            # Load the job clusters
            model.job_clusters = np.load(os.path.join(model_dir, "job_clusters.npy"))

        # Set the dataframe
        model.df = df
        if df is None:
            logger.warning("No dataframe supplied; set model.df before requesting recommendations")

        logger.info(f"Model loaded from {model_dir}")
        return model

# Train multiple models with different configurations
@timer_decorator
def train_multiple_models(df, config_grid, eval_metric='cosine_similarity'):
    """
    Train and score one model per configuration, then pick the top scorer

    Parameters:
    -----------
    df : pandas DataFrame
        The dataframe every model is trained on
    config_grid : dict
        Configuration name -> dict of settings. Training keywords are passed
        to ``train_recommendation_model``; a positive ``n_components`` adds
        SVD and a positive ``n_clusters`` adds K-means clustering.
    eval_metric : str
        Metric handed to ``evaluate_model``

    Returns:
    --------
    best_model : JobRecommendationSystem
        The model with the highest score
    all_models : dict
        Configuration name -> trained model
    results : dict
        Configuration name -> {'score': ..., 'config': ...}
    """
    logger.info("Training multiple models with different configurations...")

    # Settings that belong to train_recommendation_model
    training_keys = ('test_size', 'random_state', 'max_features',
                     'ngram_range', 'min_df', 'max_df', 'use_idf')

    all_models = {}
    results = {}

    for config_name, config in config_grid.items():
        logger.info(f"Training model with configuration: {config_name}")

        # Fit the TF-IDF model with the training-related settings only
        model = JobRecommendationSystem()
        train_params = {key: config[key] for key in training_keys if key in config}
        model.train_recommendation_model(df, **train_params)

        # Optional SVD stage
        if 'n_components' in config and config['n_components'] > 0:
            model.apply_dimensionality_reduction(n_components=config['n_components'])

        # Optional clustering stage
        if 'n_clusters' in config and config['n_clusters'] > 0:
            model.cluster_jobs(n_clusters=config['n_clusters'])

        # Score the finished model and record everything under its name
        score = model.evaluate_model(metric=eval_metric)
        all_models[config_name] = model
        results[config_name] = {
            'score': score,
            'config': config
        }

        logger.info(f"Model {config_name} - {eval_metric}: {score:.4f}")

    # The winner is the configuration with the highest score
    best_config_name = max(results, key=lambda name: results[name]['score'])
    best_model = all_models[best_config_name]

    logger.info(f"Best model: {best_config_name} with {eval_metric}: {results[best_config_name]['score']:.4f}")

    return best_model, all_models, results

# Visualize model results
def visualize_results(results, metric_name='Score', fig_size=(12, 6),
                      output_path='model_comparison.png'):
    """
    Draw a bar chart of model scores and save it as an image

    Parameters:
    -----------
    results : dict
        Configuration name -> dict containing a 'score' entry
    metric_name : str
        Name of the metric (used for the y-axis label and the title)
    fig_size : tuple
        Figure size
    output_path : str
        File the chart is written to (defaults to 'model_comparison.png'
        in the current working directory)
    """
    # Extract scores
    models = list(results)
    scores = [results[model]['score'] for model in models]

    # Create figure
    fig = plt.figure(figsize=fig_size)

    try:
        # Create bar chart
        plt.bar(models, scores)

        # Add labels and title
        plt.xlabel('Model Configuration')
        plt.ylabel(metric_name)
        plt.title(f'Model Performance Comparison ({metric_name})')

        # Rotate x-axis labels
        plt.xticks(rotation=45, ha='right')

        # Add value labels on top of bars
        for i, score in enumerate(scores):
            plt.text(i, score + 0.01, f'{score:.4f}', ha='center')

        # Adjust layout
        plt.tight_layout()

        # Save figure
        plt.savefig(output_path)
    finally:
        # Release the figure even when plotting or saving fails
        plt.close(fig)

# Main execution
def _log_recommendations(recommendations, heading):
    """Log ``heading`` followed by a numbered summary of each recommended job."""
    logger.info(heading)

    # get_recommendations returns None when it cannot build a query
    if recommendations is None or recommendations.empty:
        logger.warning("No recommendations available")
        return

    for i, (_, rec) in enumerate(recommendations.iterrows(), 1):
        logger.info(f"{i}. {rec['job_title']} at {rec['company']}")
        logger.info(f"   Category: {rec['category']}, Role: {rec['role']}")
        logger.info(f"   Similarity Score: {rec['similarity_score']:.4f}")


def main(data_path='jobstreet_all_job_dataset.csv', model_dir='job_recommender_models'):
    """
    Run the full pipeline: load, preprocess, train, evaluate, save, demo

    Parameters:
    -----------
    data_path : str
        CSV file with the job postings
    model_dir : str
        Directory that receives the best model and the processed jobs CSV
    """
    # Start timer
    start_time = time.time()

    # Load the dataset
    logger.info("Loading dataset...")
    df = pd.read_csv(data_path)
    logger.info(f"Dataset loaded successfully! Shape: {df.shape}")

    # Basic data exploration
    logger.info("Performing basic data exploration...")

    # Check for missing values
    missing_values = df.isnull().sum()
    logger.info(f"Missing values:\n{missing_values}")

    # Fill missing values
    df = df.fillna({
        'job_title': '',
        'descriptions': '',
        'category': 'Unknown',
        'subcategory': 'Unknown',
        'role': 'Unknown'
    })

    # Display sample of data
    logger.info("Sample data:")
    logger.info(df.head())

    # Preprocess the dataframe
    text_columns = ['job_title', 'descriptions', 'category', 'subcategory', 'role']
    df = preprocess_dataframe(df, text_columns)

    # Create feature combinations with weighted columns
    weights = {
        'job_title_processed': 3.0,    # Job title is very important
        'descriptions_processed': 2.0,  # Job description contains detailed requirements
        'category_processed': 1.5,      # Category provides general field
        'subcategory_processed': 1.0,   # Subcategory has specific domain
        'role_processed': 2.0           # Role indicates position level
    }

    df = create_feature_combinations(df,
                                   [col + '_processed' for col in text_columns],
                                   weights=weights)

    # Define model configurations to try
    config_grid = {
        'basic': {
            'test_size': 0.2,
            'random_state': 42,
            'max_features': 10000,
            'ngram_range': (1, 1),
            'min_df': 2,
            'max_df': 0.85,
            'use_idf': True
        },
        'advanced': {
            'test_size': 0.2,
            'random_state': 42,
            'max_features': 15000,
            'ngram_range': (1, 2),
            'min_df': 2,
            'max_df': 0.8,
            'use_idf': True,
            'n_components': 300  # Use SVD
        },
        'clustered': {
            'test_size': 0.2,
            'random_state': 42,
            'max_features': 15000,
            'ngram_range': (1, 2),
            'min_df': 2,
            'max_df': 0.8,
            'use_idf': True,
            'n_components': 300,  # Use SVD
            'n_clusters': 20      # Use clustering
        }
    }

    # Train multiple models and get the best one
    best_model, all_models, results = train_multiple_models(df, config_grid, eval_metric='cosine_similarity')

    # Visualize the results
    visualize_results(results, metric_name='Cosine Similarity')

    # Evaluate the best model with category match metric as well
    category_match_score = best_model.evaluate_model(metric='category_match')
    logger.info(f"Best model - Category match score: {category_match_score:.4f}")

    # Save the best model (this also creates model_dir)
    best_model.save_model(model_dir)

    # Save the processed dataframe for future use
    df[['job_id', 'job_title', 'company', 'descriptions', 'location', 'category',
       'subcategory', 'role', 'type', 'salary', 'combined_features']].to_csv(
        os.path.join(model_dir, 'processed_jobs.csv'), index=False
    )

    # Create a simple example recommendation
    logger.info("\nGenerating example recommendations...")

    # Use the first job in the dataset as the sample query
    sample_job_id = df['job_id'].iloc[0]
    sample_job = df[df['job_id'] == sample_job_id].iloc[0]

    logger.info(f"\nSample Job: {sample_job['job_title']} at {sample_job['company']}")
    logger.info(f"Category: {sample_job['category']}, Role: {sample_job['role']}")

    # Get and display recommendations
    recommendations = best_model.get_recommendations(job_id=sample_job_id, top_n=5)
    _log_recommendations(recommendations, "\nTop 5 Recommendations:")

    # Get recommendations using text
    logger.info("\nGenerating recommendations from job text...")

    # Create a sample job text
    sample_job_text = """
    Data Scientist - Lead a team of data scientists to develop and implement machine learning models.
    Requirements: 5+ years of experience in data science, Python, SQL, and machine learning.
    """

    # Get and display recommendations
    text_recommendations = best_model.get_recommendations(job_text=sample_job_text, top_n=5)
    _log_recommendations(text_recommendations, "\nTop 5 Recommendations for sample job text:")

    # Calculate execution time
    end_time = time.time()
    execution_time = end_time - start_time
    logger.info(f"\nTotal execution time: {execution_time:.2f} seconds")

    logger.info("\nJob recommendation system completed successfully!")

if __name__ == "__main__":
    main()
    