# Understanding sequence prediction 

Sequence prediction is about guessing the next step in a series. It uses what happened before. It is key in making guesses right in many areas like guessing what you want to see on the web, what you might buy next, or what the weather will be.

There are several ways to guess in sequence prediction. Each way has its good points. Some use Markov models. These guess by looking at the last step. They are easy and good at guessing.

Markov models: These models guess the next step by looking at the current step only. They are simple and do their job well.

Directed graphs: These show how steps relate. They draw a picture of what comes next based on what happened before. This helps see how things are connected.

Recurrent neural networks (RNNs): RNNs are good at remembering what happened in a series step by step. They can spot long patterns well, so they make good guesses about the future.
Each way to guess has its own good and bad points. Picking the right way depends on what you need to think. In the next part, we will talk more about these ways. We will see how they work in different areas.

In [2]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import bokeh as bk
import json
from ast import literal_eval

In [3]:
x_train = pd.read_csv(r"C:\Users\PC\Desktop\HRFlow - Recommandation d'emploi basée sur le comportement\x_train_Meacfjr.csv")
y_train = pd.read_csv(r"C:\Users\PC\Desktop\HRFlow - Recommandation d'emploi basée sur le comportement\y_train_SwJNMSu.csv")
x_test = pd.read_csv(r"C:\Users\PC\Desktop\HRFlow - Recommandation d'emploi basée sur le comportement\x_test_jCBBNP2.csv")
display(x_train.head())
display(y_train.head())

Unnamed: 0,session_id,job_ids,actions
0,0,"[305, 299, 300, 290, 282, 274, 264, 261]","['view', 'view', 'view', 'view', 'view', 'view..."
1,1,"[84, 257, 252, 250]","['view', 'view', 'view', 'view']"
2,2,"[241, 237, 221, 309, 310, 306, 301]","['view', 'view', 'apply', 'apply', 'apply', 'a..."
3,3,"[303, 297, 296, 298, 294, 295, 292, 293]","['apply', 'apply', 'apply', 'apply', 'apply', ..."
4,4,"[171, 291, 289, 166, 288, 155]","['apply', 'apply', 'apply', 'apply', 'apply', ..."


Unnamed: 0,session_id,job_id,action
0,0,84,view
1,1,241,view
2,2,303,apply
3,3,171,apply
4,4,286,apply


In [4]:
def convert_string_to_list(df, column):
    if df[column].dtype == 'object':
        df[column] = df[column].apply(lambda x: literal_eval(x) if isinstance(x, str) else x)
    return df

# Appliquer aux colonnes job_ids et actions
X_train = convert_string_to_list(x_train, 'job_ids')
X_train = convert_string_to_list(x_train, 'actions')

In [5]:
# Ajouter des features sur les séquences
X_train['session_length'] = X_train['job_ids'].apply(len)
X_train['unique_jobs_ratio'] = X_train['job_ids'].apply(lambda x: len(set(x))/len(x) if len(x) > 0 else 0)
X_train['apply_ratio'] = X_train['actions'].apply(lambda x: x.count('apply')/len(x) if len(x) > 0 else 0)


In [6]:
X_train

Unnamed: 0,session_id,job_ids,actions,session_length,unique_jobs_ratio,apply_ratio
0,0,"[305, 299, 300, 290, 282, 274, 264, 261]","[view, view, view, view, view, view, view, view]",8,1.0,0.000000
1,1,"[84, 257, 252, 250]","[view, view, view, view]",4,1.0,0.000000
2,2,"[241, 237, 221, 309, 310, 306, 301]","[view, view, apply, apply, apply, apply, apply]",7,1.0,0.714286
3,3,"[303, 297, 296, 298, 294, 295, 292, 293]","[apply, apply, apply, apply, apply, apply, app...",8,1.0,1.000000
4,4,"[171, 291, 289, 166, 288, 155]","[apply, apply, apply, apply, apply, apply]",6,1.0,1.000000
...,...,...,...,...,...,...
15877,15877,"[26581, 27314, 27305, 27327, 27138, 27153]","[apply, apply, apply, apply, apply, apply]",6,1.0,1.000000
15878,15878,"[27220, 27219, 27194]","[view, view, view]",3,1.0,0.000000
15879,15879,"[27211, 27210, 27209]","[view, view, view]",3,1.0,0.000000
15880,15880,"[27233, 27220, 27219, 27232, 27231]","[apply, view, view, view, view]",5,1.0,0.200000


In [7]:
with open('job_listings.json') as f:
    data = json.load(f)

emploi = pd.json_normalize(data)

In [8]:
emploi.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,27359,27360,27361,27362,27363,27364,27365,27366,27367,27368
0,TITLE\nQA Intégration / Data Analyst - SalesF...,TITLE\nIngénieur Système\n\nSUMMARY\nNous re...,TITLE\nTesteur QA Automatisation Cypress\n\nSU...,TITLE\nIngénieur support N3 IP - PARIS \n\nSU...,TITLE\nBusiness Analyst MOA FRONT\n\nSUMMARY\n...,TITLE\nBusiness Analyst SAP S/4\n\nSUMMARY\nNo...,TITLE\nSalesforce Marketing Cloud Product Owne...,TITLE\nResponsable Sécurité Opérationnel \n...,TITLE\nArchitecte d'entreprise Data BI\n\nSUMM...,TITLE\nIngénieur.e QA web / mobile - (Haute-S...,...,TITLE\nParis-Administrateur Système DB2 - Z/O...,TITLE\nIngénieur d’intégration applicative\n...,"TITLE\nConsultant MS BI Confirmé Build & Run,...",TITLE\nData modeler / Senior\n\nSUMMARY\nNous ...,TITLE\nLyon-Développeur Cobol Mainframe-R2030...,TITLE\nIncident Manager e-commerce\n\nSUMMARY\...,TITLE\nConsultant Azure Security\n\nSUMMARY\nK...,TITLE\nChef de projet Supply Chain\n\nSUMMARY\...,TITLE\nPO Infrastructure\n\nSUMMARY\nUn Produc...,"TITLE\nData Engineer Senior Spark, Scala, Data..."


In [9]:
emploi = emploi.transpose()

In [10]:
# Rename column 0 to job_desciption
emploi.rename(columns={0: 'job_description'}, inplace=True)

In [11]:
def extract_job_sections(df, text_column='description'):
    """
    Extract job sections from text descriptions based on flags like TITLE, SUMMARY, etc.
    
    Parameters:
    -----------
    df : pandas DataFrame
        DataFrame containing the job descriptions
    text_column : str, default='description'
        The column containing the job description text
        
    Returns:
    --------
    pandas DataFrame
        Original DataFrame with added columns for job_title and summary
    """
    import re
    
    # Create copies of columns to avoid modifying the original DataFrame
    df = df.copy()
    
    # Initialize new columns
    df['job_title'] = ''
    df['summary'] = ''
    
    for idx, row in df.iterrows():
        text = row[text_column]
        
        # Extract job title (between TITLE and SUMMARY)
        title_match = re.search(r'TITLE\s*\n(.*?)(?=\s*\n\s*SUMMARY)', text, re.DOTALL)
        if title_match:
            df.at[idx, 'job_title'] = title_match.group(1).strip()
        
        # Extract summary (after SUMMARY and before any other section flag)
        summary_match = re.search(r'SUMMARY\s*\n(.*?)(?=\s*\n\s*SECTION|\Z)', text, re.DOTALL)
        if summary_match:
            df.at[idx, 'summary'] = summary_match.group(1).strip()
    
    return df


In [12]:

processed_emploi = extract_job_sections(emploi, text_column='job_description')

# Display the first few rows to check the results
processed_emploi[['job_title', 'summary']].head()


Unnamed: 0,job_title,summary
0,QA Intégration / Data Analyst - SalesForces S...,Responsabilités :\nAssurer la qualité des do...
1,Ingénieur Système,Nous recherchons un Ingénieur Système pour n...
2,Testeur QA Automatisation Cypress,Vous avez au moins une première expérience s...
3,Ingénieur support N3 IP - PARIS,Dans le cadre de cette mission :\nVous garanti...
4,Business Analyst MOA FRONT,Nous recherchons un (e) consultant(e) ayant un...


In [13]:
processed_emploi.drop(columns=["job_description"], inplace=True)

In [14]:
import re
processed_emploi.index = processed_emploi.index.astype(int)
processed_emploi.index.name = 'job_id'

# Create more features from the job text data
def extract_additional_features(df):
    """Extract additional features from job descriptions"""
    # Initialize new columns
    df['has_salary_info'] = df['summary'].str.contains('salary|compensation|pay', case=False, regex=True)
    df['has_remote_option'] = df['summary'].str.contains('remote|work from home|telecommute', case=False, regex=True)
    df['experience_level'] = df['summary'].apply(lambda x: 
        'senior' if re.search(r'senior|experienced|[5-9]\+?\s*years', str(x), re.I) else
        'mid' if re.search(r'mid|intermediate|[2-4]\+?\s*years', str(x), re.I) else
        'junior' if re.search(r'junior|entry|graduate|[0-1]\+?\s*years', str(x), re.I) else
        'unknown')
    
    # Extract potential job categories using keyword matching
    categories = ['engineering', 'marketing', 'sales', 'finance', 'hr', 'customer service', 
                 'data science', 'design', 'product', 'operations']
    
    for category in categories:
        df[f'is_{category.replace(" ", "_")}'] = df['job_title'].str.contains(
            category, case=False, regex=True) | df['summary'].str.contains(category, case=False, regex=True)
    
    return df

processed_emploi = extract_additional_features(processed_emploi)


In [15]:
# Create TF-IDF vectors for job titles and summaries
from sklearn.feature_extraction.text import TfidfVectorizer

# For job titles
title_vectorizer = TfidfVectorizer(max_features=100, stop_words='english')
title_vectors = title_vectorizer.fit_transform(processed_emploi['job_title'].fillna(''))

# For job summaries
summary_vectorizer = TfidfVectorizer(max_features=200, stop_words='english')
summary_vectors = summary_vectorizer.fit_transform(processed_emploi['summary'].fillna(''))

# Convert to DataFrame for easier handling
title_features = pd.DataFrame(
    title_vectors.toarray(), 
    index=processed_emploi.index,
    columns=[f'title_term_{i}' for i in range(title_vectors.shape[1])]
)

summary_features = pd.DataFrame(
    summary_vectors.toarray(), 
    index=processed_emploi.index,
    columns=[f'summary_term_{i}' for i in range(summary_vectors.shape[1])]
)

# Merge with the main job dataframe
job_features = pd.concat([processed_emploi, title_features, summary_features], axis=1)

In [17]:
processed_emploi

Unnamed: 0_level_0,job_title,summary,has_salary_info,has_remote_option,experience_level,is_engineering,is_marketing,is_sales,is_finance,is_hr,is_customer_service,is_data_science,is_design,is_product,is_operations
job_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,QA Intégration / Data Analyst - SalesForces S...,Responsabilités :\nAssurer la qualité des do...,False,False,unknown,False,False,True,False,False,False,False,False,False,False
1,Ingénieur Système,Nous recherchons un Ingénieur Système pour n...,False,False,mid,False,False,False,False,False,False,False,False,False,False
2,Testeur QA Automatisation Cypress,Vous avez au moins une première expérience s...,False,False,unknown,False,False,False,False,False,False,False,False,False,False
3,Ingénieur support N3 IP - PARIS,Dans le cadre de cette mission :\nVous garanti...,False,False,unknown,False,False,False,False,False,False,False,False,False,False
4,Business Analyst MOA FRONT,Nous recherchons un (e) consultant(e) ayant un...,False,False,unknown,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27364,Incident Manager e-commerce,En charge de la gestion des différents incide...,False,False,unknown,False,False,True,False,False,False,False,False,True,False
27365,Consultant Azure Security,KatchMe est un cabinet de placement de consult...,False,False,unknown,False,False,False,False,False,False,False,False,False,False
27366,Chef de projet Supply Chain,"Beager recherche pour l’un de ses clients, act...",False,False,unknown,False,False,False,False,False,False,False,False,True,False
27367,PO Infrastructure,Un Product Owner Infrastructure confirmé (min...,False,False,unknown,False,False,False,False,False,False,False,False,True,False


In [18]:
X_train

Unnamed: 0,session_id,job_ids,actions,session_length,unique_jobs_ratio,apply_ratio
0,0,"[305, 299, 300, 290, 282, 274, 264, 261]","[view, view, view, view, view, view, view, view]",8,1.0,0.000000
1,1,"[84, 257, 252, 250]","[view, view, view, view]",4,1.0,0.000000
2,2,"[241, 237, 221, 309, 310, 306, 301]","[view, view, apply, apply, apply, apply, apply]",7,1.0,0.714286
3,3,"[303, 297, 296, 298, 294, 295, 292, 293]","[apply, apply, apply, apply, apply, apply, app...",8,1.0,1.000000
4,4,"[171, 291, 289, 166, 288, 155]","[apply, apply, apply, apply, apply, apply]",6,1.0,1.000000
...,...,...,...,...,...,...
15877,15877,"[26581, 27314, 27305, 27327, 27138, 27153]","[apply, apply, apply, apply, apply, apply]",6,1.0,1.000000
15878,15878,"[27220, 27219, 27194]","[view, view, view]",3,1.0,0.000000
15879,15879,"[27211, 27210, 27209]","[view, view, view]",3,1.0,0.000000
15880,15880,"[27233, 27220, 27219, 27232, 27231]","[apply, view, view, view, view]",5,1.0,0.200000


In [19]:
# Create features based on session behavior
def create_session_features(X_train, y_train):
    """Create features based on session behavior"""
    # Merge with target data to analyze patterns
    session_data = pd.merge(X_train, y_train, on='session_id', how='left')
    
    # Calculate session-level statistics
    session_stats = session_data.groupby('session_id').agg({
        'session_length': 'first',
        'unique_jobs_ratio': 'first',
        'apply_ratio': 'first',
        'action': lambda x: 'apply' if 'apply' in x.values else 'view'
    }).reset_index()
    
    # Calculate transition probabilities between jobs
    transitions = {}
    
    for _, row in X_train.iterrows():
        job_sequence = row['job_ids']
        
        for i in range(len(job_sequence) - 1):
            current_job = job_sequence[i]
            next_job = job_sequence[i + 1]
            
            if current_job not in transitions:
                transitions[current_job] = {}
            
            if next_job not in transitions[current_job]:
                transitions[current_job][next_job] = 0
                
            transitions[current_job][next_job] += 1
    
    # Normalize transition counts to probabilities
    for job_id in transitions:
        total = sum(transitions[job_id].values())
        for next_job in transitions[job_id]:
            transitions[job_id][next_job] /= total
    
    return session_stats, transitions

session_stats, transitions = create_session_features(X_train, y_train)


In [22]:
X_train

Unnamed: 0,session_id,job_ids,actions,session_length,unique_jobs_ratio,apply_ratio
0,0,"[305, 299, 300, 290, 282, 274, 264, 261]","[view, view, view, view, view, view, view, view]",8,1.0,0.000000
1,1,"[84, 257, 252, 250]","[view, view, view, view]",4,1.0,0.000000
2,2,"[241, 237, 221, 309, 310, 306, 301]","[view, view, apply, apply, apply, apply, apply]",7,1.0,0.714286
3,3,"[303, 297, 296, 298, 294, 295, 292, 293]","[apply, apply, apply, apply, apply, apply, app...",8,1.0,1.000000
4,4,"[171, 291, 289, 166, 288, 155]","[apply, apply, apply, apply, apply, apply]",6,1.0,1.000000
...,...,...,...,...,...,...
15877,15877,"[26581, 27314, 27305, 27327, 27138, 27153]","[apply, apply, apply, apply, apply, apply]",6,1.0,1.000000
15878,15878,"[27220, 27219, 27194]","[view, view, view]",3,1.0,0.000000
15879,15879,"[27211, 27210, 27209]","[view, view, view]",3,1.0,0.000000
15880,15880,"[27233, 27220, 27219, 27232, 27231]","[apply, view, view, view, view]",5,1.0,0.200000


In [25]:
# Create a co-occurrence matrix for job IDs
from collections import defaultdict
import numpy as np

# Count co-occurrences within sequences
co_occurrence = defaultdict(lambda: defaultdict(int))

for sequence in X_train['job_ids']:
    for i, job1 in enumerate(sequence):
        for j, job2 in enumerate(sequence):
            if i != j:
                # Weight by proximity (closer jobs have higher weight)
                weight = 1.0 / (abs(i - j) + 1)
                co_occurrence[job1][job2] += weight

# Get all unique job IDs
all_job_ids = list(set(job_id for seq in X_train['job_ids'] for job_id in seq))

# Create embedding matrix (50 dimensions)
embedding_size = 50
np.random.seed(42)
random_embeddings = {job_id: np.random.randn(embedding_size) for job_id in all_job_ids}

# Refine embeddings based on co-occurrence (simple approach)
job_embeddings = {}
for job_id in all_job_ids:
    if job_id in co_occurrence:
        # Get co-occurring jobs
        co_jobs = co_occurrence[job_id]
        if co_jobs:
            # Create weighted average of random embeddings
            weighted_sum = np.zeros(embedding_size)
            total_weight = 0
            
            for co_job, weight in co_jobs.items():
                if co_job in random_embeddings:
                    weighted_sum += weight * random_embeddings[co_job]
                    total_weight += weight
            
            if total_weight > 0:
                job_embeddings[job_id] = weighted_sum / total_weight
            else:
                job_embeddings[job_id] = random_embeddings[job_id]
        else:
            job_embeddings[job_id] = random_embeddings[job_id]
    else:
        job_embeddings[job_id] = random_embeddings[job_id]

# Function to create session vector remains the same
def create_session_vector(job_ids, job_embeddings, embedding_size=50):
    """Create a session vector by averaging job embeddings with recency weighting"""
    if not job_ids:
        return np.zeros(embedding_size)
    
    # Apply recency weighting - more recent jobs have higher weight
    weights = np.linspace(0.5, 1.0, len(job_ids))
    
    vectors = []
    for i, job_id in enumerate(job_ids):
        if job_id in job_embeddings:
            weighted_vector = job_embeddings[job_id] * weights[i]
            vectors.append(weighted_vector)
    
    if vectors:
        return np.mean(vectors, axis=0)
    return np.zeros(embedding_size)

# Apply to sessions using the same create_session_vector function
X_train['session_vector'] = X_train['job_ids'].apply(
    lambda x: create_session_vector(x, job_embeddings, embedding_size)
)


In [None]:
class MarkovJobRecommender:
    def __init__(self, transitions, job_embeddings, embedding_size=50):
        self.transitions = transitions
        self.job_embeddings = job_embeddings
        self.embedding_size = embedding_size
        
        # Create a similarity matrix between all jobs based on embeddings
        self.similarity_matrix = {}
        job_ids = list(job_embeddings.keys())
        
        for i, job1 in enumerate(job_ids):
            self.similarity_matrix[job1] = {}
            for job2 in job_ids:
                if job1 != job2:
                    # Cosine similarity between job embeddings
                    similarity = np.dot(job_embeddings[job1], job_embeddings[job2]) / (
                        np.linalg.norm(job_embeddings[job1]) * np.linalg.norm(job_embeddings[job2])
                    )
                    self.similarity_matrix[job1][job2] = similarity
    
    def recommend_next_jobs(self, job_sequence, top_n=10):
        """Recommend next jobs based on Markov transitions and embedding similarity"""
        if not job_sequence:
            return []
        
        # Get the last job in the sequence
        last_job = job_sequence[-1]
        
        # Get transition probabilities from the last job
        transition_probs = self.transitions.get(last_job, {})
        
        # Get similar jobs based on embeddings
        similar_jobs = self.similarity_matrix.get(last_job, {})
        
        # Combine transition probabilities and similarities
        scores = {}
        
        # Add scores from transitions (with higher weight)
        for job_id, prob in transition_probs.items():
            if job_id not in job_sequence:  # Avoid recommending already seen jobs
                scores[job_id] = 0.7 * prob
        
        # Add scores from similarities (with lower weight)
        for job_id, sim in similar_jobs.items():
            if job_id not in job_sequence:  # Avoid recommending already seen jobs
                scores[job_id] = scores.get(job_id, 0) + 0.3 * sim
        
        # Sort by score and get top_n
        recommended_jobs = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:top_n]
        return [job_id for job_id, _ in recommended_jobs]

# Initialize the recommender
markov_recommender = MarkovJobRecommender(transitions, job_embeddings, embedding_size)


In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# Prepare data for LSTM
def prepare_sequence_data(X_train, y_train, job_embeddings, max_seq_length=20):
    """Prepare sequence data for LSTM model"""
    # Get all unique job IDs
    all_job_ids = list(set(job_id for seq in X_train['job_ids'] for job_id in seq))
    all_job_ids.sort()  # Ensure consistent ordering
    
    # Create job ID to index mapping
    job_to_idx = {job_id: idx + 1 for idx, job_id in enumerate(all_job_ids)}  # +1 for padding (0)
    idx_to_job = {idx + 1: job_id for idx, job_id in enumerate(all_job_ids)}
    
    # Convert job sequences to index sequences
    X_sequences = [
        [job_to_idx[job_id] for job_id in seq[-max_seq_length:]]  # Use only the last max_seq_length jobs
        for seq in X_train['job_ids']
    ]
    
    # Pad sequences
    X_padded = pad_sequences(X_sequences, maxlen=max_seq_length, padding='pre')
    
    # Convert target job IDs to indices
    y_indices = [job_to_idx.get(job_id, 0) for job_id in y_train['target_job_id']]
    
    # Convert target actions to binary (1 for 'apply', 0 for 'view')
    y_actions = (y_train['action'] == 'apply').astype(int).values
    
    return X_padded, y_indices, y_actions, job_to_idx, idx_to_job, len(all_job_ids) + 1

# Prepare data
X_padded, y_indices, y_actions, job_to_idx, idx_to_job, num_jobs = prepare_sequence_data(
    X_train, y_train, job_embeddings
)

# Create embedding matrix from pre-trained job embeddings
embedding_matrix = np.zeros((num_jobs, embedding_size))
for job_id, idx in job_to_idx.items():
    if job_id in job_embeddings:
        embedding_matrix[idx] = job_embeddings[job_id]

# Build LSTM model for job prediction
job_model = Sequential([
    Embedding(
        input_dim=num_jobs,
        output_dim=embedding_size,
        weights=[embedding_matrix],
        input_length=max_seq_length,
        trainable=False
    ),
    LSTM(128, return_sequences=True),
    Dropout(0.3),
    LSTM(64),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dense(num_jobs, activation='softmax')
])

job_model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

# Build model for action prediction
action_model = Sequential([
    Embedding(
        input_dim=num_jobs,
        output_dim=embedding_size,
        weights=[embedding_matrix],
        input_length=max_seq_length,
        trainable=False
    ),
    LSTM(64, return_sequences=True),
    Dropout(0.3),
    LSTM(32),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

action_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

# Train models
job_history = job_model.fit(
    X_padded, y_indices,
    epochs=10,
    batch_size=64,
    validation_split=0.2
)

action_history = action_model.fit(
    X_padded, y_actions,
    epochs=10,
    batch_size=64,
    validation_split=0.2
)


In [None]:
from transformers import TFBertModel, BertTokenizer
import tensorflow as tf

# Function to prepare job descriptions for BERT
def prepare_job_descriptions(job_ids, processed_emploi):
    """Get job descriptions for a sequence of job IDs"""
    descriptions = []
    for job_id in job_ids:
        if job_id in processed_emploi.index:
            title = processed_emploi.loc[job_id, 'job_title']
            summary = processed_emploi.loc[job_id, 'summary']
            descriptions.append(f"{title} {summary}")
        else:
            descriptions.append("")
    return descriptions

# Initialize BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Function to prepare BERT inputs
def prepare_bert_input(job_ids, processed_emploi, max_length=512):
    """Prepare BERT inputs for a job sequence"""
    # Get descriptions for the last 5 jobs (to fit within BERT's context window)
    if len(job_ids) > 5:
        job_ids = job_ids[-5:]
    
    descriptions = prepare_job_descriptions(job_ids, processed_emploi)
    combined_text = " [SEP] ".join(descriptions)
    
    # Tokenize
    inputs = tokenizer(
        combined_text,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='tf'
    )
    
    return inputs
def build_bert_model(num_jobs):
    """Build a BERT-based model for job and action prediction"""
    # BERT base model
    bert = TFBertModel.from_pretrained('bert-base-uncased')
    
    # Inputs
    input_ids = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='input_ids')
    attention_mask = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='attention_mask')
    
    # BERT outputs
    bert_outputs = bert(input_ids, attention_mask=attention_mask)
    sequence_output = bert_outputs[0]
    pooled_output = bert_outputs[1]
    
    # Common layers
    x = tf.keras.layers.Dense(256, activation='relu')(pooled_output)
    x = tf.keras.layers.Dropout(0.3)(x)
    
    # Job prediction branch
    job_output = tf.keras.layers.Dense(128, activation='relu')(x)
    job_output = tf.keras.layers.Dropout(0.2)(job_output)
    job_output = tf.keras.layers.Dense(num_jobs, activation='softmax', name='job_prediction')(job_output)
    
    # Action prediction branch
    action_output = tf.keras.layers.Dense(64, activation='relu')(x)
    action_output = tf.keras.layers.Dropout(0.2)(action_output)
    action_output = tf.keras.layers.Dense(1, activation='sigmoid', name='action_prediction')(action_output)
    
    # Create model
    model = tf.keras.Model(
        inputs=[input_ids, attention_mask],
        outputs=[job_output, action_output]
    )
    
    # Compile model
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
        loss={
            'job_prediction': 'sparse_categorical_crossentropy',
            'action_prediction': 'binary_crossentropy'
        },
        metrics={
            'job_prediction': 'accuracy',
            'action_prediction': 'accuracy'
        }
    )
    
    return model

# Create BERT model
bert_model = build_bert_model(num_jobs)


In [None]:
class JobRecommendationEnsemble:
    def __init__(self, markov_recommender, lstm_model, bert_model, job_to_idx, idx_to_job, processed_emploi):
        self.markov_recommender = markov_recommender
        self.lstm_model = lstm_model
        self.bert_model = bert_model
        self.job_to_idx = job_to_idx
        self.idx_to_job = idx_to_job
        self.processed_emploi = processed_emploi
        self.max_seq_length = 20  # Same as used in LSTM
    
    def predict_next_jobs(self, job_sequence, top_n=10):
        """Predict next jobs using ensemble of models"""
        # 1. Markov predictions
        markov_predictions = self.markov_recommender.recommend_next_jobs(job_sequence, top_n=top_n)
        
        # 2. LSTM predictions
        # Convert job sequence to indices and pad
        lstm_input = [self.job_to_idx.get(job_id, 0) for job_id in job_sequence[-self.max_seq_length:]]
        lstm_input = pad_sequences([lstm_input], maxlen=self.max_seq_length, padding='pre')
        
        # Get LSTM predictions
        lstm_probs = self.lstm_model.predict(lstm_input)[0]
        lstm_top_indices = np.argsort(lstm_probs)[-top_n*2:][::-1]  # Get more candidates
        lstm_predictions = [self.idx_to_job.get(idx, 0) for idx in lstm_top_indices 
                           if idx in self.idx_to_job and self.idx_to_job[idx] not in job_sequence][:top_n]
        
        # 3. BERT predictions
        # Prepare BERT input
        bert_inputs = prepare_bert_input(job_sequence, self.processed_emploi)
        
        # Get BERT predictions
        bert_outputs = self.bert_model.predict({
            'input_ids': bert_inputs['input_ids'],
            'attention_mask': bert_inputs['attention_mask']
        })
        
        bert_probs = bert_outputs[0][0]  # Job prediction probabilities
        bert_top_indices = np.argsort(bert_probs)[-top_n*2:][::-1]  # Get more candidates
        bert_predictions = [self.idx_to_job.get(idx, 0) for idx in bert_top_indices 
                           if idx in self.idx_to_job and self.idx_to_job[idx] not in job_sequence][:top_n]
        
        # 4. Ensemble predictions with weighted voting
        # Assign weights to each model
        weights = {
            'markov': 0.3,
            'lstm': 0.3,
            'bert': 0.4
        }
        
        # Combine predictions with weights
        job_scores = {}
        
        # Add Markov scores
        for i, job_id in enumerate(markov_predictions):
            score = weights['markov'] * (1.0 - i/len(markov_predictions))
            job_scores[job_id] = job_scores.get(job_id, 0) + score
        
        # Add LSTM scores
        for i, job_id in enumerate(lstm_predictions):
            score = weights['lstm'] * (1.0 - i/len(lstm_predictions))
            job_scores[job_id] = job_scores.get(job_id, 0) + score
        
        # Add BERT scores
        for i, job_id in enumerate(bert_predictions):
            score = weights['bert'] * (1.0 - i/len(bert_predictions))
            job_scores[job_id] = job_scores.get(job_id, 0) + score
        
        # Sort by score and get top_n
        final_predictions = sorted(job_scores.items(), key=lambda x: x[1], reverse=True)[:top_n]
        final_job_ids = [job_id for job_id, _ in final_predictions]
        
        # If we don't have enough predictions, fill with Markov and LSTM predictions
        if len(final_job_ids) < top_n:
            remaining = top_n - len(final_job_ids)
            additional_jobs = []
            
            for job_id in markov_predictions + lstm_predictions:
                if job_id not in final_job_ids and job_id not in additional_jobs:
                    additional_jobs.append(job_id)
                    if len(additional_jobs) >= remaining:
                        break
            
            final_job_ids.extend(additional_jobs)
        
        return final_job_ids[:top_n]
    
    def predict_action(self, job_sequence):
        """Predict next action (apply or view)"""
        # Prepare BERT input
        bert_inputs = prepare_bert_input(job_sequence, self.processed_emploi)
        
        # Get BERT action prediction
        bert_outputs = self.bert_model.predict({
            'input_ids': bert_inputs['input_ids'],
            'attention_mask': bert_inputs['attention_mask']
        })
        
        bert_action_prob = bert_outputs[1][0][0]  # Action prediction probability
        
        # Convert LSTM input
        lstm_input = [self.job_to_idx.get(job_id, 0) for job_id in job_sequence[-self.max_seq_length:]]
        lstm_input = pad_sequences([lstm_input], maxlen=self.max_seq_length, padding='pre')
        
        # Get LSTM action prediction
        lstm_action_prob = action_model.predict(lstm_input)[0][0]
        
        # Weighted average of predictions
        final_action_prob = 0.6 * bert_action_prob + 0.4 * lstm_action_prob
        
        # Return 'apply' if probability > 0.5, else 'view'
        return 'apply' if final_action_prob > 0.5 else 'view'

# Initialize ensemble
ensemble = JobRecommendationEnsemble(
    markov_recommender,
    job_model,
    bert_model,
    job_to_idx,
    idx_to_job,
    processed_emploi
)


In [None]:
def calculate_mrr(true_item, predicted_items):
    """Calculate Mean Reciprocal Rank"""
    if true_item in predicted_items:
        rank = predicted_items.index(true_item) + 1
        return 1.0 / rank
    return 0.0

def evaluate_model(X_test, y_test, ensemble):
    """Evaluate model performance"""
    mrr_scores = []
    action_predictions = []
    true_actions = []
    
    for i, (x_row, y_row) in enumerate(zip(X_test.iterrows(), y_test.iterrows())):
        _, x_data = x_row
        _, y_data = y_row
        
        job_sequence = x_data['job_ids']
        true_job = y_data['target_job_id']
        true_action = y_data['action']
        
        # Get predictions
        predicted_jobs = ensemble.predict_next_jobs(job_sequence, top_n=10)
        predicted_action = ensemble.predict_action(job_sequence)
        
        # Calculate MRR
        mrr = calculate_mrr(true_job, predicted_jobs)
        mrr_scores.append(mrr)
        
        # Record action predictions
        action_predictions.append(1 if predicted_action == 'apply' else 0)
        true_actions.append(1 if true_action == 'apply' else 0)
    
    # Calculate metrics
    avg_mrr = np.mean(mrr_scores)
    action_accuracy = np.mean(np.array(action_predictions) == np.array(true_actions))
    
    # Calculate final score (70% MRR, 30% action accuracy)
    final_score = 0.7 * avg_mrr + 0.3 * action_accuracy
    
    return {
        'MRR': avg_mrr,
        'Action Accuracy': action_accuracy,
        'Final Score': final_score
    }

def prepare_submission(X_test, ensemble, output_file='submission.csv'):
    """Prepare submission file"""
    results = []
    
    for i, row in X_test.iterrows():
        session_id = row['session_id']
        job_sequence = row['job_ids']
        
        # Get predictions
        predicted_jobs = ensemble.predict_next_jobs(job_sequence, top_n=10)
        predicted_action = ensemble.predict_action(job_sequence)
        
        # Ensure we have exactly 10 predictions
        if len(predicted_jobs) < 10:
            # Fill with random jobs not in the sequence
            all_jobs = list(processed_emploi.index)
            random_jobs = [job for job in all_jobs if job not in job_sequence and job not in predicted_jobs]
            np.random.shuffle(random_jobs)
            predicted_jobs.extend(random_jobs[:10-len(predicted_jobs)])
        
        # Limit to 10 predictions
        predicted_jobs = predicted_jobs[:10]
        
        # Add to results
        results.append({
            'session_id': session_id,
            'job_1': predicted_jobs[0] if len(predicted_jobs) > 0 else None,
            'job_2': predicted_jobs[1] if len(predicted_jobs) > 1 else None,
            'job_3': predicted_jobs[2] if len(predicted_jobs) > 2 else None,
            'job_4': predicted_jobs[3] if len(predicted_jobs) > 3 else None,
            'job_5': predicted_jobs[4] if len(predicted_jobs) > 4 else None,
            'job_6': predicted_jobs[5] if len(predicted_jobs) > 5 else None,
            'job_7': predicted_jobs[6] if len(predicted_jobs) > 6 else None,
            'job_8': predicted_jobs[7] if len(predicted_jobs) > 7 else None,
            'job_9': predicted_jobs[8] if len(predicted_jobs) > 8 else None,
            'job_10': predicted_jobs[9] if len(predicted_jobs) > 9 else None,
            'applies_for': 1 if predicted_action == 'apply' else 0
        })
    
    # Create submission DataFrame
    submission_df = pd.DataFrame(results)
    
    # Save to CSV
    submission_df.to_csv(output_file, index=False)
    print(f"Submission saved to {output_file}")
    
    return submission_df

# Evaluate on validation set
X_test_processed = convert_string_to_list(x_test, 'job_ids')
X_test_processed = convert_string_to_list(X_test_processed, 'actions')

# Add the same features as training data
X_test_processed['session_length'] = X_test_processed['job_ids'].apply(len)
X_test_processed['unique_jobs_ratio'] = X_test_processed['job_ids'].apply(lambda x: len(set(x))/len(x) if len(x) > 0 else 0)
X_test_processed['apply_ratio'] = X_test_processed['actions'].apply(lambda x: x.count('apply')/len(x) if len(x) > 0 else 0)

# Create submission
submission = prepare_submission(X_test_processed, ensemble, 'ensemble_submission.csv')


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Visualize job embeddings using t-SNE
from sklearn.manifold import TSNE

# Get embeddings for visualization
job_ids = list(job_embeddings.keys())
embedding_vectors = np.array([job_embeddings[job_id] for job_id in job_ids])

# Apply t-SNE
tsne = TSNE(n_components=2, random_state=42)
job_tsne = tsne.fit_transform(embedding_vectors)

# Create DataFrame for plotting
tsne_df = pd.DataFrame({
    'job_id': job_ids,
    'x': job_tsne[:, 0],
    'y': job_tsne[:, 1]
})

# Add job titles
tsne_df['job_title'] = tsne_df['job_id'].apply(
    lambda x: processed_emploi.loc[x, 'job_title'] if x in processed_emploi.index else ''
)

# Plot
plt.figure(figsize=(12, 10))
sns.scatterplot(data=tsne_df, x='x', y='y', alpha=0.7)

# Add labels for some points
for i, row in tsne_df.sample(20).iterrows():
    plt.text(row['x'], row['y'], row['job_title'][:20], fontsize=8)

plt.title('t-SNE Visualization of Job Embeddings')
plt.xlabel('t-SNE dimension 1')
plt.ylabel('t-SNE dimension 2')
plt.show()

# Visualize session lengths
plt.figure(figsize
