# JibJob Recommender System - Exploratory Data Analysis

This notebook explores the data used for the JibJob recommendation system, including user profiles, job listings, and their interactions.

In [None]:
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from pathlib import Path

# Add the parent of the current working directory to the import path so the
# project package imported below can be found.
# The membership check keeps this cell idempotent: re-running it in the same
# kernel no longer appends another duplicate entry to sys.path each time.
project_root = str(Path().absolute().parent)
if project_root not in sys.path:
    sys.path.append(project_root)

# Import project modules
from jibjob_recommender_system.data_handling.data_loader import DataLoader
from jibjob_recommender_system.config.config_loader import ConfigLoader

# Plot styling: apply matplotlib's 'ggplot' sheet, then seaborn's 'whitegrid'
plt.style.use('ggplot')
sns.set_style('whitegrid')

# pandas display limits: no cap on columns, at most 50 rows
display_options = {'display.max_columns': None, 'display.max_rows': 50}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

# Build the path to settings.yaml under the package's config folder and load it
config_path_parts = ('jibjob_recommender_system', 'config', 'settings.yaml')
config_path = os.path.join(project_root, *config_path_parts)
config = ConfigLoader.load_config(config_path)

## 1. Loading and Examining Data

First, let's load the data and examine its basic properties.

In [None]:
# Data is read from <project_root>/sample_data - adjust if your data lives elsewhere
data_dir = os.path.join(project_root, 'sample_data')

# Build the loader from the project configuration and load everything in one call
data_loader = DataLoader(config)
data_dict = data_loader.load_data(data_dir)

# Pull out the individual tables. A table is None when .get() finds no entry
# for its key (assumes load_data returns a dict-like object - TODO confirm).
users_df, jobs_df, job_applications_df, categories_df = (
    data_dict.get(table_name)
    for table_name in ('users', 'jobs', 'job_applications', 'categories')
)

# Report the shape of each table, or a placeholder when it is missing
loaded_tables = (
    ('Users', users_df),
    ('Jobs', jobs_df),
    ('Job applications', job_applications_df),
    ('Categories', categories_df),
)
for table_label, table_frame in loaded_tables:
    shape_text = table_frame.shape if table_frame is not None else 'Not available'
    print(f"{table_label} data shape: {shape_text}")

### 1.1 Exploring User Data

In [None]:
# Preview the user table, then chart user types and the categories that
# professionals list.
if users_df is not None:
    print("Sample of user data:")
    display(users_df.head())

    # Guard the column lookup (the summary cell at the end of the notebook does
    # the same) so a table without 'user_type' yields a message, not a KeyError.
    if 'user_type' in users_df.columns:
        # Distribution of user types
        user_type_counts = users_df['user_type'].value_counts()
        plt.figure(figsize=(10, 6))
        sns.barplot(x=user_type_counts.index, y=user_type_counts.values)
        plt.title('Distribution of User Types')
        plt.ylabel('Count')
        plt.xlabel('User Type')
        plt.show()

        # Extract professionals for further analysis
        professionals = users_df[users_df['user_type'] == 'professional']

        # Analyze categories distribution for professionals
        if 'categories' in professionals.columns:
            # Flatten the per-user category lists; cells that are not lists
            # (for example missing values) are skipped.
            all_categories = [
                category
                for categories in professionals['categories']
                if isinstance(categories, list)
                for category in categories
            ]

            # Only draw the chart when there is something to count
            if all_categories:
                category_counts = pd.Series(all_categories).value_counts().head(10)
                plt.figure(figsize=(12, 8))
                sns.barplot(x=category_counts.values, y=category_counts.index)
                plt.title('Top 10 Categories Among Professionals')
                plt.xlabel('Count')
                plt.show()
            else:
                print("No category lists found for professionals")
    else:
        print("Column 'user_type' not available in user data")
else:
    print("User data not available")

### 1.2 Exploring Job Data

In [None]:
# Preview the job table, then chart the most common job categories and the
# employers with the most postings.
if jobs_df is not None:
    print("Sample of job data:")
    display(jobs_df.head())

    # Analyze categories distribution for jobs
    if 'categories' in jobs_df.columns:
        # Flatten the per-job category lists; cells that are not lists
        # (for example missing values) are skipped.
        all_job_categories = [
            category
            for categories in jobs_df['categories']
            if isinstance(categories, list)
            for category in categories
        ]

        # Only draw the chart when there is something to count
        if all_job_categories:
            job_category_counts = pd.Series(all_job_categories).value_counts().head(10)
            plt.figure(figsize=(12, 8))
            sns.barplot(x=job_category_counts.values, y=job_category_counts.index)
            plt.title('Top 10 Categories Among Jobs')
            plt.xlabel('Count')
            plt.show()
        else:
            print("No category lists found in job data")

    # Analyze job distribution by employer
    if 'employer_id' in jobs_df.columns:
        employer_job_counts = jobs_df['employer_id'].value_counts().head(10)
        plt.figure(figsize=(12, 6))
        sns.barplot(x=employer_job_counts.index, y=employer_job_counts.values)
        plt.title('Number of Jobs Posted by Top 10 Employers')
        plt.ylabel('Number of Jobs')
        plt.xlabel('Employer ID')
        plt.xticks(rotation=45)
        plt.show()
else:
    print("Job data not available")

### 1.3 Exploring Job Applications (Interactions)

In [None]:
# Preview the applications table, then chart activity per user and per job
if job_applications_df is None:
    print("Job applications data not available")
else:
    print("Sample of job applications data:")
    display(job_applications_df.head())

    # Number of applications submitted by each user
    user_application_counts = job_applications_df['user_id'].value_counts()

    # Histogram of those per-user counts
    plt.figure(figsize=(12, 6))
    per_user_ax = sns.histplot(user_application_counts, bins=30, kde=True)
    per_user_ax.set_title('Distribution of Number of Applications per User')
    per_user_ax.set_xlabel('Number of Applications')
    per_user_ax.set_ylabel('Count of Users')
    plt.show()

    # Number of applications received by each job; value_counts() sorts the
    # result in descending order, so the first 20 rows are the top 20 jobs
    job_application_counts = job_applications_df['job_id'].value_counts()
    top_jobs = job_application_counts.head(20)

    plt.figure(figsize=(14, 8))
    per_job_ax = sns.barplot(x=top_jobs.index, y=top_jobs.values)
    per_job_ax.set_title('Top 20 Jobs by Application Count')
    per_job_ax.set_ylabel('Number of Applications')
    per_job_ax.set_xlabel('Job ID')
    plt.xticks(rotation=90)
    plt.tight_layout()
    plt.show()

## 2. Geographical Distribution Analysis

In [None]:
# Plot professionals (blue) and, when coordinates are available, jobs (red)
# on one OpenStreetMap-based map.
# 'user_type' is required alongside the coordinates because it is used to
# select the professionals; checking it here avoids a KeyError further down.
required_user_columns = {'latitude', 'longitude', 'user_type'}
if users_df is not None and required_user_columns.issubset(users_df.columns):
    professionals = users_df[users_df['user_type'] == 'professional']

    # Create map for professionals
    fig = px.scatter_mapbox(
        professionals,
        lat='latitude',
        lon='longitude',
        hover_name='user_id',
        color_discrete_sequence=["blue"],
        zoom=3,
        title='Geographical Distribution of Professionals'
    )
    # Give the trace a name and opt it into the legend so the two point
    # layers can be told apart; without this, naming the job trace below
    # has no visible effect.
    fig.update_traces(name='Professionals', showlegend=True)

    # Add jobs to the map if available
    if jobs_df is not None and {'latitude', 'longitude'}.issubset(jobs_df.columns):
        # Build a throwaway figure only to obtain its single trace
        job_layer = px.scatter_mapbox(
            jobs_df,
            lat='latitude',
            lon='longitude',
            hover_name='job_id',
            color_discrete_sequence=["red"],
        ).data[0]
        job_layer.name = 'Jobs'
        job_layer.showlegend = True
        fig.add_trace(job_layer)

    fig.update_layout(
        mapbox_style="open-street-map",
        margin={"r": 0, "t": 30, "l": 0, "b": 0},
        height=600,
    )
    fig.show()
else:
    print("Location data not available for mapping")

## 3. Category and Skills Analysis

In [None]:
# Compare the set of categories that appear on users with the set that
# appears on jobs, and report how much they overlap.
if users_df is not None and jobs_df is not None and \
   'categories' in users_df.columns and 'categories' in jobs_df.columns:

    # All unique categories from users (cells that are not lists are skipped)
    user_categories = {
        category
        for cat_list in users_df['categories']
        if isinstance(cat_list, list)
        for category in cat_list
    }

    # All unique categories from jobs (cells that are not lists are skipped)
    job_categories = {
        category
        for cat_list in jobs_df['categories']
        if isinstance(cat_list, list)
        for category in cat_list
    }

    # Find overlap and unique categories
    common_categories = user_categories & job_categories
    user_only_categories = user_categories - job_categories
    job_only_categories = job_categories - user_categories

    # Region sizes in the order: shared, users only, jobs only.
    # Not consumed in this cell; kept so a Venn-style plot can be added later.
    venn_data = [
        len(common_categories),
        len(user_only_categories),
        len(job_only_categories)
    ]

    # Print category overlap statistics
    print(f"Total unique categories across users and jobs: {len(user_categories | job_categories)}")
    print(f"Categories shared between users and jobs: {len(common_categories)}")
    print(f"Categories unique to users: {len(user_only_categories)}")
    print(f"Categories unique to jobs: {len(job_only_categories)}")

    # List up to 10 shared categories. Sets have no stable order (string
    # hashing is randomized per process), so sort first to print the same
    # sample on every run; key=str keeps mixed-type labels sortable.
    print("\nSome common categories:")
    for cat in sorted(common_categories, key=str)[:10]:
        print(f"- {cat}")
else:
    print("Categories data not available for analysis")

## 4. Generating Sample Embeddings for Text Data

In [None]:
# Sample code for generating embeddings
from sentence_transformers import SentenceTransformer

def generate_text_embeddings(texts, model_name='paraphrase-MiniLM-L6-v2'):
    """Encode texts with a SentenceTransformer model.

    Args:
        texts: The texts to embed; passed unchanged to the model's ``encode``.
        model_name: Name handed to ``SentenceTransformer``. Defaults to
            'paraphrase-MiniLM-L6-v2'.

    Returns:
        Whatever ``encode`` returns for ``texts``.

    Note:
        A new model object is constructed on every call.
    """
    return SentenceTransformer(model_name).encode(texts)

# Demo: embed a handful of job titles and scatter-plot two embedding coordinates
if jobs_df is None or 'title' not in jobs_df.columns:
    print("Job title data not available for embedding generation")
else:
    # Keep the demonstration small: the first five jobs only
    sample_jobs = jobs_df.head(5)
    sample_titles = sample_jobs['title'].tolist()

    # Any failure in embedding or plotting is reported instead of raised
    try:
        embeddings = generate_text_embeddings(sample_titles)
        print(f"Generated embeddings with shape: {embeddings.shape}")

        # Only coordinates 0 and 1 of each embedding vector are drawn
        first_coords = embeddings[:, 0]
        second_coords = embeddings[:, 1]
        plt.figure(figsize=(10, 8))
        plt.scatter(first_coords, second_coords)

        # Label every point with the title it was computed from
        for point_title, x_pos, y_pos in zip(sample_titles, first_coords, second_coords):
            plt.annotate(point_title, (x_pos, y_pos))

        plt.title('2D Visualization of Job Title Embeddings')
        plt.xlabel('Dimension 1')
        plt.ylabel('Dimension 2')
        plt.grid(True)
        plt.show()
    except Exception as e:
        print(f"Could not generate embeddings: {e}")

## 5. Analyzing Data for Recommendation System

In [None]:
# Analyze interactions for recommendation patterns: which job categories
# attract applications, how active users are, and how focused each user's
# applications are on the same categories.
if job_applications_df is not None and jobs_df is not None and 'categories' in jobs_df.columns:
    # Left join keeps every application, even when its job_id has no match in jobs_df
    merged_data = job_applications_df.merge(jobs_df, on='job_id', how='left')

    # Categories of every applied-to job. Reading the column directly avoids
    # building a Series per row as iterrows() does. Cells that are not lists
    # (for example NaN from unmatched jobs) are skipped, and .get() falls back
    # to an empty list if the merge produced no plain 'categories' column.
    applied_categories = [
        category
        for categories in merged_data.get('categories', [])
        if isinstance(categories, list)
        for category in categories
    ]

    # Count category frequencies and chart the top 15 (skipped when empty)
    if applied_categories:
        applied_category_counts = pd.Series(applied_categories).value_counts().head(15)

        plt.figure(figsize=(14, 8))
        sns.barplot(x=applied_category_counts.index, y=applied_category_counts.values)
        plt.title('Top 15 Categories in Job Applications')
        plt.ylabel('Application Count')
        plt.xlabel('Category')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()
    else:
        print("No category lists found for applied jobs")

    # Analyze user application patterns
    user_job_counts = merged_data.groupby('user_id').size().reset_index(name='application_count')

    # Plot user application frequency distribution
    plt.figure(figsize=(12, 6))
    sns.histplot(user_job_counts['application_count'], bins=30, kde=True)
    plt.title('Distribution of Applications per User')
    plt.xlabel('Number of Applications')
    plt.ylabel('Count of Users')
    plt.tight_layout()
    plt.show()

    # Category consistency per user: for each category the user touched, the
    # fraction of that user's applications containing it, averaged over those
    # categories. 1.0 means every application carried the same categories.
    user_category_consistency = {}

    for user_id, group in merged_data.groupby('user_id'):
        if len(group) < 2:  # Skip users with fewer than 2 applications
            continue

        # One set per application, so a category repeated inside a single
        # job's list is counted once and the score cannot exceed 1.0.
        categories_per_application = [
            set(categories)
            for categories in group.get('categories', [])
            if isinstance(categories, list)
        ]
        distinct_categories = set().union(*categories_per_application)

        if distinct_categories:
            total_mentions = sum(len(cats) for cats in categories_per_application)
            user_category_consistency[user_id] = total_mentions / (
                len(distinct_categories) * len(group)
            )

    # Plot distribution of consistency scores (skipped when no user qualified)
    if user_category_consistency:
        plt.figure(figsize=(12, 6))
        sns.histplot(list(user_category_consistency.values()), bins=20, kde=True)
        plt.title('Distribution of Category Consistency Across User Applications')
        plt.xlabel('Consistency Score (1.0 = perfectly consistent)')
        plt.ylabel('Count of Users')
        plt.show()
    else:
        print("No users with at least 2 categorized applications")
else:
    print("Job application and category data not available for analysis")

## 6. Dataset Statistics Summary

In [None]:
# Collect headline counts for every table that was loaded, print them, and
# show them in a single bar chart.
summary = {}

# Users: total, plus a per-type breakdown when the type column exists
if users_df is not None:
    summary['total_users'] = len(users_df)
    if 'user_type' in users_df.columns:
        for user_type_label in ('professional', 'employer'):
            matching_users = users_df[users_df['user_type'] == user_type_label]
            summary[f'{user_type_label}_count'] = len(matching_users)

# Jobs: total, plus the number of distinct employers that posted them
if jobs_df is not None:
    summary['total_jobs'] = len(jobs_df)
    if 'employer_id' in jobs_df.columns:
        summary['unique_employers_with_jobs'] = jobs_df['employer_id'].nunique()

# Applications: total, distinct applicants, distinct jobs applied to
if job_applications_df is not None:
    summary['total_applications'] = len(job_applications_df)
    for summary_key, id_column in (
        ('users_with_applications', 'user_id'),
        ('jobs_with_applications', 'job_id'),
    ):
        summary[summary_key] = job_applications_df[id_column].nunique()

# Categories: unique labels on each side; cells that are not lists are ignored
category_sets = {}
if users_df is not None and 'categories' in users_df.columns:
    user_cats = {
        cat
        for cats in users_df['categories']
        if isinstance(cats, list)
        for cat in cats
    }
    category_sets['user_categories'] = user_cats

if jobs_df is not None and 'categories' in jobs_df.columns:
    job_cats = {
        cat
        for cats in jobs_df['categories']
        if isinstance(cats, list)
        for cat in cats
    }
    category_sets['job_categories'] = job_cats

# Category counts are reported only when both sides are available
if all(side in category_sets for side in ('user_categories', 'job_categories')):
    user_side = category_sets['user_categories']
    job_side = category_sets['job_categories']
    summary['unique_user_categories'] = len(user_side)
    summary['unique_job_categories'] = len(job_side)
    summary['overlapping_categories'] = len(user_side.intersection(job_side))

# Print summary, one metric per line
print("\n===== Dataset Summary =====\n")
for metric_name, metric_value in summary.items():
    print(f"{metric_name}: {metric_value}")

# Two-column frame (Metric, Value) feeding the bar chart
summary_df = pd.DataFrame(list(summary.items()), columns=['Metric', 'Value'])
plt.figure(figsize=(12, 8))
summary_plot = sns.barplot(data=summary_df, x='Metric', y='Value')
plt.xticks(rotation=45, ha='right')
plt.title('Dataset Summary Statistics')
plt.tight_layout()
plt.show()

## 7. Conclusions and Next Steps

Based on the above analysis, we can draw the following conclusions and plan the next steps:

**Conclusions:**
* [This will be filled in after running the notebook with actual data]

**Next Steps:**
1. Feature engineering based on the insights from this analysis
2. Build a graph representation of users and jobs with appropriate edge weights
3. Train the graph convolutional network (GCN) model on the graph representation built in step 2
4. Evaluate the model using the metrics defined in the evaluation module
5. Fine-tune the recommendation approach based on evaluation results