# Data Exploration: Lookup Tables & Unannotated CVs

This notebook explores the **data sources** available for the zero-shot learning problem — two labeled lookup tables used for training and one unannotated CV file used for inference:
- `department-v2.csv` - Job title → department mappings (~10k examples)
- `seniority-v2.csv` - Job title → seniority mappings (~9k examples)
- `linkedin-cvs-not-annotated.json` - Unannotated LinkedIn CVs for inference

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from collections import Counter
import json

# Import our data loaders
import sys
sys.path.append('../')
from src.data.loader import load_label_lists, load_inference_dataset

# Setup plotting: one style sheet and one colour palette shared by every figure below
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
# Render figures directly in the notebook output
%matplotlib inline

# Paths
# Relative path: resolved against the kernel's working directory, not this file's location
DATA_DIR = Path('../data')

## 1. Load Lookup Tables (Training Data)

In [None]:
# Read both lookup tables (department, seniority) from the data directory
dept_df, sen_df = load_label_lists(DATA_DIR)

# Report the size of each table and their combined size
n_dept_rows = len(dept_df)
n_sen_rows = len(sen_df)
print(f"Department lookup: {n_dept_rows:,} examples")
print(f"Seniority lookup:  {n_sen_rows:,} examples")
print(f"\nTotal training examples: {n_dept_rows + n_sen_rows:,}")

## Tier 1 Improvements Verification

Verify that encoding fixes and deduplication are working correctly.

In [None]:
# Verify encoding fix - check for mojibake markers
print("Checking for encoding issues (Ã character = mojibake)...")
# regex=False: 'Ã' is matched as a literal character; na=False counts missing texts as clean
dept_mojibake = dept_df['text'].str.contains('Ã', na=False, regex=False).sum()
sen_mojibake = sen_df['text'].str.contains('Ã', na=False, regex=False).sum()
print(f"Department texts with mojibake: {dept_mojibake}")
print(f"Seniority texts with mojibake: {sen_mojibake}")

# Show some sample texts after encoding fix
print("\nSample German texts (should show ä, ö, ü correctly):")
has_german_chars = dept_df['text'].str.contains('ä|ö|ü|ß', na=False, regex=True)
german_samples = dept_df.loc[has_german_chars, 'text'].head(5)
for sample_text in german_samples:
    print(f"  - {sample_text}")

# Verify deduplication - row counts alone cannot reveal leftover duplicates,
# so also count the rows that repeat an earlier (text, label) pair.
dept_dupes = dept_df.duplicated(subset=['text', 'label']).sum()
sen_dupes = sen_df.duplicated(subset=['text', 'label']).sum()
print("\n--- After Deduplication ---")
print(f"Department examples: {len(dept_df)}")
print(f"Seniority examples: {len(sen_df)}")
print(f"Department exact duplicate rows remaining: {dept_dupes}")
print(f"Seniority exact duplicate rows remaining: {sen_dupes}")

print("\nLabel distribution after dedup:")
print("\nDepartment:")
print(dept_df['label'].value_counts())
print("\nSeniority:")
print(sen_df['label'].value_counts())

## 2. Department Labels Analysis

In [None]:
# Count examples per department label (most frequent first)
dept_counts = dept_df['label'].value_counts()
n_departments = dept_counts.size
print("Department Label Distribution:", dept_counts, sep="\n")
print(f"\nNumber of unique departments: {n_departments}")

In [None]:
# Bar chart of examples per department, drawn on an explicitly created Axes
fig, ax = plt.subplots(figsize=(12, 6))
dept_counts.plot.bar(ax=ax)
ax.set_title('Department Label Distribution (Lookup Table)', fontsize=14)
ax.set_xlabel('Department', fontsize=12)
ax.set_ylabel('Number of Examples', fontsize=12)
# Slant the category labels and right-align them under their bars
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()

In [None]:
# Show up to five sample titles for each of the five most frequent departments
print("\nExample job titles per department:\n")
top_departments = dept_counts.index[:5]
for department in top_departments:
    in_department = dept_df['label'] == department
    sample_titles = dept_df.loc[in_department, 'text'].iloc[:5]
    print(f"\n{department}:")
    for title in sample_titles:
        print(f"  - {title}")

## 3. Seniority Labels Analysis

In [None]:
# Count examples per seniority label (most frequent first)
sen_counts = sen_df['label'].value_counts()
n_seniority_levels = sen_counts.size
print("Seniority Label Distribution:", sen_counts, sep="\n")
print(f"\nNumber of unique seniority levels: {n_seniority_levels}")

In [None]:
# Bar chart of examples per seniority level, drawn on an explicitly created Axes
fig, ax = plt.subplots(figsize=(10, 6))
sen_counts.plot.bar(ax=ax, color='coral')
ax.set_title('Seniority Label Distribution (Lookup Table)', fontsize=14)
ax.set_xlabel('Seniority Level', fontsize=12)
ax.set_ylabel('Number of Examples', fontsize=12)
# Slant the category labels and right-align them under their bars
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()

In [None]:
# Show up to five sample titles for every seniority level
print("\nExample job titles per seniority level:\n")
for level in sen_counts.index:
    at_level = sen_df['label'] == level
    sample_titles = sen_df.loc[at_level, 'text'].iloc[:5]
    print(f"\n{level}:")
    for title in sample_titles:
        print(f"  - {title}")

## 4. Text Length Analysis

In [None]:
# Attach character counts and whitespace-separated word counts to both lookup
# tables. The columns are added in place because later cells read them.
for frame in (dept_df, sen_df):
    frame['text_length'] = frame['text'].str.len()
    frame['word_count'] = frame['text'].str.split().str.len()

# Print the per-table averages
for heading, frame in (("Department job titles:", dept_df),
                       ("\nSeniority job titles:", sen_df)):
    print(heading)
    print(f"  Avg length: {frame['text_length'].mean():.1f} chars")
    print(f"  Avg words:  {frame['word_count'].mean():.1f} words")

In [None]:
# Side-by-side histograms of words per title for the two lookup tables
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# (frame, panel title, bar colour); None keeps matplotlib's default colour
panels = [
    (dept_df, 'Department: Word Count Distribution', None),
    (sen_df, 'Seniority: Word Count Distribution', 'coral'),
]
for ax, (frame, panel_title, bar_color) in zip(axes, panels):
    ax.hist(frame['word_count'], bins=30, edgecolor='black', alpha=0.7, color=bar_color)
    ax.set_title(panel_title, fontsize=12)
    ax.set_xlabel('Number of Words')
    ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

## 5. Load Unannotated LinkedIn CVs

In [None]:
# Read the unannotated LinkedIn positions used for the inference demonstration
inference_df = load_inference_dataset(DATA_DIR)

n_positions = len(inference_df)
column_names = list(inference_df.columns)
print(f"Unannotated LinkedIn CVs: {n_positions:,} positions")
print(f"\nColumns: {column_names}")
print("\nFirst few examples:")
# Last expression of the cell, so the notebook renders it as a table
inference_df.head()

In [None]:
# Attach character and word counts for the CV titles (columns added in place
# because later cells read them), then summarise them
titles = inference_df['title']
inference_df['text_length'] = titles.str.len()
inference_df['word_count'] = titles.str.split().str.len()

char_lengths = inference_df['text_length']
word_counts = inference_df['word_count']
print("LinkedIn CV job titles:")
print(f"  Avg length: {char_lengths.mean():.1f} chars")
print(f"  Avg words:  {word_counts.mean():.1f} words")
print(f"  Min/Max words: {word_counts.min()} / {word_counts.max()}")

In [None]:
# Compare distributions: Lookup tables vs LinkedIn CVs
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# (column to compare, panel title, x-axis label)
panels = [
    ('word_count', 'Word Count Comparison', 'Number of Words'),
    ('text_length', 'Character Length Comparison', 'Number of Characters'),
]
for ax, (column, panel_title, x_label) in zip(axes, panels):
    lookup_values = dept_df[column].dropna()
    cv_values = inference_df[column].dropna()

    # Both series must share one set of bin edges: with bins=20 passed to each
    # hist() call separately, every series gets its own edges and the overlaid
    # bars cover different intervals.
    shared_bins = np.histogram_bin_edges(pd.concat([lookup_values, cv_values]), bins=20)

    # density=True normalises each histogram to unit area, so the shapes stay
    # comparable even though the two datasets have different numbers of rows.
    ax.hist(lookup_values, bins=shared_bins, density=True, alpha=0.5,
            label='Lookup (Dept)', edgecolor='black')
    ax.hist(cv_values, bins=shared_bins, density=True, alpha=0.5,
            label='LinkedIn CVs', edgecolor='black')
    ax.set_title(panel_title, fontsize=12)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Density')
    ax.legend()

plt.tight_layout()
plt.show()

## 6. Language Detection

In [None]:
# Simple language detection based on keywords
def detect_language(text):
    """Guess the language of a job title from a small keyword list.

    Args:
        text: Job title. Non-string values (e.g. ``None`` or the NaN pandas
            uses for a missing title) are accepted and treated as containing
            no keywords.

    Returns:
        ``'German'`` if any German keyword occurs as a substring of the
        lower-cased title, otherwise ``'French'`` if any French keyword
        occurs, otherwise ``'English/Other'``. German is checked first, so a
        title containing keywords from both lists is reported as German.
    """
    # A missing title may arrive as NaN (a float) or None when this function
    # is applied over a pandas column; .lower() would raise AttributeError.
    if not isinstance(text, str):
        return 'English/Other'

    text_lower = text.lower()

    # German indicators
    german_words = ('geschäftsführer', 'leiter', 'mitarbeiter', 'projektmanager',
                    'entwickler', 'berater', 'assistent', 'sachbearbeiter')

    # French indicators
    french_words = ('responsable', 'directeur', 'chef', 'chargé', 'gérant', 'adjoint')

    # Substring match, not whole-word match: 'leiter' also matches 'Abteilungsleiter'
    if any(word in text_lower for word in german_words):
        return 'German'
    if any(word in text_lower for word in french_words):
        return 'French'
    return 'English/Other'

# Tag every row of the three datasets with the heuristic language guess
for frame, text_column in ((dept_df, 'text'), (sen_df, 'text'), (inference_df, 'title')):
    frame['language'] = frame[text_column].apply(detect_language)

# Report the language counts per dataset
print("Language distribution in lookup tables:")
for heading, frame in (("\nDepartment:", dept_df),
                       ("\nSeniority:", sen_df),
                       ("\nLinkedIn CVs:", inference_df)):
    print(heading)
    print(frame['language'].value_counts())

## 7. Key Observations

### Distribution Mismatch
- Lookup tables have simpler, cleaner job titles
- LinkedIn CVs may have longer, more complex titles
- Need models that can generalize from lookup patterns to real-world CVs

### Label Imbalance
- Some departments/seniority levels have many more examples than others
- May need class weighting or stratified sampling

### Multilingual Challenge
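- The data contains German, French, and English job titles (the keyword heuristic in Section 6 groups English with any other language as `English/Other`)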
- Need multilingual models (e.g., multilingual-BERT, sentence-transformers)

### Zero-Shot Setting
- No labeled LinkedIn CV data for training
- Must transfer knowledge from lookup table patterns
- Challenge: How well do lookup table patterns generalize?

## 8. Summary Statistics

In [None]:
# Assemble one row per dataset: size, label cardinality, and mean words per title
frames = (dept_df, sen_df, inference_df)
summary = pd.DataFrame({
    'Dataset': ['Department Lookup', 'Seniority Lookup', 'LinkedIn CVs (Unannotated)'],
    'Examples': [len(frame) for frame in frames],
    # The CV set has no label column to count, hence the 'N/A' placeholder
    'Unique Labels': [dept_df['label'].nunique(), sen_df['label'].nunique(), 'N/A'],
    'Avg Words': [frame['word_count'].mean() for frame in frames],
    'Languages': ['Multi'] * 3,
})

# Print the table framed by horizontal rules
rule = "=" * 60
print("\n" + rule)
print("DATA SUMMARY")
print(rule)
print(summary.to_string(index=False))
print(rule)