# Dataset Exploration - Resume NER

This notebook explores the generated NER dataset for resume keyword extraction.

In [None]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from collections import Counter

# Notebook-wide plotting defaults: seaborn's 'whitegrid' style and a
# 12x6 default figure size for any figure that does not set its own.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

## Load Dataset

In [None]:
# Load the train / val / test splits from the raw-data directory.
data_dir = Path('../data/raw')


def load_split(name):
    """Read ``data_dir / '<name>.json'`` and return the parsed JSON content.

    Args:
        name: Split name without extension ('train', 'val' or 'test').

    Returns:
        Whatever ``json.load`` produces for that file.

    Raises:
        FileNotFoundError: If the split file does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Decode explicitly as UTF-8: without ``encoding`` open() falls back to the
    # platform locale (e.g. cp1252 on Windows), which can fail on or garble
    # non-ASCII characters in the text.
    with open(data_dir / f'{name}.json', 'r', encoding='utf-8') as f:
        return json.load(f)


train_data = load_split('train')
val_data = load_split('val')
test_data = load_split('test')

print(f"Train: {len(train_data)} sentences")
print(f"Val: {len(val_data)} sentences")
print(f"Test: {len(test_data)} sentences")

## Sample Sentences

In [None]:
# Preview the first five training items: the raw text, then every
# token/tag pair whose tag is not 'O'.
divider = '=' * 60
for sample_no, item in enumerate(train_data[:5], start=1):
    print(f"\n{divider}")
    print(f"Sample {sample_no}")
    print(divider)
    print(f"Text: {item['text']}")
    print("\nEntities:")
    # Tokens and tags are paired positionally.
    tagged = [(token, tag) for token, tag in zip(item['tokens'], item['tags']) if tag != 'O']
    for token, tag in tagged:
        print(f"  {token} -> {tag}")

## Label Distribution

In [None]:
# Flatten the per-item tag lists of the training set and tally each tag.
all_tags = [tag for item in train_data for tag in item['tags']]
tag_counts = Counter(all_tags)


def _bar_color(tag):
    """Return the bar colour for ``tag`` based on the entity name it contains."""
    if 'SKILL' in tag:
        return '#3498db'
    if 'DEGREE' in tag:
        return '#2ecc71'
    if 'EXPERIENCE' in tag:
        return '#e74c3c'
    return '#95a5a6'  # any tag that matches none of the names above


# One bar per distinct tag, in the order the tags were first counted.
tags = list(tag_counts)
counts = [tag_counts[tag] for tag in tags]
colors = [_bar_color(tag) for tag in tags]

label_font = {'fontsize': 12}
plt.figure(figsize=(10, 6))
plt.bar(tags, counts, color=colors, alpha=0.7)
plt.title('Tag Distribution in Training Set', fontsize=16, fontweight='bold')
plt.xlabel('Tag', **label_font)
plt.ylabel('Count', **label_font)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## Entity Statistics

In [None]:
# Entities are counted through their B- tags only; a tag that never
# occurred in the training set counts as 0.
entity_counts = {
    entity: tag_counts.get(f'B-{entity}', 0)
    for entity in ('SKILL', 'DEGREE', 'EXPERIENCE')
}

# Two-column summary table, one row per entity type.
df = pd.DataFrame(list(entity_counts.items()), columns=['Entity Type', 'Count'])

print(df)

# Pie chart of each entity type's share of the counted entities.
colors_pie = ['#3498db', '#2ecc71', '#e74c3c']
plt.figure(figsize=(8, 8))
plt.pie(
    df['Count'],
    labels=df['Entity Type'],
    autopct='%1.1f%%',
    colors=colors_pie,
    startangle=90,
)
plt.title('Entity Distribution', fontsize=16, fontweight='bold')
plt.show()

## Sentence Length Distribution

In [None]:
# Number of tokens in each training item.
sentence_lengths = [len(item['tokens']) for item in train_data]

# Histogram of the token counts.
label_font = {'fontsize': 12}
plt.figure(figsize=(10, 6))
plt.hist(sentence_lengths, bins=30, color='#3498db', alpha=0.7, edgecolor='black')
plt.title('Sentence Length Distribution', fontsize=16, fontweight='bold')
plt.xlabel('Number of Tokens', **label_font)
plt.ylabel('Frequency', **label_font)

# Mark the arithmetic mean with a dashed red vertical line.
mean_length = sum(sentence_lengths) / len(sentence_lengths)
plt.axvline(x=mean_length, color='red', linestyle='--', label=f'Mean: {mean_length:.1f}')
plt.legend()
plt.tight_layout()
plt.show()

# Summary statistics matching the plot.
print(f"Min length: {min(sentence_lengths)}")
print(f"Max length: {max(sentence_lengths)}")
print(f"Mean length: {mean_length:.2f}")

## Conclusion

Based on the training-split plots above, the dataset is well balanced, with good coverage of all three entity types (SKILL, DEGREE, and EXPERIENCE). It is ready for training!