# 📊 Job-Resume Matching Data Visualization
## Phân tích và trực quan hóa dữ liệu từ file CSV

**Dataset**: `jd_cr_similarity.csv`  
**Mục đích**: Phân tích dữ liệu matching giữa Job Descriptions và Candidate Resumes

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

# Plot styling: start from matplotlib's built-in 'default' style sheet, then
# switch seaborn's colour cycle to the "husl" palette.
plt.style.use('default')
sns.set_palette("husl")

# pandas display options: max_columns=None prints every column; width=None
# leaves the output width for pandas to work out instead of fixing it.
for display_option in ('display.max_columns', 'display.width'):
    pd.set_option(display_option, None)

print("📚 Libraries imported successfully!")

In [None]:
# Read the similarity CSV into `df`, the DataFrame every later cell works on.
# The path is relative to the notebook's working directory.
print("🔄 Loading data...")
df = pd.read_csv('../csv/jd_cr_similarity.csv')

column_names = list(df.columns)
print("✅ Data loaded successfully!")
print(f"📊 Dataset shape: {df.shape}")
print(f"📋 Columns: {column_names}")

# Preview the first rows (rendered by the notebook's `display`).
print("\n🔍 First 5 rows:")
display(df.head())

# Column dtypes, non-null counts and memory usage.
rule = "=" * 60
print("\n" + rule)
print("DATASET OVERVIEW")
print(rule)
df.info()

In [None]:
# Headline counts for the dataset, followed by the frequency of each
# suitability label and the 15 most frequent CR categories.
rule = "=" * 60
print("\n" + rule)
print("DATA SUMMARY")
print(rule)

total_rows = len(df)
print(f"📈 Total records: {total_rows:,}")
print(f"🏢 Unique Job IDs: {df['jd_id'].nunique():,}")
print(f"👥 Unique CR Categories: {df['cr_category'].nunique():,}")
print(f"🎯 Suitability levels: {df['suitability'].nunique()}")

# Each suitability label with its row count and share of all rows.
print("\n📊 Suitability Distribution:")
suitability_counts = df['suitability'].value_counts()
for level, n_level in suitability_counts.items():
    share = n_level / total_rows * 100
    print(f"  {level}: {n_level:,} ({share:.1f}%)")

# value_counts() sorts by frequency, so head(15) is the 15 largest categories.
print("\n📋 CR Categories (Top 15):")
cr_counts = df['cr_category'].value_counts()
for rank, (category, n_category) in enumerate(cr_counts.head(15).items(), start=1):
    print(f"  {rank:2d}. {category}: {n_category:,}")

In [None]:
# Descriptive statistics for the four similarity columns, then a per-column
# report of missing values.
rule = "=" * 60
print("\n" + rule)
print("SIMILARITY SCORES STATISTICS")
print(rule)

similarity_cols = ['primary_skills_sim', 'secondary_skills_sim', 'adjectives_sim', 'total_similarity']
stats_df = df[similarity_cols].describe()
print(stats_df.round(4))

# Null count per column; only columns that actually have gaps are listed.
print("\n🔍 Missing Values:")
missing = df.isnull().sum()
columns_with_gaps = missing[missing > 0]
if columns_with_gaps.empty:
    print("  ✅ No missing values found!")
else:
    for column, n_missing in columns_with_gaps.items():
        print(f"  {column}: {n_missing}")

## 📊 Data Visualizations

In [None]:
# 1. Suitability Distribution
# Left panel: pie chart of the share of each suitability label.
# Right panel: bar chart of the raw counts, with each bar annotated.
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Pie chart
suitability_counts = df['suitability'].value_counts()
colors = ['#ff9999', '#66b3ff', '#99ff99']
axes[0].pie(suitability_counts.values, labels=suitability_counts.index, autopct='%1.1f%%',
            colors=colors, startangle=90)
axes[0].set_title('📊 Suitability Distribution', fontsize=14, fontweight='bold')

# Bar chart
bars = axes[1].bar(suitability_counts.index, suitability_counts.values, color=colors)
axes[1].set_title('📈 Suitability Counts', fontsize=14, fontweight='bold')
axes[1].set_ylabel('Count')
axes[1].tick_params(axis='x', rotation=45)

# Add value labels on bars.
# The label is anchored at the top of the bar and pushed up by a fixed number
# of points, so the gap looks the same whatever the magnitude of the counts
# (a data-unit offset such as "+ 1000" only suits one dataset scale).
for bar in bars:
    height = bar.get_height()
    axes[1].annotate(f'{int(height):,}',
                     xy=(bar.get_x() + bar.get_width() / 2, height),
                     xytext=(0, 3), textcoords='offset points',
                     ha='center', va='bottom')

plt.tight_layout()
plt.show()

# Same numbers as text: count and share of all rows per label.
print("\n📊 Suitability Statistics:")
for suit, count in suitability_counts.items():
    pct = count / len(df) * 100
    print(f"  {suit}: {count:,} records ({pct:.1f}%)")

In [None]:
# 2. CR Categories Distribution
# Horizontal bar chart of the 20 most frequent CR categories, largest on top.
plt.figure(figsize=(16, 10))

# Top 20 categories (value_counts() is already sorted by frequency)
top_categories = df['cr_category'].value_counts().head(20)

# Create horizontal bar chart, one Set3 colour per bar
plt.barh(range(len(top_categories)), top_categories.values,
         color=plt.cm.Set3(np.linspace(0, 1, len(top_categories))))

plt.yticks(range(len(top_categories)), top_categories.index)
plt.xlabel('Number of Records')
plt.title('👥 Top 20 CR Categories Distribution', fontsize=16, fontweight='bold', pad=20)
plt.gca().invert_yaxis()  # put the most frequent category at the top

# Add value labels.
# Anchor each label at the end of its bar and shift it right by a fixed number
# of points, so the spacing does not depend on the scale of the counts.
for position, value in enumerate(top_categories.values):
    plt.annotate(f'{value:,}', xy=(value, position),
                 xytext=(4, 0), textcoords='offset points',
                 va='center', ha='left')

plt.tight_layout()
plt.show()

total_categories = df['cr_category'].nunique()
print(f"\n👥 Total CR Categories: {total_categories}")
print(f"📊 Showing top 20 out of {total_categories} categories")

In [None]:
# 3. Similarity Scores Distribution
# A 2x2 grid of histograms, one per similarity column, each marked with its
# mean (red) and median (blue).
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('📈 Similarity Scores Distribution', fontsize=16, fontweight='bold')

similarity_cols = ['primary_skills_sim', 'secondary_skills_sim', 'adjectives_sim', 'total_similarity']
titles = ['🎯 Primary Skills Similarity', '🔧 Secondary Skills Similarity',
          '📝 Adjectives Similarity', '📊 Total Similarity']

# axes.flat walks the grid row by row, so panel k sits at (k // 2, k % 2).
for panel_idx, (ax, col, title) in enumerate(zip(axes.flat, similarity_cols, titles)):
    scores = df[col]

    # Histogram, coloured with the panel_idx-th entry of the Set2 colormap
    ax.hist(scores, bins=50, alpha=0.7, color=plt.cm.Set2(panel_idx))
    ax.set_title(title)
    ax.set_xlabel('Similarity Score')
    ax.set_ylabel('Frequency')

    # Dashed reference lines for the mean and median, labelled in the legend
    mean_val = scores.mean()
    median_val = scores.median()
    ax.axvline(mean_val, color='red', linestyle='--', alpha=0.8, label=f'Mean: {mean_val:.3f}')
    ax.axvline(median_val, color='blue', linestyle='--', alpha=0.8, label=f'Median: {median_val:.3f}')
    ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# 4. Correlation Heatmap
# Pairwise correlations between the similarity features, drawn as a
# lower-triangle heatmap, followed by a printed list of the strongest pairs.
plt.figure(figsize=(10, 8))

# Select numeric columns for correlation
numeric_cols = ['primary_skills_sim', 'secondary_skills_sim', 'adjectives_sim',
                'adj_weight', 'total_similarity']
corr_matrix = df[numeric_cols].corr()

# Hide the diagonal and everything above it: the matrix is symmetric, so the
# lower triangle already shows every pair once.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
sns.heatmap(corr_matrix, mask=mask, annot=True, cmap='coolwarm', center=0,
            square=True, linewidths=0.5, cbar_kws={"shrink": .8})

plt.title('🔥 Correlation Heatmap - Similarity Features', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

print("\n🔍 Key Correlations:")
# Collect every pair above the diagonal (k=1 skips the diagonal itself) whose
# absolute correlation exceeds 0.3.
feature_names = corr_matrix.columns
upper_rows, upper_cols = np.triu_indices(len(feature_names), k=1)
corr_pairs = [
    (feature_names[r], feature_names[c], corr_matrix.iloc[r, c])
    for r, c in zip(upper_rows, upper_cols)
    if abs(corr_matrix.iloc[r, c]) > 0.3
]

# Strongest relationships first; print at most five.
corr_pairs = sorted(corr_pairs, key=lambda pair: abs(pair[2]), reverse=True)
for col1, col2, corr_val in corr_pairs[:5]:
    print(f"  {col1} ↔ {col2}: {corr_val:.3f}")

In [None]:
# 5. Suitability by CR Category (Top 15 categories)
# Stacked bar chart: for each of the 15 most frequent CR categories, the
# percentage of its rows that fall under each suitability label.

# Get top 15 categories by count
top_15_categories = df['cr_category'].value_counts().head(15).index
df_top15 = df[df['cr_category'].isin(top_15_categories)]

# Create crosstab; normalize='index' makes each row (category) sum to 1, and
# the * 100 turns those fractions into percentages.
crosstab = pd.crosstab(df_top15['cr_category'], df_top15['suitability'], normalize='index') * 100

# Create stacked bar chart.
# Draw on an explicitly created axes: calling DataFrame.plot() without `ax`
# opens a figure of its own, so a preceding bare plt.figure() would be left
# behind as an empty figure in the notebook output.
fig, ax = plt.subplots(figsize=(16, 8))
crosstab.plot(kind='bar', stacked=True, ax=ax,
              color=['#ff9999', '#66b3ff', '#99ff99'])

plt.title('🎯 Suitability Distribution by CR Category (Top 15)', fontsize=16, fontweight='bold')
plt.xlabel('CR Category')
plt.ylabel('Percentage (%)')
plt.legend(title='Suitability', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

print("\n📊 Categories with highest 'Most Suitable' percentage:")
# The crosstab only has a column for labels that occur in the top-15 subset,
# so check before indexing instead of failing with a KeyError.
if 'Most Suitable' in crosstab.columns:
    most_suitable_pct = crosstab['Most Suitable'].sort_values(ascending=False)
    for i, (cat, pct) in enumerate(most_suitable_pct.head(5).items(), 1):
        print(f"  {i}. {cat}: {pct:.1f}%")
else:
    print("  (no 'Most Suitable' records among the top 15 categories)")

In [None]:
# 6. Similarity Scores by Suitability (Box plots)
# A 2x2 grid of box plots: one similarity column per panel, grouped by
# suitability label.
figure_heading = '📦 Similarity Scores by Suitability Level'
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle(figure_heading, fontsize=16, fontweight='bold')

similarity_cols = ['primary_skills_sim', 'secondary_skills_sim', 'adjectives_sim', 'total_similarity']
titles = ['🎯 Primary Skills', '🔧 Secondary Skills', '📝 Adjectives', '📊 Total Similarity']

# axes.flat walks the grid row by row, matching the order of the two lists.
for ax, col, title in zip(axes.flat, similarity_cols, titles):
    df.boxplot(column=col, by='suitability', ax=ax)
    # Set the panel title and axis labels after the pandas boxplot call so
    # these are the ones that end up on the panel.
    ax.set_title(title)
    ax.set_xlabel('Suitability')
    ax.set_ylabel('Similarity Score')

# Apply the figure heading again once all the boxplot calls are done, so it
# is the last figure-level title set.
plt.suptitle(figure_heading, fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

In [None]:
# 7. Job Analysis
# Per-job statistics: which jobs have the most matching records, which have
# the highest mean total similarity, and a bar chart of the 20 largest jobs.
print("\n" + "="*60)
print("🏢 JOB ANALYSIS")
print("="*60)

# Jobs with most records (value_counts() sorts by frequency, descending)
job_counts = df['jd_id'].value_counts()
print("\n📈 Jobs with most matching records (Top 10):")
for i, (job_id, count) in enumerate(job_counts.head(10).items(), 1):
    print(f"  {i:2d}. {job_id}: {count:,} records")

# Average similarity by job, highest mean first
job_avg_similarity = df.groupby('jd_id')['total_similarity'].mean().sort_values(ascending=False)
print("\n🎯 Jobs with highest average similarity (Top 10):")
for i, (job_id, avg_sim) in enumerate(job_avg_similarity.head(10).items(), 1):
    print(f"  {i:2d}. {job_id}: {avg_sim:.3f}")

# Visualization: Job distribution
plt.figure(figsize=(14, 8))

# Top 20 jobs by record count, coloured along the viridis colormap
top_jobs = job_counts.head(20)
bars = plt.bar(range(len(top_jobs)), top_jobs.values,
               color=plt.cm.viridis(np.linspace(0, 1, len(top_jobs))))

plt.xticks(range(len(top_jobs)), top_jobs.index, rotation=45, ha='right')
plt.xlabel('Job ID')
plt.ylabel('Number of Records')
plt.title('🏢 Top 20 Jobs by Number of Matching Records', fontsize=14, fontweight='bold')

# Add value labels.
# Anchor each label at the top of its bar and lift it by a fixed number of
# points, so the gap is independent of the scale of the counts.
for bar, value in zip(bars, top_jobs.values):
    plt.annotate(f'{value:,}',
                 xy=(bar.get_x() + bar.get_width() / 2, value),
                 xytext=(0, 3), textcoords='offset points',
                 ha='center', va='bottom', fontsize=8)

plt.tight_layout()
plt.show()

In [None]:
# 8. Advanced Analysis
# Text-only cell: descriptive statistics for the two skills columns, the
# categories with the most rows whose secondary-skills similarity is exactly
# zero, and a breakdown of rows whose total_similarity exceeds 1.0.
print("\n" + "="*60)
print("🔬 ADVANCED ANALYSIS")
print("="*60)

# Skills analysis
print("\n🎯 Skills Similarity Analysis:")
skills_stats = df[['primary_skills_sim', 'secondary_skills_sim']].describe()
print(skills_stats.round(4))

# Categories with zero secondary skills
zero_secondary = df[df['secondary_skills_sim'] == 0]['cr_category'].value_counts()
# Row count per category, computed once up front so the loop below does a
# lookup instead of re-filtering the whole DataFrame on every iteration.
category_totals = df['cr_category'].value_counts()
print("\n🔧 Categories with most zero secondary skills (Top 10):")
for i, (cat, count) in enumerate(zero_secondary.head(10).items(), 1):
    total_cat = category_totals.loc[cat]
    pct = count / total_cat * 100
    print(f"  {i:2d}. {cat}: {count:,}/{total_cat:,} ({pct:.1f}%)")

# High similarity analysis
high_similarity = df[df['total_similarity'] > 1.0]
print(f"\n📊 Records with total_similarity > 1.0: {len(high_similarity):,} ({len(high_similarity)/len(df)*100:.1f}%)")

if len(high_similarity) > 0:
    print("\nTop categories with high similarity:")
    high_sim_cats = high_similarity['cr_category'].value_counts().head(10)
    for i, (cat, count) in enumerate(high_sim_cats.items(), 1):
        print(f"  {i:2d}. {cat}: {count:,}")

In [None]:
# 9. Summary and Insights
# Text-only recap: dataset size, suitability shares, mean similarity scores,
# and how many rows have an exactly-zero score in each similarity component.
print("\n" + "="*60)
print("💡 KEY INSIGHTS & SUMMARY")
print("="*60)

total_records = len(df)
unique_jobs = df['jd_id'].nunique()
unique_categories = df['cr_category'].nunique()

print("\n📊 Dataset Overview:")
print(f"  • Total matching records: {total_records:,}")
print(f"  • Unique job positions: {unique_jobs:,}")
print(f"  • Unique CV categories: {unique_categories:,}")
print(f"  • Average records per job: {total_records/unique_jobs:.1f}")

# Suitability insights: normalize=True yields fractions, * 100 gives percent
suitability_dist = df['suitability'].value_counts(normalize=True) * 100
print("\n🎯 Suitability Distribution:")
for suit, pct in suitability_dist.items():
    print(f"  • {suit}: {pct:.1f}%")

# Similarity insights
print("\n📈 Similarity Score Insights:")
print(f"  • Average primary skills similarity: {df['primary_skills_sim'].mean():.3f}")
print(f"  • Average secondary skills similarity: {df['secondary_skills_sim'].mean():.3f}")
print(f"  • Average adjectives similarity: {df['adjectives_sim'].mean():.3f}")
print(f"  • Average total similarity: {df['total_similarity'].mean():.3f}")

# Data quality insights: number of rows where each score is exactly 0
zero_primary = (df['primary_skills_sim'] == 0).sum()
zero_secondary = (df['secondary_skills_sim'] == 0).sum()
zero_adjectives = (df['adjectives_sim'] == 0).sum()

print("\n🔍 Data Quality:")
print(f"  • Records with zero primary skills: {zero_primary:,} ({zero_primary/total_records*100:.1f}%)")
print(f"  • Records with zero secondary skills: {zero_secondary:,} ({zero_secondary/total_records*100:.1f}%)")
print(f"  • Records with zero adjectives: {zero_adjectives:,} ({zero_adjectives/total_records*100:.1f}%)")

print("\n✅ Analysis completed successfully!")
# Sections 1-7 each draw one figure; sections 8 and 9 only print text, so the
# notebook produces seven visualizations in total.
print("📊 Total visualizations created: 7")
print("🎯 Ready for machine learning model training!")