In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from wordcloud import WordCloud

ModuleNotFoundError: No module named 'pandas'

In [2]:
def load_data(file_path):
    """Read processed data from a CSV file into a DataFrame.

    Args:
        file_path: Location of the CSV; handed unchanged to ``pd.read_csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    frame = pd.read_csv(file_path)
    return frame
 
def plot_category_distribution(data):
    """Draw a horizontal count plot of the job categories.

    Bars follow the order of ``value_counts()`` on the ``category`` column,
    so the most frequent category is listed first.

    Args:
        data: DataFrame that has a ``category`` column.
    """
    categories = data['category']
    # value_counts() sorts by descending count; its index is the bar order.
    ranked_categories = categories.value_counts().index

    plt.figure(figsize=(12, 6))
    sns.countplot(y=categories, order=ranked_categories)
    plt.xlabel('Count')
    plt.ylabel('Category')
    plt.title('Distribution of Job Categories')
    plt.show()
 
def plot_word_frequency(data, top_n=20):
    """Plot the frequency of the top N words across all resumes.

    Words are the whitespace-separated tokens of the ``processed_text``
    column. Rows whose text is missing (NaN) are skipped instead of raising
    a ``TypeError``.

    Args:
        data: DataFrame that has a ``processed_text`` column.
        top_n: How many of the most frequent words to plot (default 20).
    """
    # pd.read_csv parses empty fields as NaN (a float) by default, which
    # cannot be joined or split like a string, so drop those rows first.
    texts = data['processed_text'].dropna().astype(str)

    # Count document by document rather than building one huge joined string;
    # the resulting counts and tie order are the same.
    word_freq = Counter()
    for text in texts:
        word_freq.update(text.split())

    top_words = word_freq.most_common(top_n)
    words = [word for word, _ in top_words]
    counts = [count for _, count in top_words]

    plt.figure(figsize=(12, 6))
    sns.barplot(x=counts, y=words)
    plt.title(f'Top {top_n} Most Frequent Words')
    plt.xlabel('Frequency')
    plt.ylabel('Words')
    plt.show()
 
def generate_wordcloud(data):
    """Generate and display a word cloud from all resumes.

    The texts in the ``processed_text`` column are joined with single spaces
    and passed to ``WordCloud.generate``. Rows whose text is missing (NaN)
    are skipped instead of raising a ``TypeError`` during the join.

    Args:
        data: DataFrame that has a ``processed_text`` column.
    """
    # pd.read_csv parses empty fields as NaN (a float) by default; str.join
    # rejects non-strings, so leave those rows out of the combined text.
    texts = data['processed_text'].dropna().astype(str)
    all_text = ' '.join(texts)
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_text)

    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title('Word Cloud of Resume Content')
    plt.show()
 
def analyze_text_length(data):
    """Analyze and plot the distribution of text length.

    Computes the character length of every ``processed_text`` entry, stores
    it in a ``text_length`` column, and draws a histogram with a KDE overlay.
    Missing texts (NaN) are counted as length 0 instead of raising a
    ``TypeError``.

    Note:
        This adds (or overwrites) the ``text_length`` column on the
        DataFrame that was passed in; the caller's object is modified.

    Args:
        data: DataFrame that has a ``processed_text`` column.
    """
    # pd.read_csv parses empty fields as NaN by default, so a missing value
    # here stands for an empty text; treat it as '' so its length is 0.
    texts = data['processed_text'].fillna('').astype(str)
    data['text_length'] = texts.str.len()

    plt.figure(figsize=(10, 6))
    sns.histplot(data['text_length'], kde=True)
    plt.title('Distribution of Resume Text Length')
    plt.xlabel('Text Length')
    plt.ylabel('Count')
    plt.show()
 

In [None]:
# Entry point: runs the complete EDA pass on one CSV file
def perform_eda(file_path):
    """Load the CSV at ``file_path`` and run every EDA step on it.

    The steps run in a fixed order: category distribution, word frequency,
    word cloud, then text-length distribution.

    Args:
        file_path: Path handed to ``load_data``.
    """
    data = load_data(file_path)

    eda_steps = (
        plot_category_distribution,
        plot_word_frequency,
        generate_wordcloud,
        analyze_text_length,
    )
    for step in eda_steps:
        step(data)
 
# Run the full EDA pass on the processed resumes CSV (path is relative to
# the notebook's working directory).
perform_eda(file_path='data/processed/resumes.csv')