# Data exploration
Initial exploratory data analysis (EDA) and visualization of the legal text dataset.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Notebook-wide plotting defaults: seaborn's white grid theme and a wide
# default figure size for every figure that does not pass its own figsize.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

In [None]:
# Read the training split first, then the evaluation split, from the
# processed-data directory.
neptun_df, eval_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/data/processed/neptun_data.csv',
        '/data/processed/evaluation.csv',
    )
)

# Report how many rows each split contains.
print(f"Train data: {len(neptun_df)} samples")
print(f"Evaluation data: {len(eval_df)} samples")

## Basic statistics

In [None]:
# Store the character count of every document as a new column on each split.
for frame in (neptun_df, eval_df):
    frame['text_length'] = frame['text'].str.len()

# Print the describe() summary of the character counts, train split first.
for heading, frame in (
    ("Train dataset:", neptun_df),
    ("\nEvaluation dataset:", eval_df),
):
    print(heading)
    print(frame['text_length'].describe())

In [None]:
# Store the number of whitespace-separated tokens per document on each split.
for frame in (neptun_df, eval_df):
    frame['word_count'] = frame['text'].str.split().str.len()

# Print the describe() summary of the word counts, train split first.
for heading, frame in (
    ("Train dataset - word count:", neptun_df),
    ("\nEvaluation dataset - word count:", eval_df),
):
    print(heading)
    print(frame['word_count'].describe())

## Text complexity metrics

In [None]:
def calculate_complexity_metrics(text):
    """Compute surface-level complexity metrics for a single document.

    Args:
        text: The document. Any value that is not a ``str`` (for example
            ``None`` or the float ``NaN`` pandas uses for missing cells) is
            treated as an empty document instead of raising.

    Returns:
        A dict of floats with these keys:

        * ``avg_sentence_length`` - whitespace-separated tokens per
          sentence, where sentences are the non-blank pieces between runs
          of ``.``, ``!`` or ``?``.
        * ``lexical_diversity`` - distinct tokens / total tokens
          (case-sensitive; punctuation stays attached to the token).
        * ``avg_word_length`` - mean token length in characters.
        * ``punctuation_ratio`` - punctuation characters / all characters.
        * ``uppercase_ratio`` - uppercase characters / all characters.

        Any metric whose denominator would be zero is reported as ``0.0``.
    """
    # Missing cells must not abort a whole DataFrame.apply run.
    if not isinstance(text, str):
        text = ''

    def ratio(numerator, denominator):
        # Safe division shared by all metrics: empty input yields 0.0.
        return numerator / denominator if denominator else 0.0

    # Only pieces with non-whitespace content count as sentences.
    sentence_count = sum(
        1 for piece in re.split(r'[.!?]+', text) if piece.strip()
    )

    words = text.split()
    word_count = len(words)
    char_count = len(text)

    punctuation_count = len(re.findall(r'[.,;:!?()\[\]{}"\'-]', text))
    uppercase_count = sum(1 for c in text if c.isupper())

    return {
        'avg_sentence_length': ratio(word_count, sentence_count),
        'lexical_diversity': ratio(len(set(words)), word_count),
        'avg_word_length': ratio(sum(map(len, words)), word_count),
        'punctuation_ratio': ratio(punctuation_count, char_count),
        'uppercase_ratio': ratio(uppercase_count, char_count),
    }

print("Calculating complexity metrics for train dataset...")
# One dict of metrics per document, in the same row order as neptun_df.
complexity_metrics = neptun_df['text'].apply(calculate_complexity_metrics)
# Reuse neptun_df's index so the metric rows stay aligned with their
# documents even if the frame was filtered or re-indexed after loading.
complexity_df = pd.DataFrame(complexity_metrics.tolist(), index=neptun_df.index)
# assign() overwrites metric columns that already exist, so re-running this
# cell does not append duplicate columns the way pd.concat(axis=1) would.
neptun_df = neptun_df.assign(
    **{column: complexity_df[column] for column in complexity_df.columns}
)

print("\nComplexity metrics summary (train):")
print(complexity_df.describe())

In [None]:
# Metric columns to plot and their display captions, in matching order.
metrics = ['avg_sentence_length', 'lexical_diversity', 'avg_word_length',
           'punctuation_ratio', 'uppercase_ratio']
titles = ['Avg Sentence Length', 'Lexical Diversity', 'Avg Word Length',
          'Punctuation Ratio', 'Uppercase Ratio']

# A 2x3 grid gives six panels; the five metrics fill the first five.
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(18, 10))
axes = axes.flatten()

for ax, metric, title in zip(axes, metrics, titles):
    values = neptun_df[metric]
    ax.hist(values, bins=50, edgecolor='black', alpha=0.7)
    # Dashed red line marks the median of the metric.
    ax.axvline(values.median(), color='red', linestyle='--', label='Median')
    ax.set_title(f'{title} Distribution')
    ax.set_xlabel(title)
    ax.set_ylabel('Frequency')
    ax.legend()

# The sixth panel has no metric, so hide it.
axes[-1].axis('off')
plt.tight_layout()
plt.show()

In [None]:
# Per-label mean and standard deviation of each complexity metric.
print("\nComplexity metrics by label:")
by_label = neptun_df.groupby('label')
for metric in metrics:
    print(f"\n{metric.upper()}:")
    per_label_stats = by_label[metric].agg(['mean', 'std'])
    print(per_label_stats.round(3))

## Text length distribution

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# One panel per split: (axis, data, title, extra histogram styling).
panels = (
    (axes[0], neptun_df['text_length'],
     'Train - text length distribution', {}),
    (axes[1], eval_df['text_length'],
     'Evaluation - text length distribution', {'color': 'orange'}),
)
for ax, lengths, panel_title, hist_style in panels:
    ax.hist(lengths, bins=50, edgecolor='black', alpha=0.7, **hist_style)
    ax.set_title(panel_title)
    ax.set_xlabel('Character count')
    ax.set_ylabel('Frequency')
    # Dashed red line marks the median length of the split.
    ax.axvline(lengths.median(), color='red', linestyle='--', label='Median')
    ax.legend()

plt.tight_layout()
plt.show()

## Word count distribution

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# One panel per split: (axis, data, title, extra histogram styling).
panels = (
    (axes[0], neptun_df['word_count'],
     'Train - word count distribution', {}),
    (axes[1], eval_df['word_count'],
     'Evaluation - word count distribution', {'color': 'orange'}),
)
for ax, counts, panel_title, hist_style in panels:
    ax.hist(counts, bins=50, edgecolor='black', alpha=0.7, **hist_style)
    ax.set_title(panel_title)
    ax.set_xlabel('Word count')
    ax.set_ylabel('Frequency')
    # Dashed red line marks the median word count of the split.
    ax.axvline(counts.median(), color='red', linestyle='--', label='Median')
    ax.legend()

plt.tight_layout()
plt.show()

## Word frequency analysis

In [None]:
# Hand-picked Hungarian function words to exclude from frequency analyses.
hungarian_stopwords = {
    'a', 'az', 'egy', 'és', 'vagy', 'de', 'hogy', 'nem', 'van', 'volt',
    'lesz', 'lehet', 'mint', 'csak', 'is', 'ha', 'meg', 'el', 'fel',
    'ki', 'be', 'le', 'át', 'ezt', 'azt', 'ezen', 'azon', 'amely',
    'ami', 'aki', 'ahol', 'amikor', 'ahogy', 'minden', 'semmi', 'valami',
    'más', 'egyik', 'másik', 'mindkét', 'több', 'kevés', 'sok', 'néhány',
    'által', 'között', 'alatt', 'felett', 'mellett', 'után', 'előtt', 'nélkül',
    'miatt', 'számára', 'szerint', 'ellen', 'körül', 'során', 'keresztül',
    'illetve', 'valamint', 'továbbá', 'azonban', 'tehát', 'ezért', 'mert',
}

# Concatenate the whole training corpus and pull out lower-cased runs of
# Hungarian letters as tokens.
all_text = ' '.join(neptun_df['text'].tolist())
word_pattern = re.compile(r'\b[a-záéíóöőúüű]+\b')
words = word_pattern.findall(all_text.lower())

# Token frequencies over the full corpus, stop words still included.
word_freq = Counter(words)
top_30 = word_freq.most_common(30)

print("Top 30 most common words:")
for word, count in top_30:
    print(f"{word:20s}: {count:5d}")

In [None]:
# Keep only tokens longer than two characters that are not stop words.
words_no_stop = [
    token for token in words
    if len(token) > 2 and token not in hungarian_stopwords
]
word_freq_no_stop = Counter(words_no_stop)
top_30_no_stop = word_freq_no_stop.most_common(30)

print("Top 30 most common words (without stop words):")
for word, count in top_30_no_stop:
    print(f"{word:20s}: {count:5d}")

# Horizontal bar chart of the same ranking, most frequent word on top.
words_plot, counts_plot = zip(*top_30_no_stop)
bar_positions = range(len(words_plot))
plt.figure(figsize=(14, 8))
plt.barh(bar_positions, counts_plot, color='steelblue', edgecolor='black')
plt.yticks(bar_positions, words_plot)
plt.title('Top 30 most common words (without stop words)')
plt.xlabel('Frequency')
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

## Bigram and trigram analysis

In [None]:
# Count two-word sequences over the lower-cased training texts; the
# vectorizer keeps at most 30 features (max_features=30).
vectorizer_bigram = CountVectorizer(
    ngram_range=(2, 2),
    max_features=30,
    token_pattern=r'\b[a-záéíóöőúüű]+\b',
)
lowercase_texts = neptun_df['text'].str.lower()
bigram_matrix = vectorizer_bigram.fit_transform(lowercase_texts)
# Column sums give the corpus-wide count per bigram, flattened to 1-D.
bigram_freq = np.asarray(bigram_matrix.sum(axis=0)).ravel()
bigram_names = vectorizer_bigram.get_feature_names_out()

# (bigram, count) pairs ordered from most to least frequent.
bigram_data = sorted(
    zip(bigram_names, bigram_freq),
    key=lambda pair: pair[1],
    reverse=True,
)

print("Top 30 bigrams:")
for bigram, count in bigram_data[:30]:
    print(f"{bigram:40s}: {count:5.0f}")

# Bar chart of the 20 most frequent bigrams, largest on top.
bigrams_plot, counts_plot = zip(*bigram_data[:20])
bar_positions = range(len(bigrams_plot))
plt.figure(figsize=(14, 8))
plt.barh(bar_positions, counts_plot, color='coral', edgecolor='black')
plt.yticks(bar_positions, bigrams_plot)
plt.title('Top 20 bigrams')
plt.xlabel('Frequency')
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

In [None]:
# Count three-word sequences over the lower-cased training texts; the
# vectorizer keeps at most 30 features (max_features=30).
vectorizer_trigram = CountVectorizer(
    ngram_range=(3, 3),
    max_features=30,
    token_pattern=r'\b[a-záéíóöőúüű]+\b',
)
lowercase_texts = neptun_df['text'].str.lower()
trigram_matrix = vectorizer_trigram.fit_transform(lowercase_texts)
# Column sums give the corpus-wide count per trigram, flattened to 1-D.
trigram_freq = np.asarray(trigram_matrix.sum(axis=0)).ravel()
trigram_names = vectorizer_trigram.get_feature_names_out()

# (trigram, count) pairs ordered from most to least frequent.
trigram_data = sorted(
    zip(trigram_names, trigram_freq),
    key=lambda pair: pair[1],
    reverse=True,
)

print("Top 30 trigrams:")
for trigram, count in trigram_data[:30]:
    print(f"{trigram:50s}: {count:5.0f}")

# Bar chart of the 20 most frequent trigrams, largest on top.
trigrams_plot, counts_plot = zip(*trigram_data[:20])
bar_positions = range(len(trigrams_plot))
plt.figure(figsize=(14, 8))
plt.barh(bar_positions, counts_plot, color='lightgreen', edgecolor='black')
plt.yticks(bar_positions, trigrams_plot, fontsize=9)
plt.title('Top 20 trigrams')
plt.xlabel('Frequency')
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

## TF-IDF analysis by label

In [None]:
print("Top TF-IDF words by label:\n")

# The stop-word list is the same for every label, so build it once.
stop_word_list = list(hungarian_stopwords)

for label in sorted(neptun_df['label'].unique()):
    label_texts = neptun_df.loc[neptun_df['label'] == label, 'text'].values

    # NOTE(review): the vectorizer is fitted on this label's documents only,
    # so the IDF weights reflect rarity within the label, not across labels.
    tfidf = TfidfVectorizer(
        max_features=15,
        token_pattern=r'\b[a-záéíóöőúüű]{3,}\b',
        stop_words=stop_word_list,
    )
    tfidf_matrix = tfidf.fit_transform(label_texts)

    # Mean TF-IDF weight of every retained term over the label's documents.
    feature_names = tfidf.get_feature_names_out()
    avg_tfidf = np.asarray(tfidf_matrix.mean(axis=0)).ravel()

    top_words = sorted(
        zip(feature_names, avg_tfidf),
        key=lambda pair: pair[1],
        reverse=True,
    )

    print(f"Label {label}:")
    for word, score in top_words[:10]:
        print(f"  {word:20s}: {score:.4f}")
    print()

In [None]:
# One bar chart per label showing its ten highest mean-TF-IDF terms.
labels = sorted(neptun_df['label'].unique())

# Size the grid from the number of labels (three panels per row) instead of
# assuming exactly five; five labels still give the 2x3, 18x10 layout.
n_cols = 3
n_rows = max(1, -(-len(labels) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 5 * n_rows))
axes = axes.flatten()

# The stop-word list is the same for every label, so build it once.
stop_word_list = list(hungarian_stopwords)

for ax, label in zip(axes, labels):
    label_texts = neptun_df[neptun_df['label'] == label]['text'].values

    tfidf = TfidfVectorizer(max_features=10, token_pattern=r'\b[a-záéíóöőúüű]{3,}\b',
                            stop_words=stop_word_list)
    tfidf_matrix = tfidf.fit_transform(label_texts)

    # Mean TF-IDF weight of every retained term over the label's documents.
    feature_names = tfidf.get_feature_names_out()
    avg_tfidf = tfidf_matrix.mean(axis=0).A1

    top_words = sorted(zip(feature_names, avg_tfidf), key=lambda x: x[1], reverse=True)
    # Named top_terms rather than `words` so the corpus token list built in
    # the word-frequency cell is not overwritten.
    top_terms, scores = zip(*top_words)

    ax.barh(range(len(top_terms)), scores, color='skyblue', edgecolor='black')
    ax.set_yticks(range(len(top_terms)))
    ax.set_yticklabels(top_terms, fontsize=9)
    ax.set_xlabel('TF-IDF score')
    ax.set_title(f'Label {label} - top TF-IDF words')
    ax.invert_yaxis()

# Hide only the panels that received no label.
for unused_ax in axes[len(labels):]:
    unused_ax.axis('off')

plt.tight_layout()
plt.show()

## Text length by label

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# One boxplot panel per split, grouped by label: (axis, frame, title).
for ax, frame, panel_title in (
    (axes[0], neptun_df, 'Train - text length by label'),
    (axes[1], eval_df, 'Evaluation - text length by label'),
):
    frame.boxplot(column='text_length', by='label', ax=ax)
    ax.set_title(panel_title)
    ax.set_xlabel('Label')
    ax.set_ylabel('Character count')
    # Make this panel current so plt.xticks acts on it; keep labels level.
    plt.sca(ax)
    plt.xticks(rotation=0)

plt.tight_layout()
plt.show()

## Dataset comparison

In [None]:
# Side-by-side summary of the two splits; each split contributes one column
# whose entries follow the order of the 'Metric' labels.
comparison = pd.DataFrame({
    'Metric': ['Samples', 'Avg text length', 'Avg word count', 'Min length', 'Max length'],
    **{
        split_name: [
            len(frame),
            frame['text_length'].mean(),
            frame['word_count'].mean(),
            frame['text_length'].min(),
            frame['text_length'].max(),
        ]
        for split_name, frame in (('Train', neptun_df), ('Evaluation', eval_df))
    },
})

print(comparison.to_string(index=False))