# Label Analysis
Analyzing the distribution and properties of the target labels (readability scores).

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re
from scipy import stats

sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

In [None]:
# Read the processed NEPTUN and evaluation CSV files into DataFrames
neptun_df = pd.read_csv('/data/processed/neptun_data.csv')
eval_df = pd.read_csv('/data/processed/evaluation.csv')

# Report how many rows each file contributed
for split_name, frame in (("NEPTUN", neptun_df), ("Evaluation", eval_df)):
    print(f"{split_name} data: {len(frame)} samples")

## Calculate Complexity Metrics

In [None]:
def calculate_complexity_metrics(text):
    """Compute surface-level complexity metrics for a single text.

    Args:
        text: The text to analyse. ``None`` and float NaN (the value pandas
            uses for an empty CSV cell) are treated as an empty text; any
            other non-string value is converted with ``str()``.

    Returns:
        dict with the keys:
            avg_sentence_length: whitespace-separated words per sentence,
                where sentences are the non-blank pieces between runs of
                ``.``, ``!`` or ``?``.
            lexical_diversity: distinct words / total words (case-sensitive;
                punctuation attached to a word counts as part of it).
            avg_word_length: mean number of characters per word.
            punctuation_ratio: punctuation characters / total characters.
            uppercase_ratio: uppercase characters / total characters.
        Each value is 0 when its denominator would be 0.
    """
    # Without this guard a missing cell would reach re.split() as a float
    # and raise TypeError, aborting the whole .apply() over the column.
    if not isinstance(text, str):
        is_missing = text is None or (isinstance(text, float) and np.isnan(text))
        text = '' if is_missing else str(text)

    # Sentences: split on terminator runs, then drop blank fragments
    fragments = (piece.strip() for piece in re.split(r'[.!?]+', text))
    sentence_count = sum(1 for fragment in fragments if fragment)

    words = text.split()
    word_count = len(words)
    char_count = len(text)

    punctuation_count = len(re.findall(r'[.,;:!?()\[\]{}"\'-]', text))
    uppercase_count = sum(1 for c in text if c.isupper())

    return {
        'avg_sentence_length': word_count / sentence_count if sentence_count > 0 else 0,
        'lexical_diversity': len(set(words)) / word_count if word_count > 0 else 0,
        'avg_word_length': np.mean([len(w) for w in words]) if words else 0,
        'punctuation_ratio': punctuation_count / char_count if char_count > 0 else 0,
        'uppercase_ratio': uppercase_count / char_count if char_count > 0 else 0
    }

print("Calculating complexity metrics...")

# One metrics dict per NEPTUN row, expanded into one column per metric and
# attached side by side to the (re-indexed) original rows
complexity_metrics = neptun_df['text'].apply(calculate_complexity_metrics)
complexity_df = pd.DataFrame(list(complexity_metrics))
neptun_df = pd.concat([neptun_df.reset_index(drop=True), complexity_df], axis=1)

# Character count and whitespace-token count for both datasets
for frame in (neptun_df, eval_df):
    frame['text_length'] = frame['text'].str.len()
    frame['word_count'] = frame['text'].str.split().str.len()

print("Done!")

## Label Distribution

In [None]:
# Display name for each of the five label values
label_names = dict(enumerate(
    ['Very Hard', 'Hard', 'Moderate', 'Easy', 'Very Easy'], start=1))


def _print_label_distribution(heading, frame):
    """Print count and percentage per label of *frame*; return the sorted counts."""
    print(heading)
    counts = frame['label'].value_counts().sort_index()
    total = len(frame)
    for label, count in counts.items():
        pct = (count / total) * 100
        print(f"  {label} ({label_names[label]}): {count:4d} ({pct:5.1f}%)")
    return counts


neptun_counts = _print_label_distribution(
    "NEPTUN Dataset - Label Distribution:", neptun_df)
eval_counts = _print_label_distribution(
    "\nEvaluation Dataset - Label Distribution:", eval_df)

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# (axis, counts, title, bar colour, vertical offset of the count annotation)
bar_panels = (
    (axes[0], neptun_counts, 'NEPTUN - Label Distribution', 'steelblue', 10),
    (axes[1], eval_counts, 'Evaluation - Label Distribution', 'coral', 1),
)

for ax, counts, title, colour, text_offset in bar_panels:
    counts.plot(kind='bar', ax=ax, color=colour, edgecolor='black')
    ax.set_title(title)
    ax.set_xlabel('Label')
    ax.set_ylabel('Count')
    tick_text = [f"{l}\n{label_names[l]}" for l in counts.index]
    ax.set_xticklabels(tick_text, rotation=45, ha='right')
    # Write each bar's count just above its top
    for position, value in enumerate(counts.values):
        ax.text(position, value + text_offset, str(value), ha='center', va='bottom')

plt.tight_layout()
plt.show()

## Complexity vs Label Correlation Analysis

In [None]:
# Pairwise correlations between the label and every complexity metric
complexity_features = [
    'label', 'text_length', 'word_count', 'avg_sentence_length',
    'lexical_diversity', 'avg_word_length', 'punctuation_ratio', 'uppercase_ratio',
]

correlation_matrix = neptun_df[complexity_features].corr()

# Text summary: the 'label' column of the matrix, strongest positive first
print("Correlation Matrix - All Features vs Label:")
label_correlations = correlation_matrix['label'].sort_values(ascending=False)
print(label_correlations)

# Annotated heatmap of the full matrix, colour scale centred on zero
heatmap_options = dict(
    annot=True, fmt='.3f', cmap='coolwarm', center=0,
    square=True, linewidths=1, cbar_kws={"shrink": 0.8},
)
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Correlation Matrix: Complexity Metrics vs Label')
plt.tight_layout()
plt.show()

In [None]:
# One scatter panel per metric (x) against the label (y), in a 2x3 grid
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
axes = axes.flatten()

metrics_to_plot = ['text_length', 'word_count', 'avg_sentence_length',
                   'lexical_diversity', 'avg_word_length', 'punctuation_ratio']
titles = ['Text Length', 'Word Count', 'Avg Sentence Length',
          'Lexical Diversity', 'Avg Word Length', 'Punctuation Ratio']

annotation_box = dict(boxstyle='round', facecolor='wheat', alpha=0.5)

for idx, (metric, title) in enumerate(zip(metrics_to_plot, titles)):
    ax = axes[idx]

    ax.scatter(neptun_df[metric], neptun_df['label'], alpha=0.3, s=20)
    ax.set_xlabel(title)
    ax.set_ylabel('Label')
    ax.set_title(f'{title} vs Label')
    ax.set_yticks([1, 2, 3, 4, 5])
    ax.grid(True, alpha=0.3)

    # Annotate the panel with the metric/label correlation coefficient
    pair_corr = neptun_df[[metric, 'label']].corr()
    corr = pair_corr.loc[metric, 'label']
    ax.text(0.05, 0.95, f'r = {corr:.3f}',
            transform=ax.transAxes,
            bbox=annotation_box,
            verticalalignment='top')

plt.tight_layout()
plt.show()

## Polynomial Fit Analysis

In [None]:
# Compare a straight-line fit with a degree-2 polynomial fit of the label on
# each of four metrics; one subplot per metric, both curves drawn over the data.
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

fig, axes = plt.subplots(2, 2, figsize=(16, 12))
axes = axes.flatten()

# Column names in neptun_df and the matching human-readable titles (same order)
key_metrics = ['text_length', 'lexical_diversity', 'avg_sentence_length', 'avg_word_length']
metric_titles = ['Text Length', 'Lexical Diversity', 'Avg Sentence Length', 'Avg Word Length']

for idx, (metric, title) in enumerate(zip(key_metrics, metric_titles)):
    # Feature as a single-column 2-D array, label as a 1-D array
    X = neptun_df[metric].values.reshape(-1, 1)
    y = neptun_df['label'].values
    
    # Remove NaN values
    # NOTE(review): only the feature is checked for NaN; labels are assumed
    # to be complete — confirm against the data.
    mask = ~np.isnan(X.flatten())
    X_clean = X[mask]
    y_clean = y[mask]
    
    # Scatter plot
    axes[idx].scatter(X_clean, y_clean, alpha=0.3, s=20, label='Data')
    
    # Linear fit
    lr = LinearRegression()
    lr.fit(X_clean, y_clean)
    # 100 evenly spaced points across the observed feature range, used only
    # to draw the fitted curves
    X_range = np.linspace(X_clean.min(), X_clean.max(), 100).reshape(-1, 1)
    y_pred_linear = lr.predict(X_range)
    # R² is computed on the same rows the model was fitted on (in-sample)
    r2_linear = r2_score(y_clean, lr.predict(X_clean))
    axes[idx].plot(X_range, y_pred_linear, 'r-', linewidth=2, 
                   label=f'Linear (R²={r2_linear:.3f})')
    
    # Polynomial fit (degree 2)
    poly = PolynomialFeatures(degree=2)
    X_poly = poly.fit_transform(X_clean)
    lr_poly = LinearRegression()
    lr_poly.fit(X_poly, y_clean)
    # Expand the plotting grid with the same fitted transformer before predicting
    X_range_poly = poly.transform(X_range)
    y_pred_poly = lr_poly.predict(X_range_poly)
    # Also in-sample, so directly comparable with r2_linear above
    r2_poly = r2_score(y_clean, lr_poly.predict(X_poly))
    axes[idx].plot(X_range, y_pred_poly, 'g--', linewidth=2,
                   label=f'Polynomial (R²={r2_poly:.3f})')
    
    axes[idx].set_xlabel(title)
    axes[idx].set_ylabel('Label')
    axes[idx].set_title(f'{title} vs Label - Fit Comparison')
    axes[idx].set_yticks([1, 2, 3, 4, 5])
    axes[idx].legend()
    axes[idx].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## Outlier Detection by Label

In [None]:
# Z-score based outlier detection
def detect_outliers_zscore(df, column, threshold=3):
    """Flag rows whose value lies more than *threshold* standard deviations from the mean.

    Mean and standard deviation are computed over the non-missing values of
    ``df[column]`` only, using the population standard deviation (ddof=0).
    A single NaN therefore no longer turns every z-score into NaN, which
    would silently report zero outliers.

    Args:
        df: DataFrame holding the data.
        column: Name of the numeric column to test.
        threshold: Absolute z-score above which a value counts as an outlier.

    Returns:
        NumPy boolean array with one entry per row of ``df`` (positional
        order). Missing values are never flagged; if the column has no
        non-missing values or zero variance, nothing is flagged.
    """
    values = df[column].to_numpy(dtype=float)
    present = ~np.isnan(values)
    outliers = np.zeros(values.shape, dtype=bool)

    if not present.any():
        return outliers

    observed = values[present]
    std = observed.std()
    # Zero spread means no value can deviate; also avoids a 0/0 division
    if std == 0:
        return outliers

    outliers[present] = np.abs((observed - observed.mean()) / std) > threshold
    return outliers

# IQR based outlier detection
def detect_outliers_iqr(df, column):
    """Flag values more than 1.5 interquartile ranges outside the quartiles.

    Args:
        df: DataFrame holding the data.
        column: Name of the column to test.

    Returns:
        Boolean Series indexed like ``df``; True where the value is below
        Q1 - 1.5*IQR or above Q3 + 1.5*IQR.
    """
    values = df[column]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile

    too_low = values < first_quartile - 1.5 * spread
    too_high = values > third_quartile + 1.5 * spread
    return too_low | too_high

print("Outlier Detection by Label\n" + "="*60)

# For every label value, count the rows flagged by each detector on
# text_length and lexical_diversity, then the union of all four flags
for label in sorted(neptun_df['label'].unique()):
    label_data = neptun_df.loc[neptun_df['label'] == label].copy()
    n_samples = len(label_data)

    # Detect outliers using both methods
    outliers_zscore_length = detect_outliers_zscore(label_data, 'text_length')
    outliers_iqr_length = detect_outliers_iqr(label_data, 'text_length')
    outliers_zscore_lex = detect_outliers_zscore(label_data, 'lexical_diversity')
    outliers_iqr_lex = detect_outliers_iqr(label_data, 'lexical_diversity')

    # Combined outliers (either method)
    combined_outliers = outliers_zscore_length | outliers_iqr_length | outliers_zscore_lex | outliers_iqr_lex
    n_combined = combined_outliers.sum()

    per_detector = (
        ("Z-score, text_length", outliers_zscore_length),
        ("IQR, text_length", outliers_iqr_length),
        ("Z-score, lexical_diversity", outliers_zscore_lex),
        ("IQR, lexical_diversity", outliers_iqr_lex),
    )

    print(f"\nLabel {label} ({label_names[label]}):")
    print(f"  Total samples: {n_samples}")
    for description, flags in per_detector:
        print(f"  Outliers ({description}): {flags.sum()}")
    print(f"  Combined outliers: {n_combined} ({n_combined/n_samples*100:.1f}%)")

In [None]:
# One panel per label: text length vs lexical diversity, with IQR-flagged
# rows drawn as red crosses over the remaining blue points
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
axes = axes.flatten()

for idx, label in enumerate(sorted(neptun_df['label'].unique())):
    ax = axes[idx]
    label_data = neptun_df.loc[neptun_df['label'] == label].copy()

    # A row is an outlier if either feature is flagged by the IQR rule
    outliers_length = detect_outliers_iqr(label_data, 'text_length')
    outliers_lex = detect_outliers_iqr(label_data, 'lexical_diversity')
    is_outlier = outliers_length | outliers_lex
    is_normal = ~is_outlier

    ax.scatter(label_data.loc[is_normal, 'text_length'],
               label_data.loc[is_normal, 'lexical_diversity'],
               alpha=0.5, s=30, label='Normal', color='blue')

    ax.scatter(label_data.loc[is_outlier, 'text_length'],
               label_data.loc[is_outlier, 'lexical_diversity'],
               alpha=0.8, s=50, label='Outlier', color='red', marker='x')

    ax.set_xlabel('Text Length')
    ax.set_ylabel('Lexical Diversity')
    ax.set_title(f'Label {label} - Outlier Detection (IQR)')
    ax.legend()
    ax.grid(True, alpha=0.3)

# Hide the sixth panel of the 2x3 grid
axes[5].axis('off')
plt.tight_layout()
plt.show()

In [None]:
# Show up to two example outlier texts per label, with the reason each was flagged
print("\nExample Outlier Texts:\n" + "="*80)

for label in sorted(neptun_df['label'].unique()):
    label_data = neptun_df[neptun_df['label'] == label].copy()

    # A row is an outlier if either feature is flagged by the IQR rule
    outliers_length = detect_outliers_iqr(label_data, 'text_length')
    outliers_lex = detect_outliers_iqr(label_data, 'lexical_diversity')
    combined_outliers = outliers_length | outliers_lex

    if not combined_outliers.any():
        continue

    outlier_samples = label_data[combined_outliers].head(2)

    print(f"\nLabel {label} ({label_names[label]}):")
    for idx, row in outlier_samples.iterrows():
        # Collect every applicable reason first so the line is always
        # terminated; printing the pieces with end="" left the separator
        # glued to the reason when only the length test fired.
        reasons = []
        if outliers_length.loc[idx]:
            reasons.append("Unusual text length")
        if outliers_lex.loc[idx]:
            reasons.append("Unusual lexical diversity")

        # str() keeps the preview working even if the text cell is not a string
        print(f"  Text: {str(row['text'])[:150]}...")
        print(f"  Length: {row['text_length']}, Lex Div: {row['lexical_diversity']:.3f}")
        print(f"  Reason: {', '.join(reasons)}")
        print("-" * 80)

## Class Imbalance Analysis

In [None]:
def calculate_imbalance_ratio(df):
    """Return the size of the most frequent label divided by the least frequent.

    Args:
        df: DataFrame with a 'label' column.

    Returns:
        Ratio of the largest to the smallest per-label row count.
    """
    class_sizes = df['label'].value_counts()
    largest, smallest = class_sizes.max(), class_sizes.min()
    return largest / smallest

# Largest-to-smallest class ratio for each dataset
neptun_ratio = calculate_imbalance_ratio(neptun_df)
eval_ratio = calculate_imbalance_ratio(eval_df)

for dataset_name, ratio in (("NEPTUN", neptun_ratio), ("Evaluation", eval_ratio)):
    print(f"{dataset_name} Imbalance Ratio: {ratio:.2f}:1")
print("\nNote: Ratio > 1.5 indicates class imbalance")

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# (axis, per-label counts, panel title) for the two side-by-side pies
pie_panels = (
    (axes[0], neptun_counts, 'NEPTUN - Label Proportions'),
    (axes[1], eval_counts, 'Evaluation - Label Proportions'),
)

for ax, counts, title in pie_panels:
    slice_names = [label_names[l] for l in counts.index]
    counts.plot(kind='pie', ax=ax, autopct='%1.1f%%', startangle=90,
                labels=slice_names)
    ax.set_title(title)
    # Clear the y-axis label on the pie
    ax.set_ylabel('')

plt.tight_layout()
plt.show()

## Dataset Comparison

In [None]:
# Grouped bar chart: percentage of each label in NEPTUN vs the evaluation set
fig, ax = plt.subplots(figsize=(12, 6))

# One bar group per known label, in a fixed sorted order shared by the
# bar positions, both percentage arrays and the tick labels
all_labels = sorted(label_names.keys())
x = np.arange(len(all_labels))
width = 0.35

# Reindex to the full label set so a label absent from one dataset plots as
# 0% instead of making the arrays shorter than x (which makes ax.bar fail).
neptun_pct = (neptun_counts.reindex(all_labels, fill_value=0) / len(neptun_df) * 100).values
eval_pct = (eval_counts.reindex(all_labels, fill_value=0) / len(eval_df) * 100).values

ax.bar(x - width/2, neptun_pct, width, label='NEPTUN', color='steelblue', edgecolor='black')
ax.bar(x + width/2, eval_pct, width, label='Evaluation', color='coral', edgecolor='black')

ax.set_xlabel('Label')
ax.set_ylabel('Percentage (%)')
ax.set_title('Label Distribution Comparison')
ax.set_xticks(x)
ax.set_xticklabels([f"{l}\n{label_names[l]}" for l in all_labels])
ax.legend()
ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

## Summary Statistics

In [None]:
# Final text summary: per-dataset class statistics, then key correlations
rule = "=" * 60
print(rule)
print("LABEL ANALYSIS SUMMARY")
print(rule)

dataset_summaries = (
    ("NEPTUN", neptun_df, neptun_ratio, neptun_counts),
    ("Evaluation", eval_df, eval_ratio, eval_counts),
)
for dataset_name, frame, ratio, counts in dataset_summaries:
    most_common = counts.idxmax()
    least_common = counts.idxmin()
    print(f"\n{dataset_name} Dataset:")
    print(f"  Total samples: {len(frame)}")
    print(f"  Number of classes: {frame['label'].nunique()}")
    print(f"  Imbalance ratio: {ratio:.2f}:1")
    print(f"  Most common label: {most_common} ({label_names[most_common]})")
    print(f"  Least common label: {least_common} ({label_names[least_common]})")

print(f"\nKey Correlations with Label:")
for feature in ['text_length', 'lexical_diversity', 'avg_sentence_length', 'avg_word_length']:
    # Off-diagonal entry of the 2x2 feature/label correlation matrix
    corr = neptun_df[[feature, 'label']].corr().loc[feature, 'label']
    print(f"  {feature:25s}: {corr:6.3f}")

print(rule)