# Exploratory Data Analysis (EDA) - Emotions Dataset

This notebook provides comprehensive exploratory data analysis of the emotions training dataset.

## Analysis Sections:
1. **Dataset Overview** - Basic statistics and structure
2. **Class Distribution** - Emotion label analysis
3. **Text Length Analysis** - Character and word count distributions
4. **Vocabulary Analysis** - Unique words and vocabulary richness
5. **Word Frequency Analysis** - Most common words per emotion
6. **Word Clouds** - Visual representation of emotion-specific vocabulary
7. **N-gram Analysis** - Common phrases (bigrams, trigrams)
8. **Text Complexity Metrics** - Linguistic diversity measures
9. **Sample Text Inspection** - Example texts from each emotion
10. **Stopwords Analysis** - Stopword counts and ratios per emotion
11. **Statistical Summary** - Key insights and recommendations


## 1. Import Required Libraries


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import re
import nltk
from wordcloud import WordCloud
from nltk.corpus import stopwords
from nltk.util import ngrams
import warnings

# Silence library warnings so the cell outputs stay readable.
warnings.filterwarnings('ignore')

# Plot appearance: dark-grid matplotlib style plus the "husl" seaborn palette.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Fetch the NLTK resources quietly, one after the other.
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource, quiet=True)

# Confirm the setup and report the core library versions.
print("‚úÖ Libraries imported successfully")
for library_name, library in (("Pandas", pd), ("NumPy", np)):
    print(f"{library_name} version: {library.__version__}")


## 2. Load Training Data


In [None]:
# Read the training split into `df`, the DataFrame every later cell works on.
train_csv_path = './data/train.csv'
df = pd.read_csv(train_csv_path)

# Quick confirmation of what was loaded.
loaded_shape = df.shape
loaded_columns = list(df.columns)
print("‚úÖ Dataset loaded successfully")
print(f"Dataset shape: {loaded_shape}")
print(f"Columns: {loaded_columns}")


## 3. Dataset Overview


In [None]:
# Display first few rows
# The bare `df.head(10)` expression must stay the last statement of the cell:
# the notebook renders the value of the final expression as the cell output.
print("\nüìä First 10 rows of the dataset:")
df.head(10)


In [None]:
# Dataset information
# `df.info()` prints its own report (columns, non-null counts, dtypes), so no
# explicit print/display of its return value is needed.
print("\nüìã Dataset Information:")
df.info()


In [None]:
# Basic statistics
# include='all' asks describe() to summarise every column, not only numeric ones.
# The bare expression is rendered by the notebook as the cell output.
print("\nüìà Statistical Summary:")
df.describe(include='all')


In [None]:
# Per-column count of missing entries, followed by the grand total.
missing = df.isna().sum()
total_missing = missing.sum()

print("\nüîç Missing Values:")
print(missing)
print(f"\nTotal missing values: {total_missing}")


In [None]:
# Count rows that repeat an earlier row (the first occurrence is not counted).
duplicates = df.duplicated().sum()
duplicate_share = duplicates/len(df)*100
print(f"\nüîç Duplicate rows: {duplicates} ({duplicate_share:.2f}%)")

# When duplicates exist, show a few of them together with their first
# occurrences (keep=False flags every member of a duplicated group).
if duplicates > 0:
    print("\nSample duplicate entries:")
    repeated_rows = df[df.duplicated(keep=False)]
    display(repeated_rows.head(10))


## 4. Emotion Label Mapping & Distribution


In [None]:
# Integer class id -> readable emotion name (the list position is the id).
emotion_mapping = dict(enumerate(
    ['Sadness', 'Joy', 'Love', 'Anger', 'Fear', 'Surprise']
))

# Readable label column used by the group-bys and plots further down.
df['emotion_name'] = df['label'].map(emotion_mapping)

print("\nüè∑Ô∏è Emotion Label Mapping:")
for label in emotion_mapping:
    emotion = emotion_mapping[label]
    print(f"  {label} ‚Üí {emotion}")


In [None]:
# Samples per emotion, ordered by emotion name, plus each class's share of all rows.
print("\nüìä Class Distribution:")
class_counts = df['emotion_name'].value_counts().sort_index()
class_percentages = class_counts.div(len(df)).mul(100).round(2)

# Counts and percentages side by side for display.
distribution_df = pd.DataFrame({
    'Emotion': class_counts.index,
    'Count': class_counts.values,
    'Percentage': class_percentages.values,
})
display(distribution_df)

# Largest class divided by smallest class; a ratio above 3 is flagged as imbalanced.
max_ratio = class_counts.max() / class_counts.min()
print(f"\n‚öñÔ∏è Class imbalance ratio (max/min): {max_ratio:.2f}")
balance_message = (
    "‚ö†Ô∏è Warning: Significant class imbalance detected!"
    if max_ratio > 3
    else "‚úÖ Classes are relatively balanced"
)
print(balance_message)


In [None]:
# Visualize class distribution: pie chart (shares) next to a bar chart (counts).
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Pie chart
colors = sns.color_palette('husl', len(class_counts))
axes[0].pie(class_counts.values, labels=class_counts.index, autopct='%1.1f%%',
            startangle=140, colors=colors)
axes[0].set_title('Emotion Distribution (Pie Chart)', fontsize=14, fontweight='bold')

# Bar chart
sns.barplot(x=class_counts.index, y=class_counts.values, ax=axes[1], palette='husl')
axes[1].set_title('Emotion Distribution (Bar Chart)', fontsize=14, fontweight='bold')
axes[1].set_xlabel('Emotion', fontsize=12)
axes[1].set_ylabel('Count', fontsize=12)
axes[1].tick_params(axis='x', rotation=45)

# Add count labels on bars. The gap above each bar is 1% of the tallest bar
# rather than a fixed 100 counts, so the labels sit just above the bars
# whatever the size of the dataset.
label_offset = 0.01 * class_counts.max()
for i, v in enumerate(class_counts.values):
    axes[1].text(i, v + label_offset, str(v), ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()


## 5. Text Length Analysis


In [None]:
# Calculate per-text length and vocabulary features.
#
# Every text is tokenised once (whitespace split) and the token lists are reused
# for all word-based features. A missing text is tokenised as an empty list, so
# a NaN in `text` yields word_count 0 instead of raising AttributeError inside
# the per-row lambdas.
text_tokens = df['text'].fillna('').str.split()

df['text_length'] = df['text'].str.len()
df['word_count'] = text_tokens.str.len()
# Mean token length; undefined (NaN) for a text without any words.
df['avg_word_length'] = text_tokens.apply(
    lambda words: np.mean([len(word) for word in words]) if words else np.nan
)
df['unique_words'] = text_tokens.apply(lambda words: len(set(words)))
# Share of distinct tokens; kept as NaN where there are no words to divide by.
df['unique_word_ratio'] = (df['unique_words'] / df['word_count']).where(df['word_count'] > 0)

print("‚úÖ Text features calculated")


In [None]:
# Dataset-wide summary of the length features computed in the previous cell.
_char_lengths = df['text_length']
_word_counts = df['word_count']

print("\nüìè Overall Text Length Statistics:")
print(f"  Character count - Mean: {_char_lengths.mean():.2f}, Median: {_char_lengths.median():.2f}")
print(f"  Character count - Min: {_char_lengths.min()}, Max: {_char_lengths.max()}")
print(f"  Word count - Mean: {_word_counts.mean():.2f}, Median: {_word_counts.median():.2f}")
print(f"  Word count - Min: {_word_counts.min()}, Max: {_word_counts.max()}")
print(f"  Average word length - Mean: {df['avg_word_length'].mean():.2f}")
print(f"  Unique word ratio - Mean: {df['unique_word_ratio'].mean():.2f}")


In [None]:
# Mean / median / standard deviation of the length features within each emotion.
print("\nüìä Text Length Statistics by Emotion:")
length_feature_columns = ['text_length', 'word_count', 'avg_word_length']
summary_functions = ['mean', 'median', 'std']
length_stats = df.groupby('emotion_name')[length_feature_columns].agg(summary_functions)
display(length_stats.round(2))


In [None]:
# 2x2 figure: overall histograms on the top row, per-emotion distributions below.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
(ax_chars, ax_words), (ax_box, ax_violin) = axes

# Top row: one histogram per length feature, with a dashed red line at the mean.
histogram_specs = [
    (ax_chars, 'text_length', 'skyblue', 'Character Count Distribution', 'Number of Characters'),
    (ax_words, 'word_count', 'lightcoral', 'Word Count Distribution', 'Number of Words'),
]
for hist_ax, column, fill_color, title, x_label in histogram_specs:
    column_mean = df[column].mean()
    hist_ax.hist(df[column], bins=50, color=fill_color, edgecolor='black')
    hist_ax.set_title(title, fontsize=14, fontweight='bold')
    hist_ax.set_xlabel(x_label)
    hist_ax.set_ylabel('Frequency')
    hist_ax.axvline(column_mean, color='red', linestyle='--', label=f"Mean: {column_mean:.1f}")
    hist_ax.legend()

# Bottom row: rows sorted by emotion name so the categories appear alphabetically.
df_sorted = df.sort_values('emotion_name')

# Word count per emotion (box plot).
sns.boxplot(data=df_sorted, x='emotion_name', y='word_count', ax=ax_box, palette='Set2')
ax_box.set_title('Word Count Distribution by Emotion', fontsize=14, fontweight='bold')
ax_box.set_xlabel('Emotion')
ax_box.set_ylabel('Word Count')
ax_box.tick_params(axis='x', rotation=45)

# Character count per emotion (violin plot).
sns.violinplot(data=df_sorted, x='emotion_name', y='text_length', ax=ax_violin, palette='Set3')
ax_violin.set_title('Character Count Distribution by Emotion', fontsize=14, fontweight='bold')
ax_violin.set_xlabel('Emotion')
ax_violin.set_ylabel('Character Count')
ax_violin.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()


In [None]:
# Corpus-level vocabulary: join all texts, lower-case, split on whitespace.
# (The numbered markdown headers jump from 5 to 7 around this cell.)
all_words = ' '.join(df['text']).lower().split()
word_freq = Counter(all_words)

total_words = len(all_words)
unique_words = len(word_freq)  # number of distinct tokens == number of Counter keys
vocab_richness = unique_words / total_words

print("\nüìö Vocabulary Statistics:")
print(f"  Total words: {total_words:,}")
print(f"  Unique words: {unique_words:,}")
print(f"  Vocabulary richness: {vocab_richness:.4f}")

# The 20 most frequent tokens, kept for the plot in the next cell.
most_common = word_freq.most_common(20)

print("\nüî§ Top 20 Most Common Words:")
for i, entry in enumerate(most_common, start=1):
    word, count = entry
    print(f"  {i:2d}. {word:15s} - {count:6,} times")


In [None]:
# Left: horizontal bars for the top-20 words. Right: rank/frequency curve (log y).
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
ax_top_words, ax_rank_curve = axes

# Top-20 bar chart; the y-axis is inverted so the most frequent word is on top.
words, counts = zip(*most_common)
bar_positions = range(len(words))
ax_top_words.barh(bar_positions, counts, color='steelblue')
ax_top_words.set_yticks(bar_positions)
ax_top_words.set_yticklabels(words)
ax_top_words.invert_yaxis()
ax_top_words.set_title('Top 20 Most Common Words', fontsize=14, fontweight='bold')
ax_top_words.set_xlabel('Frequency')

# Frequencies sorted from most to least common; only the first 1000 ranks are drawn.
freq_values = sorted(word_freq.values(), reverse=True)
ax_rank_curve.plot(freq_values[:1000], color='darkgreen', linewidth=2)
ax_rank_curve.set_title('Word Frequency Distribution (Top 1000 words)', fontsize=14, fontweight='bold')
ax_rank_curve.set_xlabel('Rank')
ax_rank_curve.set_ylabel('Frequency')
ax_rank_curve.set_yscale('log')
ax_rank_curve.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()


In [None]:
# Vocabulary size by emotion: total tokens, distinct tokens and their ratio.
print("\nüìä Vocabulary Statistics by Emotion:")
vocab_by_emotion = {}

for emotion in emotion_mapping.values():
    emotion_texts = df[df['emotion_name'] == emotion]['text']
    emotion_words = ' '.join(emotion_texts).lower().split()
    n_total = len(emotion_words)
    n_unique = len(set(emotion_words))
    vocab_by_emotion[emotion] = {
        'total_words': n_total,
        'unique_words': n_unique,
        # An emotion without any texts has no tokens; report NaN instead of
        # raising ZeroDivisionError on 0 / 0.
        'vocab_richness': n_unique / n_total if n_total else np.nan
    }

# One row per emotion, formatted for display only (the stored values stay numeric).
vocab_df = pd.DataFrame(vocab_by_emotion).T
display(vocab_df.style.format({
    'total_words': '{:,.0f}',
    'unique_words': '{:,.0f}',
    'vocab_richness': '{:.4f}'
}))


## 7. Most Common Words by Emotion


In [None]:
# For every emotion, list its ten most frequent lower-cased tokens.
print("\nüéØ Top 10 Most Common Words by Emotion:\n")

for emotion in emotion_mapping.values():
    is_this_emotion = df['emotion_name'] == emotion
    emotion_texts = df.loc[is_this_emotion, 'text']
    emotion_words = ' '.join(emotion_texts).lower().split()
    emotion_word_freq = Counter(emotion_words)
    top_words = emotion_word_freq.most_common(10)

    print(f"\n{emotion}:")
    for i, entry in enumerate(top_words, start=1):
        word, count = entry
        print(f"  {i:2d}. {word:15s} - {count:5,} times")


In [None]:
# Visualize top words by emotion: one horizontal bar chart per emotion (2x3 grid).
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
axes = axes.flatten()

# One colour per panel, built once instead of on every loop iteration.
bar_colors = sns.color_palette('husl', 6)

for idx, emotion in enumerate(emotion_mapping.values()):
    emotion_texts = df[df['emotion_name'] == emotion]['text']
    emotion_words = ' '.join(emotion_texts).lower().split()
    emotion_word_freq = Counter(emotion_words)
    top_words = emotion_word_freq.most_common(10)

    axes[idx].set_title(f'{emotion}', fontsize=12, fontweight='bold')
    if not top_words:
        # No tokens for this emotion: zip(*[]) cannot be unpacked into two
        # names, so leave the titled panel empty instead of raising ValueError.
        continue

    words, counts = zip(*top_words)
    axes[idx].barh(range(len(words)), counts, color=bar_colors[idx])
    axes[idx].set_yticks(range(len(words)))
    axes[idx].set_yticklabels(words)
    axes[idx].invert_yaxis()  # most frequent word at the top
    axes[idx].set_xlabel('Frequency', fontsize=10)

plt.tight_layout()
# y > 1 places the figure title above the panels laid out by tight_layout().
plt.suptitle('Top 10 Words by Emotion', fontsize=16, fontweight='bold', y=1.02)
plt.show()


## 8. Word Clouds by Emotion


In [None]:
# One word cloud per emotion on a 2x3 grid, each drawn with its own colormap.
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
axes = axes.flatten()

colors = ['Blues', 'Greens', 'Reds', 'Oranges', 'Purples', 'YlOrBr']

# Settings shared by every cloud; only the colormap differs between emotions.
cloud_options = dict(width=800, height=400, background_color='white', max_words=100)

for idx, emotion in enumerate(emotion_mapping.values()):
    panel = axes[idx]
    emotion_texts = df[df['emotion_name'] == emotion]['text']
    combined_text = ' '.join(emotion_texts)

    wordcloud = WordCloud(colormap=colors[idx], **cloud_options).generate(combined_text)

    panel.imshow(wordcloud, interpolation='bilinear')
    panel.set_title(f'{emotion} Word Cloud', fontsize=14, fontweight='bold')
    panel.axis('off')

plt.tight_layout()
plt.show()


## 9. N-gram Analysis (Bigrams & Trigrams)


In [None]:
# Function to get n-grams
def get_ngrams(text_series, n=2, top_k=15):
    """Return the ``top_k`` most frequent n-grams found in ``text_series``.

    Each text is lower-cased and split on whitespace before its n-grams are
    taken, so an n-gram never spans two texts. The result is a list of
    ``(ngram_tuple, count)`` pairs ordered from most to least frequent.
    """
    ngram_freq = Counter()
    for text in text_series:
        # Feed this text's n-grams straight into the running tally.
        ngram_freq.update(ngrams(text.lower().split(), n))
    return ngram_freq.most_common(top_k)

# Corpus-wide top 15 bigrams; `bigrams` is reused by the plotting cell.
print("\nüî§ Top 15 Bigrams (2-word phrases):\n")
bigrams = get_ngrams(df['text'], n=2, top_k=15)
for rank, (gram, freq) in enumerate(bigrams, start=1):
    phrase = ' '.join(gram)
    print(f"  {rank:2d}. {phrase:30s} - {freq:5,} times")

# Corpus-wide top 15 trigrams; `trigrams` is reused by the plotting cell.
print("\nüî§ Top 15 Trigrams (3-word phrases):\n")
trigrams = get_ngrams(df['text'], n=3, top_k=15)
for rank, (gram, freq) in enumerate(trigrams, start=1):
    phrase = ' '.join(gram)
    print(f"  {rank:2d}. {phrase:40s} - {freq:5,} times")


In [None]:
# Side-by-side horizontal bar charts of the top bigrams and trigrams.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Turn each (ngram_tuple, count) pair into a printable label and its count.
bigram_labels = [' '.join(parts) for parts, _ in bigrams]
bigram_counts = [freq for _, freq in bigrams]
trigram_labels = [' '.join(parts) for parts, _ in trigrams]
trigram_counts = [freq for _, freq in trigrams]

# Both panels are drawn the same way; only data, colour and title differ.
ngram_panels = [
    (axes[0], bigram_labels, bigram_counts, 'teal', 'Top 15 Bigrams'),
    (axes[1], trigram_labels, trigram_counts, 'coral', 'Top 15 Trigrams'),
]
for panel, labels, freqs, bar_color, title in ngram_panels:
    rows = range(len(labels))
    panel.barh(rows, freqs, color=bar_color)
    panel.set_yticks(rows)
    panel.set_yticklabels(labels, fontsize=9)
    panel.invert_yaxis()  # most frequent phrase at the top
    panel.set_title(title, fontsize=14, fontweight='bold')
    panel.set_xlabel('Frequency')

plt.tight_layout()
plt.show()


In [None]:
# The five most frequent bigrams within each emotion.
print("\nüéØ Top 5 Bigrams by Emotion:\n")

for emotion in emotion_mapping.values():
    is_this_emotion = df['emotion_name'] == emotion
    emotion_texts = df.loc[is_this_emotion, 'text']
    emotion_bigrams = get_ngrams(emotion_texts, n=2, top_k=5)

    print(f"\n{emotion}:")
    for i, (bigram, count) in enumerate(emotion_bigrams, start=1):
        phrase = ' '.join(bigram)
        print(f"  {i}. {phrase:25s} - {count:4,} times")


## 10. Text Complexity & Diversity Metrics


In [None]:
# Per-emotion averages of the text features, rounded to two decimals.
print("\nüìä Text Complexity Metrics by Emotion:\n")

complexity_columns = [
    'text_length',
    'word_count',
    'avg_word_length',
    'unique_words',
    'unique_word_ratio',
]
complexity_stats = df.groupby('emotion_name')[complexity_columns].mean().round(2)

display(complexity_stats)


In [None]:
# 2x2 figure: three per-emotion views of the text features plus a correlation heatmap.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
ax_word_len, ax_ratio = axes[0]
ax_unique, ax_corr = axes[1]

# Mean word length per emotion, drawn without error bars.
sns.barplot(data=df, x='emotion_name', y='avg_word_length', ax=ax_word_len,
            palette='viridis', estimator=np.mean, ci=None)

# Spread of the unique-word ratio per emotion.
sns.boxplot(data=df, x='emotion_name', y='unique_word_ratio', ax=ax_ratio, palette='coolwarm')

# Spread of the number of unique words per emotion.
sns.violinplot(data=df, x='emotion_name', y='unique_words', ax=ax_unique, palette='Set2')

# The three categorical panels share the same title/axis-label treatment.
categorical_panels = [
    (ax_word_len, 'Average Word Length by Emotion', 'Average Word Length'),
    (ax_ratio, 'Unique Word Ratio Distribution by Emotion', 'Unique Word Ratio'),
    (ax_unique, 'Unique Words Distribution by Emotion', 'Number of Unique Words'),
]
for panel, title, y_label in categorical_panels:
    panel.set_title(title, fontsize=12, fontweight='bold')
    panel.set_xlabel('Emotion')
    panel.set_ylabel(y_label)
    panel.tick_params(axis='x', rotation=45)

# Pairwise correlations between the numeric features and the integer label.
numeric_cols = ['text_length', 'word_count', 'avg_word_length', 'unique_words', 'unique_word_ratio', 'label']
corr_matrix = df[numeric_cols].corr()
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', ax=ax_corr,
            cbar_kws={'label': 'Correlation'})
ax_corr.set_title('Feature Correlation Matrix', fontsize=12, fontweight='bold')

plt.tight_layout()
plt.show()


## 11. Sample Text Inspection


In [None]:
# Display sample texts from each emotion (up to five, reproducibly sampled).
print("\nüìù Sample Texts by Emotion (5 examples per emotion):\n")

for emotion in emotion_mapping.values():
    print(f"\n{'='*80}")
    print(f"{emotion.upper()}")
    print('='*80)

    emotion_texts = df[df['emotion_name'] == emotion]['text']
    # Cap the sample size at the number of available texts: sampling without
    # replacement raises ValueError when more rows are requested than exist.
    sample_size = min(5, len(emotion_texts))
    samples = emotion_texts.sample(n=sample_size, random_state=42)

    for i, text in enumerate(samples, 1):
        print(f"\n{i}. {text}")


In [None]:
# Show the three longest and the three shortest texts by character count.
print("\nüìè Text Length Extremes:\n")
extreme_columns = ['text', 'emotion_name', 'text_length', 'word_count']

# Longest texts
print("\nüîπ Top 3 Longest Texts:\n")
longest_texts = df.nlargest(3, 'text_length')[extreme_columns]
for _, row in longest_texts.iterrows():
    # A single print emits the three detail lines plus the trailing blank line.
    print(
        f"Emotion: {row['emotion_name']}\n"
        f"Length: {row['text_length']} chars, {row['word_count']} words\n"
        f"Text: {row['text']}\n"
    )

# Shortest texts
print("\nüîπ Top 3 Shortest Texts:\n")
shortest_texts = df.nsmallest(3, 'text_length')[extreme_columns]
for _, row in shortest_texts.iterrows():
    print(
        f"Emotion: {row['emotion_name']}\n"
        f"Length: {row['text_length']} chars, {row['word_count']} words\n"
        f"Text: {row['text']}\n"
    )


## 12. Stopwords Analysis


In [None]:
# English stopword list from NLTK, held as a set for fast membership tests.
stop_words = set(stopwords.words('english'))

def count_stopwords(text):
    """Return how many lower-cased, whitespace-separated tokens of ``text`` are stopwords."""
    return sum(token in stop_words for token in text.lower().split())

# Per-text stopword count and its share of the text's word count.
df['stopword_count'] = df['text'].apply(count_stopwords)
df['stopword_ratio'] = df['stopword_count'] / df['word_count']

mean_stopword_count = df['stopword_count'].mean()
mean_stopword_ratio = df['stopword_ratio'].mean()
print("\nüõë Stopword Statistics:")
print(f"  Average stopwords per text: {mean_stopword_count:.2f}")
print(f"  Average stopword ratio: {mean_stopword_ratio:.2%}")

# Same two averages, broken down by emotion.
print("\nüìä Stopword Statistics by Emotion:")
stopword_columns = ['stopword_count', 'stopword_ratio']
stopword_stats = df.groupby('emotion_name')[stopword_columns].mean().round(3)
display(stopword_stats)


In [None]:
# Left: histogram of stopwords per text. Right: stopword ratio per emotion.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
ax_hist, ax_ratio = axes

# Histogram with a dashed red line marking the mean stopword count.
stopword_mean = df['stopword_count'].mean()
ax_hist.hist(df['stopword_count'], bins=30, color='salmon', edgecolor='black')
ax_hist.set_title('Stopword Count Distribution', fontsize=14, fontweight='bold')
ax_hist.set_xlabel('Number of Stopwords')
ax_hist.set_ylabel('Frequency')
ax_hist.axvline(stopword_mean, color='red', linestyle='--',
                label=f"Mean: {stopword_mean:.1f}")
ax_hist.legend()

# Box plot of the per-text stopword ratio, grouped by emotion.
sns.boxplot(data=df, x='emotion_name', y='stopword_ratio', ax=ax_ratio, palette='pastel')
ax_ratio.set_title('Stopword Ratio by Emotion', fontsize=14, fontweight='bold')
ax_ratio.set_xlabel('Emotion')
ax_ratio.set_ylabel('Stopword Ratio')
ax_ratio.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()


## 13. Key Insights & Summary


In [None]:
# Generate comprehensive summary
# Prints a text report built from values computed in earlier cells:
# `duplicates`, `class_counts`, `max_ratio`, `total_words`, `unique_words`,
# `vocab_richness` and the feature columns added to `df`. Those cells must
# have been run first.
print("\n" + "="*80)
print("üìä EDA SUMMARY - KEY INSIGHTS")
print("="*80)

# Section 1: size of the dataset, missing values and duplicate rows.
print(f"\n1Ô∏è‚É£ DATASET OVERVIEW:")
print(f"   ‚Ä¢ Total samples: {len(df):,}")
print(f"   ‚Ä¢ Number of emotions: {df['label'].nunique()}")
print(f"   ‚Ä¢ Missing values: {df.isnull().sum().sum()}")
print(f"   ‚Ä¢ Duplicate rows: {duplicates:,} ({duplicates/len(df)*100:.2f}%)")

# Section 2: count and share per emotion, plus the max/min class ratio.
print(f"\n2Ô∏è‚É£ CLASS DISTRIBUTION:")
for emotion, count in class_counts.items():
    print(f"   ‚Ä¢ {emotion:10s}: {count:6,} ({count/len(df)*100:5.2f}%)")
print(f"   ‚Ä¢ Class imbalance ratio: {max_ratio:.2f}")

# Section 3: averages and ranges of the character/word length features.
print(f"\n3Ô∏è‚É£ TEXT STATISTICS:")
print(f"   ‚Ä¢ Average text length: {df['text_length'].mean():.1f} characters")
print(f"   ‚Ä¢ Average word count: {df['word_count'].mean():.1f} words")
print(f"   ‚Ä¢ Average word length: {df['avg_word_length'].mean():.2f} characters")
print(f"   ‚Ä¢ Text length range: {df['text_length'].min()} - {df['text_length'].max()} characters")
print(f"   ‚Ä¢ Word count range: {df['word_count'].min()} - {df['word_count'].max()} words")

# Section 4: corpus-level vocabulary figures.
print(f"\n4Ô∏è‚É£ VOCABULARY:")
print(f"   ‚Ä¢ Total words: {total_words:,}")
print(f"   ‚Ä¢ Unique words: {unique_words:,}")
print(f"   ‚Ä¢ Vocabulary richness: {vocab_richness:.4f}")
print(f"   ‚Ä¢ Average unique word ratio per text: {df['unique_word_ratio'].mean():.3f}")

# Section 5: stopword averages.
print(f"\n5Ô∏è‚É£ STOPWORDS:")
print(f"   ‚Ä¢ Average stopwords per text: {df['stopword_count'].mean():.2f}")
print(f"   ‚Ä¢ Average stopword ratio: {df['stopword_ratio'].mean():.2%}")

# Section 6: preprocessing advice. Only the class-imbalance line is conditional
# (same > 3 threshold as the class-distribution cell); the others always print.
# NOTE(review): the duplicates hint is printed even when `duplicates` is 0, and
# this section suggests embedding dimension 50-300 while section 7 says 100-300.
print(f"\n6Ô∏è‚É£ RECOMMENDATIONS FOR PREPROCESSING:")
print(f"   ‚úì Remove stopwords (they comprise ~{df['stopword_ratio'].mean():.0%} of text)")
print(f"   ‚úì Consider handling duplicates ({duplicates} found)")
print(f"   ‚úì Text length varies significantly - consider padding/truncation")
if max_ratio > 3:
    print(f"   ‚ö† Handle class imbalance (ratio: {max_ratio:.2f}) - consider class weights")
print(f"   ‚úì Vocabulary size ({unique_words:,}) suggests embeddings dimension 50-300")

# Section 7: modelling advice; the sequence length is the 95th percentile of
# the per-text word count, truncated to an integer.
print(f"\n7Ô∏è‚É£ MODELING RECOMMENDATIONS:")
print(f"   ‚Ä¢ Suggested max sequence length: {int(df['word_count'].quantile(0.95))} words (95th percentile)")
print(f"   ‚Ä¢ Embedding dimension: 100-300 (given vocabulary size)")
print(f"   ‚Ä¢ Consider using pre-trained embeddings (GloVe, Word2Vec)")
print(f"   ‚Ä¢ Use dropout and regularization (high vocabulary richness)")

print("\n" + "="*80)
print("‚úÖ EDA COMPLETED SUCCESSFULLY")
print("="*80)


In [None]:
# Persist `df`, including every feature column added above, for later reuse (optional).
features_csv_path = './data/train_with_features.csv'

print("\nüíæ Saving extended dataframe with calculated features...")
df.to_csv(features_csv_path, index=False)  # index=False: do not write the row index
print(f"‚úÖ Saved to '{features_csv_path}'")
