# Data Exploration

This notebook is used for exploring the dataset, visualizing data distributions, and understanding the sentiment labels.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Relative path to the processed dataset CSV
data_path = '../data/processed/dataset.csv'  # Update with the actual path

# Read the CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer=data_path)

# Preview the first five rows
df.head(n=5)

In [2]:
# Bar chart of the number of rows per value of the 'sentiment' column
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='sentiment', ax=ax)
ax.set_title('Distribution of Sentiment Labels')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Count')
plt.show()

In [3]:
# Visualize the pairwise correlation between features.
# Only numeric/bool columns are correlated: since pandas 2.0, DataFrame.corr()
# defaults to numeric_only=False and raises on string columns instead of
# silently dropping them. Selecting the columns explicitly works on both old
# and new pandas versions.
correlation_matrix = df.select_dtypes(include=['number', 'bool']).corr()

if correlation_matrix.empty:
    # No numeric columns: report it rather than passing an empty matrix to the heatmap.
    print('No numeric columns available to compute a correlation matrix.')
else:
    plt.figure(figsize=(12, 8))
    sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm')
    plt.title('Feature Correlation Matrix')
    plt.show()

In [4]:
# Count nulls per column, then display only the columns with at least one
missing_values = df.isna().sum()
missing_values.loc[missing_values.gt(0)]

In [5]:
# Descriptive statistics for the dataset, using describe()'s default settings
summary_stats = df.describe()
summary_stats