# Polars Tutorial - Part 4: Data Visualization

In this notebook, we'll explore visualization techniques with Polars:
- Integration with Matplotlib
- Creating plots with Seaborn
- Interactive visualizations with Plotly
- Best practices for visualization with Polars

In [None]:
import polars as pl
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import os

# Plot defaults shared by the Matplotlib/Seaborn figures in this notebook
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (10, 6)})

# Directory holding the tutorial datasets, relative to this notebook
DATA_DIR = '../data/'
sales_path = os.path.join(DATA_DIR, 'sales_data.csv')
employees_path = os.path.join(DATA_DIR, 'employees.json')

# Read both datasets into Polars DataFrames
df_sales = pl.read_csv(sales_path)
df_employees = pl.read_json(employees_path)

print("Data loaded successfully!")

## 1. Basic Matplotlib Visualizations

### 1.1 Converting to Pandas for Plotting

In [None]:
# Aggregate in Polars first: total revenue per category, largest first
sales_by_category = (
    df_sales
    .group_by('category')
    .agg(pl.sum('revenue').alias('total_revenue'))
    .sort('total_revenue', descending=True)
)

# Hand the small aggregated frame over to Pandas for plotting
sales_pd = sales_by_category.to_pandas()

# Draw the bars on an explicit Axes object
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(sales_pd['category'], sales_pd['total_revenue'], color='steelblue')
ax.set_xlabel('Category', fontsize=12)
ax.set_ylabel('Total Revenue ($)', fontsize=12)
ax.set_title('Total Revenue by Category', fontsize=14, fontweight='bold')
plt.xticks(rotation=45)  # acts on the current Axes, i.e. `ax`
plt.tight_layout()
plt.show()

print("Bar chart created!")

### 1.2 Line Plot

In [None]:
# Prepare time series data.
# The 'date' column is parsed in Polars *before* grouping, so grouping and
# sorting operate on real dates (chronological order) and no extra
# Pandas -> Polars -> Pandas round trip is needed after the conversion.
daily_revenue = (
    df_sales
    .with_columns(pl.col('date').str.strptime(pl.Date, format='%Y-%m-%d'))
    .group_by('date')
    .agg([pl.sum('revenue').alias('daily_revenue')])
    .sort('date')
)

# Convert to Pandas; the parsed dates carry over, so Matplotlib draws a time axis
daily_pd = daily_revenue.to_pandas()

# Create line plot
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(daily_pd['date'], daily_pd['daily_revenue'], marker='o', linewidth=2, markersize=6)
ax.set_xlabel('Date', fontsize=12)
ax.set_ylabel('Revenue ($)', fontsize=12)
ax.set_title('Daily Revenue Trend', fontsize=14, fontweight='bold')
ax.grid(True, alpha=0.3)
plt.xticks(rotation=45)  # acts on the current Axes, i.e. `ax`
plt.tight_layout()
plt.show()

print("Line chart created!")

### 1.3 Scatter Plot

In [None]:
# Price vs. quantity, drawn as one scatter series per category
fig, ax = plt.subplots(figsize=(10, 6))
scatter_data = df_sales.to_pandas()

# Categories missing from this mapping fall back to gray
colors = {'Electronics': 'blue', 'Furniture': 'green'}
for category in scatter_data['category'].unique():
    in_category = scatter_data['category'] == category
    subset = scatter_data[in_category]
    ax.scatter(
        subset['price'],
        subset['quantity'],
        label=category,
        alpha=0.6,
        s=100,
        color=colors.get(category, 'gray'),
    )

ax.set_xlabel('Price ($)', fontsize=12)
ax.set_ylabel('Quantity', fontsize=12)
ax.set_title('Price vs Quantity by Category', fontsize=14, fontweight='bold')
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

print("Scatter plot created!")

## 2. Seaborn Visualizations

### 2.1 Box Plot

In [None]:
# Box plot for revenue distribution by region
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(
    data=df_sales.to_pandas(),
    x='region',
    y='revenue',
    # seaborn >= 0.13 deprecates `palette` without `hue`: map the x variable
    # to hue and hide the (redundant) legend to keep one color per region.
    hue='region',
    palette='Set2',
    legend=False,
    ax=ax,
)
ax.set_xlabel('Region', fontsize=12)
ax.set_ylabel('Revenue ($)', fontsize=12)
ax.set_title('Revenue Distribution by Region', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

print("Box plot created!")

### 2.2 Heatmap

In [None]:
# Reshape to a category x region grid of summed revenue;
# combinations with no sales become 0 instead of null
pivot_data = df_sales.pivot(
    values='revenue',
    index='category',
    columns='region',
    aggregate_function='sum',
)
pivot_data = pivot_data.fill_null(0)

# Seaborn expects the row labels in the index, so move 'category' there
pivot_pd = pivot_data.to_pandas().set_index('category')

fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(
    pivot_pd,
    annot=True,
    fmt='.0f',
    cmap='YlOrRd',
    cbar_kws={'label': 'Revenue ($)'},
    ax=ax,
)
ax.set_title('Revenue Heatmap: Category vs Region', fontsize=14, fontweight='bold')
ax.set_xlabel('Region', fontsize=12)
ax.set_ylabel('Category', fontsize=12)
plt.tight_layout()
plt.show()

print("Heatmap created!")

### 2.3 Count Plot

In [None]:
# Count plot: number of transactions (rows) per category
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(
    data=df_sales.to_pandas(),
    x='category',
    # seaborn >= 0.13 deprecates `palette` without `hue`: map the x variable
    # to hue and hide the (redundant) legend to keep one color per category.
    hue='category',
    palette='viridis',
    legend=False,
    ax=ax,
)
ax.set_xlabel('Category', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
ax.set_title('Number of Transactions by Category', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

print("Count plot created!")

### 2.4 Violin Plot

In [None]:
# Violin plot for employee salaries by department
fig, ax = plt.subplots(figsize=(12, 6))
sns.violinplot(
    data=df_employees.to_pandas(),
    x='department',
    y='salary',
    # seaborn >= 0.13 deprecates `palette` without `hue`: map the x variable
    # to hue and hide the (redundant) legend to keep one color per department.
    hue='department',
    palette='muted',
    legend=False,
    ax=ax,
)
ax.set_xlabel('Department', fontsize=12)
ax.set_ylabel('Salary ($)', fontsize=12)
ax.set_title('Salary Distribution by Department', fontsize=14, fontweight='bold')
plt.xticks(rotation=45)  # acts on the current Axes, i.e. `ax`
plt.tight_layout()
plt.show()

print("Violin plot created!")

## 3. Interactive Plotly Visualizations

### 3.1 Interactive Bar Chart

In [None]:
# Prepare data: revenue total and transaction count per region, largest first
# NOTE(review): newer Polars releases deprecate `pl.count()` in favor of
# `pl.len()`; kept as-is so the cell still runs on older versions.
region_sales = df_sales.group_by('region').agg([
    pl.sum('revenue').alias('total_revenue'),
    pl.count().alias('num_transactions')
]).sort('total_revenue', descending=True)

# Create interactive bar chart
fig = px.bar(
    region_sales.to_pandas(),
    x='region',
    y='total_revenue',
    color='region',
    title='Total Revenue by Region (Interactive)',
    labels={
        'total_revenue': 'Total Revenue ($)',
        'region': 'Region',
        'num_transactions': 'Transactions',
    },
    text='total_revenue',
    # Surface the transaction count computed above in the hover tooltip
    hover_data=['num_transactions'],
)

# Print the revenue above each bar, formatted as dollars with two decimals
fig.update_traces(texttemplate='$%{text:.2f}', textposition='outside')
# The legend would only repeat the x-axis labels, so hide it
fig.update_layout(showlegend=False, height=500)
fig.show()

print("Interactive bar chart created!")

### 3.2 Interactive Scatter Plot

In [None]:
# Interactive scatter: color encodes category, marker size encodes quantity,
# and product/region are added to the hover tooltip
axis_labels = {'price': 'Price ($)', 'revenue': 'Revenue ($)'}
fig = px.scatter(
    data_frame=df_sales.to_pandas(),
    x='price',
    y='revenue',
    color='category',
    size='quantity',
    hover_data=['product', 'region'],
    title='Price vs Revenue (Interactive)',
    labels=axis_labels,
)

fig.update_layout(height=600)
fig.show()

print("Interactive scatter plot created!")

### 3.3 Interactive Line Chart with Multiple Series

In [None]:
# Revenue by date and category.
# Polars `group_by` does not guarantee the order of its output rows, so sort
# on both keys: sorting by 'date' alone leaves the category order within a
# day arbitrary, and Plotly assigns traces/colors by first appearance.
daily_category = df_sales.group_by(['date', 'category']).agg([
    pl.sum('revenue').alias('revenue')
]).sort(['date', 'category'])

# One line per category, with a marker at every data point
fig = px.line(
    daily_category.to_pandas(),
    x='date',
    y='revenue',
    color='category',
    title='Daily Revenue by Category (Interactive)',
    labels={'revenue': 'Revenue ($)', 'date': 'Date'},
    markers=True
)

# 'x unified' shows every category's value for the hovered date in one tooltip
fig.update_layout(hovermode='x unified', height=500)
fig.show()

print("Interactive line chart created!")

### 3.4 Interactive Pie Chart

In [None]:
# Share of total revenue contributed by each category
category_revenue = (
    df_sales
    .group_by('category')
    .agg(pl.sum('revenue').alias('total_revenue'))
)

fig = px.pie(
    data_frame=category_revenue.to_pandas(),
    names='category',
    values='total_revenue',
    title='Revenue Distribution by Category',
    hole=0.3,  # Creates a donut chart
)

# Write "percent + label" inside each slice
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.update_layout(height=500)
fig.show()

print("Interactive pie chart created!")

### 3.5 Interactive Histogram

In [None]:
# Revenue histogram, colored by category
revenue_labels = {'revenue': 'Revenue ($)'}
fig = px.histogram(
    data_frame=df_sales.to_pandas(),
    x='revenue',
    color='category',
    nbins=20,
    marginal='box',  # Adds a box plot on top
    title='Revenue Distribution Histogram',
    labels=revenue_labels,
)

fig.update_layout(height=500)
fig.show()

print("Interactive histogram created!")

## 4. Advanced Visualizations

### 4.1 Subplots with Matplotlib

In [None]:
# Create a figure with multiple subplots (2 x 2 grid)
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Plot 1: Revenue by Category
# Polars `group_by` does not guarantee row order, so sort explicitly to get
# the same bar order on every run (largest category first).
sales_by_cat = df_sales.group_by('category').agg([
    pl.sum('revenue').alias('total_revenue')
]).sort('total_revenue', descending=True).to_pandas()
axes[0, 0].bar(sales_by_cat['category'], sales_by_cat['total_revenue'], color='skyblue')
axes[0, 0].set_title('Revenue by Category', fontweight='bold')
axes[0, 0].set_ylabel('Revenue ($)')

# Plot 2: Revenue by Region
# Horizontal bars are drawn bottom-up, so an ascending sort puts the
# largest region at the top of the panel.
sales_by_reg = df_sales.group_by('region').agg([
    pl.sum('revenue').alias('total_revenue')
]).sort('total_revenue').to_pandas()
axes[0, 1].barh(sales_by_reg['region'], sales_by_reg['total_revenue'], color='lightcoral')
axes[0, 1].set_title('Revenue by Region', fontweight='bold')
axes[0, 1].set_xlabel('Revenue ($)')

# Plot 3: Price Distribution
axes[1, 0].hist(df_sales['price'].to_pandas(), bins=15, color='lightgreen', edgecolor='black')
axes[1, 0].set_title('Price Distribution', fontweight='bold')
axes[1, 0].set_xlabel('Price ($)')
axes[1, 0].set_ylabel('Frequency')

# Plot 4: Quantity Distribution
axes[1, 1].hist(df_sales['quantity'].to_pandas(), bins=10, color='plum', edgecolor='black')
axes[1, 1].set_title('Quantity Distribution', fontweight='bold')
axes[1, 1].set_xlabel('Quantity')
axes[1, 1].set_ylabel('Frequency')

plt.tight_layout()
plt.show()

print("Subplots created!")

### 4.2 Correlation Matrix

In [None]:
# Pairwise correlations between the numeric employee metrics
metric_columns = ['salary', 'years_experience', 'performance_score']
emp_numeric = df_employees.select(metric_columns).to_pandas()
correlation = emp_numeric.corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation,
    annot=True,
    cmap='coolwarm',
    center=0,  # center the diverging colormap on zero correlation
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
    ax=ax,
)
ax.set_title('Correlation Matrix - Employee Metrics', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

print("Correlation matrix created!")

## 5. Custom Visualization Functions

### 5.1 Creating Reusable Plot Functions

In [None]:
def plot_grouped_bar(df, group_col, value_col, title):
    """
    Plot the summed ``value_col`` of each ``group_col`` group as a bar chart.

    Args:
        df: Polars DataFrame containing both columns.
        group_col: Name of the column to group by (one bar per group).
        value_col: Name of the column that is summed within each group.
        title: Title drawn above the chart.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    total_col = f'total_{value_col}'

    # Aggregate in Polars, largest total first, then convert for plotting
    pd_data = (
        df.group_by(group_col)
        .agg(pl.sum(value_col).alias(total_col))
        .sort(total_col, descending=True)
        .to_pandas()
    )

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.bar(pd_data[group_col], pd_data[total_col], color='steelblue')
    ax.set_xlabel(group_col.capitalize(), fontsize=12)
    ax.set_ylabel(f'Total {value_col.capitalize()}', fontsize=12)
    ax.set_title(title, fontsize=14, fontweight='bold')
    plt.xticks(rotation=45)  # acts on the current Axes, i.e. `ax`
    plt.tight_layout()
    plt.show()

# Use the function: one chart per (group column, value column, title) triple
for group_col, value_col, chart_title in [
    ('region', 'revenue', 'Total Revenue by Region'),
    ('product', 'quantity', 'Total Quantity by Product'),
]:
    plot_grouped_bar(df_sales, group_col, value_col, chart_title)

## 6. Best Practices for Visualizing Polars Data

### Tips:
1. **Convert to Pandas when needed**: For libraries that don't support Polars directly
2. **Aggregate before plotting**: Reduce data size for faster plotting
3. **Use lazy evaluation**: For large datasets, filter/aggregate before collecting
4. **Choose the right chart type**:
   - Bar/Column: Comparing categories
   - Line: Time series or trends
   - Scatter: Relationships between variables
   - Pie/Donut: Composition
   - Heatmap: Matrix data or correlations
5. **Interactive plots for exploration**: Use Plotly for interactive dashboards

## 7. Summary

In this notebook, we explored:
- ✅ Creating visualizations with Matplotlib
- ✅ Advanced plots with Seaborn
- ✅ Interactive visualizations with Plotly
- ✅ Subplots and complex layouts
- ✅ Correlation analysis
- ✅ Custom visualization functions

### Key Takeaways:
1. Polars integrates seamlessly with visualization libraries through `.to_pandas()`
2. Aggregate data with Polars before plotting for better performance
3. Plotly provides excellent interactive visualizations
4. Choose the right visualization for your data type and message

**Next:** In the next notebook, we'll explore parallel processing capabilities!