# In [None]:


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Exercise banner: a single print call, one argument per output line.
print(
    "--- Intermediate Matplotlib & Seaborn Exercise ---",
    "Complete each task by writing the requested plotting code.",
    "Focus on proper labeling, titles, and drawing insights.",
    "--------------------------------------------------",
    sep="\n",
)

# Load the dataset
# The 'tips' dataset is fetched through Seaborn's dataset loader.
# Columns: total bill, tip, gender of bill payer, smoker status, day, time, and size of party.
df_tips = sns.load_dataset('tips')

# Quick look at the data before plotting: sample rows, schema, summary statistics.
first_rows = df_tips.head()
print("Dataset loaded (first 5 rows):\n", first_rows)

print("\nDataset Info:")
df_tips.info()  # info() writes its report directly to stdout

summary_stats = df_tips.describe()
print("\nDescriptive Statistics:\n", summary_stats)

# --- Task 1: Single Variable Distributions (Histogram, Pie Chart) ---
print("\n--- Task 1: Single Variable Distributions ---")

# 1.1 Histogram: Distribution of 'total_bill'
# Create a histogram of the 'total_bill' column.
# Add a title, x-label, and y-label. Use `bins=20`.
# Insight Question: What is the typical range of total bills? Is the distribution symmetric or skewed?
plt.figure(figsize=(8, 5))
# Your code for Task 1.1 here:
bill_amounts = df_tips['total_bill']
# kde=True overlays a Kernel Density Estimate curve on top of the bars.
hist_ax = sns.histplot(bill_amounts, bins=20, kde=True)
# Style the Axes returned by seaborn directly instead of going through pyplot state.
hist_ax.set(
    title='Distribution of Total Bill Amounts',
    xlabel='Total Bill ($)',
    ylabel='Frequency',
)
hist_ax.grid(axis='y', linestyle='--', alpha=0.7)  # horizontal guide lines only
plt.show()
# Insight: The distribution of total bills is right-skewed, meaning most bills are on the lower side, with fewer very high bills. The typical range appears to be between $10 and $25.


# 1.2 Pie Chart: Proportion of 'smoker' status
# Create a pie chart showing the proportion of smokers vs. non-smokers.
# Ensure the percentages are displayed on the slices.
# Insight Question: Is there a significant difference in the number of smokers vs. non-smokers in the dataset?
pie_fig, pie_ax = plt.subplots(figsize=(7, 7))
# Your code for Task 1.2 here:
# One slice per smoker category, sized by how many rows fall into it.
smoker_counts = df_tips['smoker'].value_counts()
pie_ax.pie(
    smoker_counts,
    labels=smoker_counts.index,
    autopct='%1.1f%%',  # print each slice's share with one decimal place
    startangle=90,
    colors=sns.color_palette('pastel'),
)
pie_ax.set_title('Proportion of Smokers vs. Non-Smokers')
pie_ax.axis('equal')  # equal aspect ratio keeps the pie circular
plt.show()
# Insight: The dataset contains more non-smokers than smokers, with non-smokers making up a larger proportion of the observed population.


# --- Task 2: Relationships Between Two Variables (Scatter, Bar, Horizontal Bar) ---
print("\n--- Task 2: Relationships Between Two Variables ---")

# 2.1 Scatter Plot: 'total_bill' vs. 'tip'
# Create a scatter plot with 'total_bill' on the x-axis and 'tip' on the y-axis.
# Use 'smoker' as the `hue` to differentiate points by smoker status.
# Add a title and labels.
# Insight Question: Is there a relationship between total bill and tip amount? Does smoker status influence this relationship?
plt.figure(figsize=(9, 6))
# Your code for Task 2.1 here:
scatter_ax = sns.scatterplot(
    data=df_tips,
    x='total_bill',
    y='tip',
    hue='smoker',  # colour points by smoker status
    palette='deep',
    s=80,
    alpha=0.8,  # slight transparency so overlapping points stay visible
)
scatter_ax.set(
    title='Total Bill vs. Tip Amount by Smoker Status',
    xlabel='Total Bill ($)',
    ylabel='Tip Amount ($)',
)
scatter_ax.grid(True, linestyle='--', alpha=0.6)
scatter_ax.legend(title='Smoker')
plt.show()
# Insight: There appears to be a positive correlation: generally, higher total bills lead to higher tips. It's not immediately clear if smoker status drastically changes this trend, but visually, non-smokers seem to have a wider range of total bills and tips.


# 2.2 Bar Plot: Average 'total_bill' by 'day'
# Create a bar plot showing the average 'total_bill' for each 'day' of the week.
# Add error bars (which Seaborn's barplot does by default, showing confidence intervals).
# Add a title and labels.
# Insight Question: Which day has the highest average total bill? Are there noticeable differences between weekdays and weekends?
plt.figure(figsize=(8, 5))
# Your code for Task 2.2 here:
# Bar height is the mean total bill per day; the error bars show one standard
# deviation. `errorbar='sd'` replaces the `ci='sd'` spelling, which has been
# deprecated since seaborn 0.12 and emits a FutureWarning.
# NOTE(review): `errorbar` requires seaborn >= 0.12 - confirm the course environment.
sns.barplot(x='day', y='total_bill', data=df_tips, palette='viridis', errorbar='sd')
plt.title('Average Total Bill by Day of the Week')
plt.xlabel('Day of the Week')
plt.ylabel('Average Total Bill ($)')
plt.grid(axis='y', linestyle='--', alpha=0.7)  # horizontal guide lines only
plt.show()
# Insight: Sunday appears to have the highest average total bill, followed closely by Saturday. Weekdays (Thursday, Friday) have lower average total bills. This suggests more dining activity or larger groups on weekends.


# 2.3 Horizontal Bar Plot: Average 'tip' by 'time'
# Create a horizontal bar plot showing the average 'tip' for 'Lunch' vs. 'Dinner'.
# Add a title and labels.
# Insight Question: Do customers tip more during lunch or dinner?
plt.figure(figsize=(8, 5))
# Your code for Task 2.3 here:
# Horizontal bars: the numeric variable goes on x, the category on y.
# `errorbar=None` removes the error bars for simplicity; it replaces `ci=None`,
# which has been deprecated since seaborn 0.12 and emits a FutureWarning.
# NOTE(review): `errorbar` requires seaborn >= 0.12 - confirm the course environment.
sns.barplot(x='tip', y='time', data=df_tips, palette='plasma', orient='h', errorbar=None)
plt.title('Average Tip Amount by Time of Day')
plt.xlabel('Average Tip Amount ($)')
plt.ylabel('Time of Day')
plt.grid(axis='x', linestyle='--', alpha=0.7)  # vertical guide lines only
plt.show()
# Insight: On average, customers tend to leave slightly higher tips during dinner compared to lunch.


# --- Task 3: Multiple Plots and Subplots ---
print("\n--- Task 3: Multiple Plots and Subplots ---")

# 3.1 Create a figure with 2 rows and 2 columns of subplots.
#    - Top-left: Histogram of 'tip' amount.
#    - Top-right: Box plot of 'total_bill' by 'sex'.
#    - Bottom-left: Count plot of 'day'.
#    - Bottom-right: Swarm plot of 'tip' by 'smoker' status.
# Ensure each subplot has an appropriate title and labels. Use `plt.tight_layout()` to prevent overlap.
# Insight Question: What can you learn about tip distribution, total bill distribution across genders, daily customer patterns, and tip differences based on smoker status from these combined plots?
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('Comprehensive Analysis of Tips Dataset', fontsize=16)

# Your code for Task 3.1 here:

# Give each panel of the 2x2 grid a readable name instead of indexing repeatedly.
tip_ax, bill_ax = axes[0]
day_ax, smoker_ax = axes[1]

# Top-left: Histogram of 'tip'
sns.histplot(df_tips['tip'], bins=15, kde=True, ax=tip_ax)
tip_ax.set(
    title='Distribution of Tip Amounts',
    xlabel='Tip Amount ($)',
    ylabel='Frequency',
)

# Top-right: Box plot of 'total_bill' by 'sex'
sns.boxplot(data=df_tips, x='sex', y='total_bill', palette='coolwarm', ax=bill_ax)
bill_ax.set(
    title='Total Bill Distribution by Gender',
    xlabel='Gender',
    ylabel='Total Bill ($)',
)

# Bottom-left: Count plot of 'day'
sns.countplot(data=df_tips, x='day', palette='magma', ax=day_ax)
day_ax.set(
    title='Number of Customers by Day',
    xlabel='Day of the Week',
    ylabel='Count',
)

# Bottom-right: Swarm plot of 'tip' by 'smoker' status
# s controls the marker size
sns.swarmplot(data=df_tips, x='smoker', y='tip', palette='rocket', s=4, ax=smoker_ax)
smoker_ax.set(
    title='Tip Amount by Smoker Status',
    xlabel='Smoker Status',
    ylabel='Tip Amount ($)',
)

# Leave the top 5% of the figure free so the suptitle does not overlap the panels.
fig.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()

# Combined Insights:
# 1. Tip Distribution: Tips are also right-skewed, with most tips being smaller amounts ($2-$4).
# 2. Total Bill by Gender: While both genders have a similar median total bill, males seem to have a slightly wider spread and potentially more outliers on the higher end of total bills.
# 3. Daily Customer Patterns: Saturday is the busiest day, followed by Sunday and Thursday, with Friday being the least busy among the days recorded.
# 4. Tip by Smoker Status: Visually, it appears that non-smokers might give a slightly wider range of tips, including some higher tips, though the bulk of tips for both groups falls in a similar range. There's no immediately obvious huge difference in average tip based on smoker status from the swarm plot alone.


# Closing banner: a single print call, one argument per output line.
print(
    "\n--- Exercise Complete! ---",
    "You've practiced creating various plots and interpreting their insights.",
    "Remember that good EDA often involves iterative plotting and questioning.",
    sep="\n",
)
