# In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Start by importing the necessary libraries.

# Load the Starbucks drinks data from the local CSV file into `df`.
df = pd.read_csv('starbucks.csv')

# Preview the first 10 rows of the table.
first_ten_rows = df.head(10)
print(first_ten_rows)

print()

# Count how many rows (entries) the dataset holds.
number_of_entries = df.shape[0]
print(number_of_entries)

print()

# List the column labels as they were loaded from the file.
column_labels = df.columns
print(column_labels)

print()

# Show the row index to see how the DataFrame is indexed.
row_index = df.index
print(row_index)

print()
# Identify the beverage with the highest average sugar content.
# Column labels are stripped of surrounding whitespace first; the plain
# labels used from here on rely on that (presumably some CSV headers carry
# stray spaces -- verify against the file).
df.columns = df.columns.str.strip()

sugar_by_beverage = df.groupby('Beverage')['Sugars (g)']
avg_suger_per_drink = sugar_by_beverage.mean()

# Rank beverages from most to least sugar and keep only the top row.
ranked_by_sugar = avg_suger_per_drink.sort_values(ascending=False)
max_sugar_drink = ranked_by_sugar.iloc[:1]

print(max_sugar_drink)

print()

# Count how many distinct drink categories the dataset contains.
# Categories are stored in 'Beverage_category' (the column the rest of this
# script groups by for per-category results); 'Beverage' holds individual
# drink names, so counting it would report drinks rather than categories.
unique_name_count = df['Beverage_category'].nunique()
print(unique_name_count)

print()
# Show the five drink categories that occur most often in the dataset.
# value_counts() already orders categories from most to least frequent.
category_counts = df['Beverage_category'].value_counts()
top_categories = category_counts.iloc[:5]
print(top_categories)

print()

# Mean calories for each drink category, listed from lowest to highest.
calories_by_category = df.groupby('Beverage_category')['Calories']
avg_calories_per_category = calories_by_category.mean().sort_values()
print(avg_calories_per_category)

print()

# Add a 'caffeine_to_calories' column: the ratio of caffeine to calories
# for each drink.

# Coerce both inputs to numbers; entries that cannot be parsed become NaN.
df['Caffeine (mg)'] = pd.to_numeric(df['Caffeine (mg)'], errors='coerce')
df['Calories'] = pd.to_numeric(df['Calories'], errors='coerce')

# Blank out zero-calorie rows before dividing so they give NaN, not inf.
# Series.mask fills with NaN and keeps the column numeric; substituting
# pd.NA into a NumPy-backed numeric column turns it into object dtype, which
# the ratio would inherit and which can break later numeric aggregation and
# plotting.
nonzero_calories = df['Calories'].mask(df['Calories'] == 0)

new_column_caffeine_to_calories = df['Caffeine (mg)'] / nonzero_calories
df['caffeine_to_calories'] = new_column_caffeine_to_calories
print(new_column_caffeine_to_calories)

# Work out which drink category has the largest mean 'caffeine_to_calories'
# ratio. The per-category means are kept sorted in ascending order.
ratio_by_category = df.groupby('Beverage_category')['caffeine_to_calories']
avg_ratio_per_category = ratio_by_category.mean().sort_values()

highest_ratio_value = avg_ratio_per_category.max()
highest_ratio_category = avg_ratio_per_category.idxmax()

print(highest_ratio_category)
print(highest_ratio_value)

# Horizontal bar chart of the mean 'caffeine_to_calories' ratio per drink
# category, drawn with the pandas plotting wrapper around matplotlib's barh().
fig, ax = plt.subplots(figsize=(10, 6))
avg_ratio_per_category.plot.barh(ax=ax, color='teal')
ax.set_xlabel('Average Caffeine-to-Calories Ratio')
ax.set_ylabel('Beverage Category')
ax.set_title('Average Caffeine-to-Calories Ratio by Drink Category')
# Dashed vertical guide lines make the bar lengths easier to compare.
ax.grid(axis='x', linestyle='--', alpha=0.6)
fig.tight_layout()
plt.show()

# Formulate and answer a question about the dataset.
# For example, “What is the relationship between sugar and caffeine across different drink categories?”

# From a scatter plot of sugar versus caffeine by drink category (the plotting code is not shown here), we observe that:
# Brewed Coffee and Espresso Beverages tend to have high caffeine but low sugar — these are typically consumed without much added sweetener.
# Frappuccino® Blended Beverages and Shaken Iced Teas often have high sugar and low to moderate caffeine, aligning with their dessert-like nature.
# There's no strong linear correlation across all categories — some categories are high in both sugar and caffeine (like Hot Chocolate), while others are high in only one.