<a href="https://colab.research.google.com/github/Natthamon-Piy/Chipotle-Visualization/blob/main/Chipotle.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data import

In [3]:
import pandas as pd
import plotly.express as px
import seaborn as sns

# Tab-separated Chipotle order lines from the justmarkham/DAT8 GitHub repository.
CHIPOTLE_TSV_URL = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/chipotle.tsv'

# One row per order line; preview the first rows as the cell output.
df = pd.read_csv(CHIPOTLE_TSV_URL, sep='\t')
df.head()

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,$2.39
1,1,1,Izze,[Clementine],$3.39
2,1,1,Nantucket Nectar,[Apple],$3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98


# Best-selling menu

### DataFrame for best-selling menu

In [5]:
# Total units ordered for each menu item (index: item_name, values: summed quantity).
item_quantities = df['quantity'].groupby(df['item_name']).sum()

# Same totals, ranked from most to least ordered.
most_ordered_item = item_quantities.sort_values(ascending=False)

print("All items in descending order of quantity ordered:")
display(most_ordered_item)

All items in descending order of quantity ordered:


Unnamed: 0_level_0,quantity
item_name,Unnamed: 1_level_1
Chicken Bowl,761
Chicken Burrito,591
Chips and Guacamole,506
Steak Burrito,386
Canned Soft Drink,351
Chips,230
Steak Bowl,221
Bottled Water,211
Chips and Fresh Tomato Salsa,130
Canned Soda,126


## Top 20 Quantity Ordered per Item

In [9]:
# Sum the ordered quantity per item, then keep the 20 largest totals.
item_quantities = df['quantity'].groupby(df['item_name']).sum()
top_20_items = item_quantities.sort_values(ascending=False).iloc[:20]

# Horizontal bars: total quantity on x, item name (the Series index) on y.
fig = px.bar(
    top_20_items,
    x='quantity',
    y=top_20_items.index,
    orientation='h',
    title='Top 20 Quantity Ordered per Item',
)
# Order the item axis by each bar's total rather than by input order.
fig.update_layout(yaxis=dict(categoryorder='total ascending'))
fig.show()

## Box plot showing the price range of the best-selling menu items

In [24]:
# Get the top 5 most ordered items (by total quantity)
item_quantities = df.groupby('item_name')['quantity'].sum()
top_5_items = item_quantities.sort_values(ascending=False).head(5).index

# Filter to the top 5 items; .copy() keeps the edits below off the raw frame,
# so `df` still holds the original "$x.xx" strings and this cell can be re-run.
df_top5 = df[df['item_name'].isin(top_5_items)].copy()

# Parse "$16.98" -> 16.98.
# regex=False treats "$" as a literal character on every pandas version
# (as a regular expression "$" is the end-of-string anchor, and the default
# value of `regex` differs between pandas releases).
line_prices = df_top5['item_price'].str.replace('$', '', regex=False).astype(float)

# item_price appears to be the price of the whole line (e.g. quantity 2 at $16.98),
# so divide by quantity when more than one unit was ordered to get a per-unit price.
# Assigning the column directly (instead of `.loc[:, col] = ...`) replaces it,
# so the result is a float column rather than an object column holding floats.
df_top5['item_price'] = line_prices.mask(
    df_top5['quantity'] > 1,
    line_prices / df_top5['quantity'],
)

# Display the dataframe with top 5 items
display(df_top5)

# Create a box plot of per-unit item prices for the top 5 items
fig = px.box(df_top5, x='item_name', y='item_price', title='Distribution of Item Prices per Item for Top 5 Most Ordered Items')
fig.show()

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price,choice_description_split
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",8.49,"[Tomatillo-Red Chili Salsa (Hot), Black Beans..."
5,3,1,Chicken Bowl,"[Fresh Tomato Salsa (Mild), [Rice, Cheese, Sou...",10.98,"[Fresh Tomato Salsa (Mild), Rice, Cheese, S..."
7,4,1,Steak Burrito,"[Tomatillo Red Chili Salsa, [Fajita Vegetables...",11.75,"[Tomatillo Red Chili Salsa, Fajita Vegetables..."
9,5,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Black Beans, Pinto...",9.25,"[Fresh Tomato Salsa, Rice, Black Beans, Pin..."
10,5,1,Chips and Guacamole,,4.45,[]
...,...,...,...,...,...,...
4609,1829,1,Canned Soft Drink,[Sprite],1.25,[Sprite]
4610,1830,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Sour Cream, Cheese...",11.75,"[Fresh Tomato Salsa, Rice, Sour Cream, Chee..."
4616,1832,1,Chips and Guacamole,,4.45,[]
4617,1833,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Black Beans, Sour ...",11.75,"[Fresh Tomato Salsa, Rice, Black Beans, Sou..."


# Salsa Type of Best-Selling Menu

## Data prep

## Bar Chart of Salsa Type by Percentage

In [42]:
# Function to extract salsa type without spice level
def extract_salsa_type_without_spice(choices_list):
    """Return the first salsa name in ``choices_list`` with its spice level removed.

    Args:
        choices_list: Iterable of choice strings for one order line, e.g.
            ``['Fresh Tomato Salsa (Mild)', 'Rice']``. Missing values
            (``None`` / NaN) and other non-iterables are accepted.

    Returns:
        The first element containing ``'Salsa'``, with the markers ``(Mild)``,
        ``(Hot)`` and ``(Medium)`` removed and surrounding whitespace stripped,
        or ``None`` when there is no such element or the input is not iterable.
    """
    # Missing descriptions arrive as None/NaN, which cannot be iterated.
    try:
        choices = iter(choices_list)
    except TypeError:
        return None

    for choice in choices:
        # Skip non-string entries instead of failing on the `in` test.
        if isinstance(choice, str) and 'Salsa' in choice:
            # Remove spice level indicators
            for spice_marker in ('(Mild)', '(Hot)', '(Medium)'):
                choice = choice.replace(spice_marker, '')
            # Only the first salsa is reported: one salsa type per item is assumed.
            return choice.strip()
    return None

# Build the per-row list of choices if no earlier cell has created it, so this
# cell also works after Restart & Run All.
# NOTE(review): no cell in this notebook creates 'choice_description_split'. The
# reconstruction below (drop the square brackets, split on commas, empty list for
# a missing description) is inferred from the displayed output -- confirm it
# matches the original data-prep step.
if 'choice_description_split' not in df.columns:
    df['choice_description_split'] = df['choice_description'].apply(
        lambda desc: [
            part.strip()
            for part in desc.replace('[', '').replace(']', '').split(',')
            if part.strip()
        ]
        if isinstance(desc, str)
        else []
    )

# Create the new 'SalsaType' column (None where a row has no salsa choice)
df['SalsaType'] = df['choice_description_split'].apply(extract_salsa_type_without_spice)

# Replace hyphens with spaces in specific salsa names so both spellings of the
# same salsa are counted together. regex=False: these are literal names.
df['SalsaType'] = (
    df['SalsaType']
    .str.replace('Tomatillo-Red Chili Salsa', 'Tomatillo Red Chili Salsa', regex=False)
    .str.replace('Tomatillo-Green Chili Salsa', 'Tomatillo Green Chili Salsa', regex=False)
)

# Filter data to include only items that contain a salsa type
# (the 'None' string check also covers frames where missing values were filled with 'None')
df_with_salsa = df[df['SalsaType'].notna() & (df['SalsaType'] != 'None')].copy()

# Get the total quantity for each item in the filtered data
item_quantities_with_salsa = df_with_salsa.groupby('item_name')['quantity'].sum()

# Get the top 5 most ordered items from the filtered data
top_5_items_with_salsa = item_quantities_with_salsa.sort_values(ascending=False).head(5).index

# Filter the data for the top 5 items that contain salsa
df_top5_with_salsa = df_with_salsa[df_with_salsa['item_name'].isin(top_5_items_with_salsa)].copy()


# For every top-5 item, compute each salsa type's share of that item's rows (in percent).
salsa_percentages_by_item_top5 = {}
for item_name in top_5_items_with_salsa:
    rows_for_item = df_top5_with_salsa.loc[df_top5_with_salsa['item_name'] == item_name]
    counts = rows_for_item['SalsaType'].value_counts()
    total = counts.sum()
    if total <= 0:
        # No salsa rows for this item: store an empty float Series as a placeholder.
        salsa_percentages_by_item_top5[item_name] = pd.Series(dtype=float)
        continue
    # Largest share first.
    salsa_percentages_by_item_top5[item_name] = (counts / total * 100).sort_values(ascending=False)


# Flatten the per-item Series into one record per (item, salsa type) for plotting,
# skipping empty salsa names.
plot_data_top5 = [
    {'item_name': item_name, 'salsa_type': salsa_name, 'percentage': share}
    for item_name, shares in salsa_percentages_by_item_top5.items()
    for salsa_name, share in shares.items()
    if salsa_name
]

plot_df_top5 = pd.DataFrame(plot_data_top5)

# Horizontal stacked bars: one bar per item, one coloured segment per salsa type,
# each segment labelled with its percentage.
fig = px.bar(
    plot_df_top5,
    x='percentage',
    y='item_name',
    color='salsa_type',
    orientation='h',
    text='percentage',
    title='Percentage of Salsa Types per Item for Top 5 Items with Salsa',
)

# Pin the item order on the y axis to the reversed top-5 ranking.
fig.update_layout(yaxis=dict(categoryorder='array', categoryarray=top_5_items_with_salsa[::-1]))
# Show labels with one decimal place and let plotly choose their placement.
fig.update_traces(texttemplate='%{text:.1f}%', textposition='auto')
fig.show()

In [33]:
# Preview the first five rows of df_with_salsa (the rows that have a salsa type).
# No filtering happens in this cell; it only displays the existing frame.
display(df_with_salsa.head(n=5))

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price,choice_description_split,SpiceLevel,SalsaType
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98,"[Tomatillo-Red Chili Salsa (Hot), Black Beans...",Hot,Tomatillo Red Chili Salsa
5,3,1,Chicken Bowl,"[Fresh Tomato Salsa (Mild), [Rice, Cheese, Sou...",$10.98,"[Fresh Tomato Salsa (Mild), Rice, Cheese, S...",Mild,Fresh Tomato Salsa
7,4,1,Steak Burrito,"[Tomatillo Red Chili Salsa, [Fajita Vegetables...",$11.75,"[Tomatillo Red Chili Salsa, Fajita Vegetables...",,Tomatillo Red Chili Salsa
8,4,1,Steak Soft Tacos,"[Tomatillo Green Chili Salsa, [Pinto Beans, Ch...",$9.25,"[Tomatillo Green Chili Salsa, Pinto Beans, C...",,Tomatillo Green Chili Salsa
9,5,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Black Beans, Pinto...",$9.25,"[Fresh Tomato Salsa, Rice, Black Beans, Pin...",,Fresh Tomato Salsa


In [41]:
# Prepare data for sunburst chart on a copy, so `df` itself is left unchanged
df_sunburst = df.copy()

# Derive the spice level if no earlier cell has created it, so this cell also
# works after Restart & Run All.
# NOTE(review): no cell in this notebook creates 'SpiceLevel'. Taking the first
# "(Mild)" / "(Medium)" / "(Hot)" marker found in choice_description is inferred
# from the displayed output -- confirm it matches the original data-prep step.
if 'SpiceLevel' not in df_sunburst.columns:
    df_sunburst['SpiceLevel'] = df_sunburst['choice_description'].str.extract(
        r'\((Mild|Medium|Hot)\)', expand=False
    )

# Label missing values with the string 'None' so they can be filtered or drawn explicitly
df_sunburst['SalsaType'] = df_sunburst['SalsaType'].fillna('None')
df_sunburst['SpiceLevel'] = df_sunburst['SpiceLevel'].fillna('None')

# Filter to include only items that contain a salsa type
df_sunburst_filtered = df_sunburst[df_sunburst['SalsaType'] != 'None'].copy()

# Get the total quantity for each item in the filtered data
item_quantities_filtered = df_sunburst_filtered.groupby('item_name')['quantity'].sum()

# Get the top 5 most ordered items from the filtered data
top_5_items_filtered = item_quantities_filtered.sort_values(ascending=False).head(5).index

# Filter the sunburst data to include only the top 5 items
df_sunburst_top5 = df_sunburst_filtered[df_sunburst_filtered['item_name'].isin(top_5_items_filtered)].copy()


# Create the sunburst chart: item -> salsa type -> spice level, sized by quantity
fig = px.sunburst(df_sunburst_top5, path=['item_name', 'SalsaType', 'SpiceLevel'], values='quantity',
                  title='Sunburst Chart of Top 5 Item Name, Salsa Type, and Spice Level (Items with Salsa Only)')
fig.show()