<a href="https://colab.research.google.com/github/Natthamon-Piy/Chipotle-Visualization/blob/main/Chipotle.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Data import

In [3]:
import pandas as pd
import plotly.express as px
import seaborn as sns

df = pd.read_csv('https://raw.githubusercontent.com/justmarkham/DAT8/master/data/chipotle.tsv', sep='\t')
df.head()

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,$2.39
1,1,1,Izze,[Clementine],$3.39
2,1,1,Nantucket Nectar,[Apple],$3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98


##Best selling menu

###dataframe for best selling menu

In [5]:
item_quantities = df.groupby('item_name')['quantity'].sum()
most_ordered_item = item_quantities.sort_values(ascending=False)
print("All items in descending order of quantity ordered:")
display(most_ordered_item)

All items in descending order of quantity ordered:


Unnamed: 0_level_0,quantity
item_name,Unnamed: 1_level_1
Chicken Bowl,761
Chicken Burrito,591
Chips and Guacamole,506
Steak Burrito,386
Canned Soft Drink,351
Chips,230
Steak Bowl,221
Bottled Water,211
Chips and Fresh Tomato Salsa,130
Canned Soda,126


##Top20 Quantity Ordered per Item'

In [9]:
item_quantities = df.groupby('item_name')['quantity'].sum()
top_20_items = item_quantities.sort_values(ascending=False).head(20)

fig = px.bar(top_20_items, x='quantity', y=top_20_items.index, orientation='h', title='Top 20 Quantity Ordered per Item')
fig.update_layout(yaxis={'categoryorder':'total ascending'})
fig.show()

##Box Plot showing range of best selling menu

In [8]:
# Get the top 5 most ordered items
item_quantities = df.groupby('item_name')['quantity'].sum()
top_5_items = item_quantities.sort_values(ascending=False).head(5).index

# Filter the dataframe to include only the top 5 items
df_top5 = df[df['item_name'].isin(top_5_items)]

# Convert item_price to numeric
df_top5['item_price'] = df_top5['item_price'].str.replace('$', '').astype(float)

# Create a box plot of item prices for the top 5 items
fig = px.box(df_top5, x='item_name', y='item_price', title='Distribution of Item Prices for Top 5 Most Ordered Items')
fig.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



# Salsa Type of Best selling Menu

##data prep

In [11]:
# Fill NaN values with empty strings and then remove brackets and split by comma
df['choice_description_split'] = df['choice_description'].fillna('').str.replace(r'[\[\]]', '', regex=True).str.split(',')

# Display the original and the new split column
display(df[['choice_description', 'choice_description_split']].head())

Unnamed: 0,choice_description,choice_description_split
0,,[]
1,[Clementine],[Clementine]
2,[Apple],[Apple]
3,,[]
4,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...","[Tomatillo-Red Chili Salsa (Hot), Black Beans..."


In [12]:
salsa_choices = []
for choices_list in df['choice_description_split']:
    for choice in choices_list:
        if 'Salsa' in choice:
            salsa_choices.append(choice.strip()) # .strip() to remove leading/trailing whitespace

# Display the unique salsa choices found
unique_salsa_choices = pd.Series(salsa_choices).unique()
print("Unique salsa choices found:")
display(unique_salsa_choices)

Unique salsa choices found:


array(['Tomatillo-Red Chili Salsa (Hot)', 'Fresh Tomato Salsa (Mild)',
       'Tomatillo Red Chili Salsa', 'Tomatillo Green Chili Salsa',
       'Fresh Tomato Salsa', 'Roasted Chili Corn Salsa',
       'Tomatillo-Green Chili Salsa (Medium)',
       'Roasted Chili Corn Salsa (Medium)', 'Salsa'], dtype=object)

In [14]:
target_items = ['Chicken Bowl', 'Steak Burrito', 'Chicken Burrito']
salsa_counts_by_item = {}

for item in target_items:
    # Filter rows for the current item
    item_df = df[df['item_name'] == item]

    # Extract salsa choices for the current item
    item_salsa_choices = []
    for choices_list in item_df['choice_description_split']:
        for choice in choices_list:
            if 'Salsa' in choice:
                item_salsa_choices.append(choice.strip())

    # Count the occurrences of each salsa type for the current item
    if item_salsa_choices:
        salsa_counts = pd.Series(item_salsa_choices).value_counts()
        salsa_counts_by_item[item] = salsa_counts # Get all salsa counts
    else:
        salsa_counts_by_item[item] = "No salsa recorded"

print("Salsa types and their counts for each item:")
for item, salsa in salsa_counts_by_item.items():
    print(f"- {item}:")
    display(salsa)

Salsa types and their counts for each item:
- Chicken Bowl:


Unnamed: 0,count
Fresh Tomato Salsa,324
Roasted Chili Corn Salsa,106
Fresh Tomato Salsa (Mild),83
Tomatillo-Red Chili Salsa (Hot),65
Tomatillo Green Chili Salsa,65
Roasted Chili Corn Salsa (Medium),65
Tomatillo Red Chili Salsa,61
Tomatillo-Green Chili Salsa (Medium),21


- Steak Burrito:


Unnamed: 0,count
Fresh Tomato Salsa,94
Fresh Tomato Salsa (Mild),60
Roasted Chili Corn Salsa (Medium),59
Roasted Chili Corn Salsa,53
Tomatillo-Red Chili Salsa (Hot),45
Tomatillo Red Chili Salsa,36
Tomatillo Green Chili Salsa,34
Tomatillo-Green Chili Salsa (Medium),17


- Chicken Burrito:


Unnamed: 0,count
Fresh Tomato Salsa,208
Tomatillo Red Chili Salsa,72
Roasted Chili Corn Salsa,67
Fresh Tomato Salsa (Mild),57
Tomatillo-Red Chili Salsa (Hot),47
Roasted Chili Corn Salsa (Medium),45
Tomatillo Green Chili Salsa,40
Tomatillo-Green Chili Salsa (Medium),34


## Bar Chart of Salsa Type by Percentage

In [16]:
target_items = ['Chicken Bowl', 'Steak Burrito', 'Chicken Burrito']
salsa_percentages_by_item = {}

for item in target_items:
    # Filter rows for the current item
    item_df = df[df['item_name'] == item]

    # Extract salsa choices for the current item
    item_salsa_choices = []
    for choices_list in item_df['choice_description_split']:
        for choice in choices_list:
            if 'Salsa' in choice:
                item_salsa_choices.append(choice.strip())

    # Calculate the percentage of each salsa type for the current item
    if item_salsa_choices:
        salsa_counts = pd.Series(item_salsa_choices).value_counts()
        salsa_percentages = salsa_counts / salsa_counts.sum() * 100
        salsa_percentages_by_item[item] = salsa_percentages.sort_values(ascending=False) # Sort by percentage descending
    else:
        salsa_percentages_by_item[item] = pd.Series(dtype=float) # Empty series if no salsa

# Prepare data for plotting
plot_data = []
for item, percentages in salsa_percentages_by_item.items():
    for salsa_type, percentage in percentages.items():
        plot_data.append({'item_name': item, 'salsa_type': salsa_type, 'percentage': percentage})

plot_df = pd.DataFrame(plot_data)

# Create a stacked bar chart with percentages shown
fig = px.bar(plot_df, x='percentage', y='item_name', color='salsa_type',
             title='Percentage of Salsa Types per Item',
             orientation='h',
             text='percentage') # Add text labels

fig.update_layout(yaxis={'categoryorder':'array', 'categoryarray': target_items[::-1]}) # Maintain the order of items
fig.update_traces(texttemplate='%{text:.1f}%', textposition='auto') # Format text to one decimal place and position automatically
fig.show()