## Getting to Know DataFrames

In [1]:
import pandas as pd
import numpy as np

## Step 1: Import dataset

In [2]:
# Read the tab-separated orders file into the working DataFrame.
df = pd.read_csv(
    "chipotle.tsv",
    delimiter="\t",
)

In [3]:
# Peek at the first five rows (five is head()'s default).
df.head()

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,$2.39
1,1,1,Izze,[Clementine],$3.39
2,1,1,Nantucket Nectar,[Apple],$3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98


## Step 2: Dataset Overview

In [4]:
# Dimensions of the frame as a (rows, columns) tuple.
df.shape

(4622, 5)

In [5]:
# Per-column dtype and non-null count, plus the frame's memory footprint.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4622 entries, 0 to 4621
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   order_id            4622 non-null   int64 
 1   quantity            4622 non-null   int64 
 2   item_name           4622 non-null   object
 3   choice_description  3376 non-null   object
 4   item_price          4622 non-null   object
dtypes: int64(2), object(3)
memory usage: 180.7+ KB


In [6]:
# Column labels as a plain Python list.
df.columns.tolist()

['order_id', 'quantity', 'item_name', 'choice_description', 'item_price']

In [7]:
# Row labels of the frame.
df.index

RangeIndex(start=0, stop=4622, step=1)

In [8]:
# Summary statistics; with no arguments only the numeric columns are covered.
df.describe()

Unnamed: 0,order_id,quantity
count,4622.0,4622.0
mean,927.254868,1.075725
std,528.890796,0.410186
min,1.0,1.0
25%,477.25,1.0
50%,926.0,1.0
75%,1393.0,1.0
max,1834.0,15.0


In [9]:
# include="all" also summarises the object columns (unique / top / freq);
# statistics that do not apply to a column are shown as NaN.
df.describe(include = "all")

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
count,4622.0,4622.0,4622,3376,4622
unique,,,50,1043,78
top,,,Chicken Bowl,[Diet Coke],$8.75
freq,,,726,134,730
mean,927.254868,1.075725,,,
std,528.890796,0.410186,,,
min,1.0,1.0,,,
25%,477.25,1.0,,,
50%,926.0,1.0,,,
75%,1393.0,1.0,,,


### loc & iloc

In [10]:
# Rows where more than five units were ordered OR the item is bottled water.
df.loc[df["quantity"].gt(5) | df["item_name"].eq("Bottled Water")]

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
34,17,1,Bottled Water,,$1.09
87,38,1,Bottled Water,,$1.09
318,138,1,Bottled Water,,$1.09
329,143,1,Bottled Water,,$1.50
376,163,1,Bottled Water,,$1.50
...,...,...,...,...,...
4568,1817,1,Bottled Water,,$1.50
4570,1817,1,Bottled Water,,$1.50
4582,1822,2,Bottled Water,,$3.00
4598,1826,1,Bottled Water,,$1.50


In [11]:
# Same row filter as before, keeping only the two columns of interest.
df.loc[
    df["quantity"].gt(5) | df["item_name"].eq("Bottled Water"),
    ["order_id", "item_name"],
]

Unnamed: 0,order_id,item_name
34,17,Bottled Water
87,38,Bottled Water
318,138,Bottled Water
329,143,Bottled Water
376,163,Bottled Water
...,...,...
4568,1817,Bottled Water
4570,1817,Bottled Water
4582,1822,Bottled Water
4598,1826,Bottled Water


In [12]:
# A single integer position returns that row as a Series.
# (The stray `df.iloc[[10]]` expression that preceded this line was removed:
# its value was discarded and it was the list form, not the scalar form
# this cell demonstrates.)
type(df.iloc[10])

pandas.core.series.Series

In [13]:
# A one-element list of positions keeps the result two-dimensional, so the
# row comes back as a DataFrame rather than a Series.
# (The duplicate bare `df.iloc[[10]]` expression was removed: only the last
# expression of a cell is displayed, so it built the frame and threw it away.)
type(df.iloc[[10]])

pandas.core.frame.DataFrame

In [14]:
# Positional slice: rows 3 and 4 (the end of the slice is exclusive) and
# every column except the last one.
df.iloc[3:5, :-1]

Unnamed: 0,order_id,quantity,item_name,choice_description
3,1,1,Chips and Tomatillo-Green Chili Salsa,
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans..."


### data manipulation

In [15]:
# dtype of the price column before any conversion.
df["item_price"].dtype

dtype('O')

In [16]:
# Selecting a single column yields a Series.
type(df["item_price"])

pandas.core.series.Series

### apply()

In [17]:
# Convert item_price from text such as "$2.39" to float.
#
# The vectorised .str accessor replaces the per-row lambda. The leading
# astype(str) keeps this cell safe to re-run: the previous version raised
# AttributeError on a second execution because the column already held
# floats, which have no .replace method.
df["item_price"] = (
    df["item_price"]
    .astype(str)
    .str.replace("$", "", regex=False)  # literal "$", not the regex end-anchor
    .str.strip()                        # drop any padding around the number
    .astype(float)
)
df.head()

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,2.39
1,1,1,Izze,[Clementine],3.39
2,1,1,Nantucket Nectar,[Apple],3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",16.98


### groupby()

In [18]:
# groupby on its own does not aggregate anything; it returns a
# DataFrameGroupBy object that still needs a reduction (sum, mean, ...).
df.groupby("item_name")

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000005E725744F0>

In [19]:
# Selecting one column from the grouped frame narrows it to a SeriesGroupBy,
# again without computing any aggregate yet.
df.groupby("item_name")["quantity"]

<pandas.core.groupby.generic.SeriesGroupBy object at 0x0000005E7257E1F0>

In [29]:
# Total units sold per item, best sellers first.
(
    df.groupby("item_name")["quantity"]
    .sum()
    .sort_values(ascending=False)
)


item_name
Chicken Bowl                             761
Chicken Burrito                          591
Chips and Guacamole                      506
Steak Burrito                            386
Canned Soft Drink                        351
Chips                                    230
Steak Bowl                               221
Bottled Water                            211
Chips and Fresh Tomato Salsa             130
Canned Soda                              126
Chicken Salad Bowl                       123
Chicken Soft Tacos                       120
Side of Chips                            110
Veggie Burrito                            97
Barbacoa Burrito                          91
Veggie Bowl                               87
Carnitas Bowl                             71
Barbacoa Bowl                             66
Carnitas Burrito                          60
Steak Soft Tacos                          56
6 Pack Soft Drink                         55
Chips and Tomatillo Red Chili Salsa       50


In [31]:
# Preview the frame together with the per-line revenue (quantity * item_price).
# The column is derived here with assign(), which returns a new frame and
# leaves `df` untouched, so this cell shows `total_price` on a clean
# top-to-bottom run instead of depending on the column having been added to
# `df` out of order.
df.assign(total_price=df["quantity"] * df["item_price"]).head()

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price,total_price
0,1,1,Chips and Fresh Tomato Salsa,,2.39,2.39
1,1,1,Izze,[Clementine],3.39,3.39
2,1,1,Nantucket Nectar,[Apple],3.39,3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,2.39,2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",16.98,33.96


### What's the revenue?

In [32]:
# Revenue per order line: unit count times the price value for that line.
df["total_price"] = df["item_price"].mul(df["quantity"])

In [34]:
# Overall revenue: add up every per-line total.
total = df.loc[:, "total_price"].sum()
print(total)

39237.02
