# Python Lists

## Creating

In [5]:
# this is a python list
a = [2,4,7,10,17,25]

In [6]:
a

[2, 4, 7, 10, 17, 25]

In [7]:
# this is a list comprehension -- think of it as a sexy for loop

# the following gives us a list in which we multiplied each element in a by 2
z = [i * 2 for i in a]
z

[4, 8, 14, 20, 34, 50]

## Indexing

In [8]:
# you can index into it
a[0]

2

In [9]:
# what's the 3rd element?
a[-1]

25

In [10]:
# indices can also be negative
# this gives you the last element
a[-1]

25

## Slicing

In [11]:
a

[2, 4, 7, 10, 17, 25]

In [12]:
# you can also get subsets of the list with slicing
#     a[start:end]
# [start, end)

# this returns the 3rd and 4th entries (indices 2 and 3 -- note we exclude 4!)
a[2:4]

[7, 10]

In [13]:
# if you leave one side blank, it automatically goes all the way
# first five:
a[:4]

[2, 4, 7, 10]

In [14]:
# how do you get the last three elements?
a[-3:]

[10, 17, 25]

In [15]:
# slices can also skip numbers
# a[start:end:interval]

# this gives us every other number, starting with the first
a[::2]

[2, 7, 17]

In [16]:
# the interval can also be negative
# what does that do?

a[::-2]

[25, 10, 4]

# Numpy

In [17]:
import numpy as np

In [18]:
hello = np.arange(12)
hello.reshape(3,4)

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

## Creating

In [19]:
# numpy arrays can be created from a python list
b = np.array(a)
b

array([ 2,  4,  7, 10, 17, 25])

Right now, it looks an awful like a python list, but there are some key points you should know.

numpy arrays are:
- homogeneous (all elements in an array have the same type)
- multidimensional

In [20]:
# Homogeneous: all numpy arrays have an associated data type.
# numbers are usually ints or floats
b.dtype

dtype('int64')

In [21]:
# Multidimensional: numpy arrays can have multiple dimensions, like a nested list.
# We can reshape b into a 3x2 matrix
# Note: this doesn't change b. That's why we assign it to a new variable: m
m = b.reshape(3, 2)
m

array([[ 2,  4],
       [ 7, 10],
       [17, 25]])

In [22]:
# Each dimension is called an axis
# The size across each axis is called the shape
# These are two very important concepts!
m.shape

(3, 2)

## Indexing

In [23]:
# We index into numpy arrays much the same way as python lists.
b[0]
m

array([[ 2,  4],
       [ 7, 10],
       [17, 25]])

In [24]:
# But N-dimensional arrays mean we can be more expressive with indexing
# This gives us [0th index of axis 0, 1st index of axis 1]
# You can think of this as a grid
# Alternatively, this is like m[0][1]
m[0:2,0]

array([2, 7])

In [25]:
# We can also pass in multiple indices as a list
# This gives us the 1st, 2nd, and 5th values of b
b[[0, 1, 2]]

array([2, 4, 7])

In [26]:
m

array([[ 2,  4],
       [ 7, 10],
       [17, 25]])

In [27]:
m[1:3,0]

array([ 7, 17])

In [28]:
# Let's combine these two facts to get the 2nd and 3rd items in the second column of m
m[1:3,1]

array([10, 25])

In [29]:
# We can also incorporate our previous knowledge of slices.
# So to get the second column
# This gives us the entire range on axis 0, and only the 1st index on axis 1
m[:,1]

array([ 4, 10, 25])

## Math

In [30]:
# numpy gives us a lot of math functions to work with
# I'll only show you a couple, but you can find them all in the documentation

np.sum(b)  # guess what this does?

65

In [31]:
np.mean(b)  # and this?

10.833333333333334

In [32]:
# for convenience, you can also call
b.mean()

10.833333333333334

In [33]:
m

array([[ 2,  4],
       [ 7, 10],
       [17, 25]])

In [34]:
# you can also apply these functions to only one axis
# only sum across rows (read: apply the sum to axis 1)
np.mean(m, axis=1)

array([ 3. ,  8.5, 21. ])

In [35]:
# numpy has a concept called podcasting
# It tries to coerce non-matching shapes.
# 2 is a scalar, but we can still multiply m by it
# it just repeats the 2 across all instances of m
m * 2

array([[ 4,  8],
       [14, 20],
       [34, 50]])

In [36]:
Q = np.arange(10)
Q *2

array([ 0,  2,  4,  6,  8, 10, 12, 14, 16, 18])

In [37]:
mu,sigma = (10,2)

# Pandas

In [38]:
import pandas as pd

## Creating

Pandas lets us read all sorts of data into a Dataframe. Think of this as a series of lists. Let's look at an example.

In [39]:
df = pd.read_csv("cereal.csv")
list(df.columns)
#df.columns

['name',
 'mfr',
 'type',
 'calories',
 'protein',
 'fat',
 'sodium',
 'fiber',
 'carbo',
 'sugars',
 'potass',
 'vitamins',
 'shelf',
 'weight',
 'cups',
 'rating']

In [40]:
# head() gives us the first 10 rows in the dataframe (pd.DataFrame)
df.head()

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating
0,100% Bran,N,C,70,4,1,130,10.0,5.0,6,280,25,3,1.0,0.33,68.402973
1,100% Natural Bran,Q,C,120,3,5,15,2.0,8.0,8,135,0,3,1.0,1.0,33.983679
2,All-Bran,K,C,70,4,1,260,9.0,7.0,5,320,25,3,1.0,0.33,59.425505
3,All-Bran with Extra Fiber,K,C,50,4,0,140,14.0,8.0,0,330,25,3,1.0,0.5,93.704912
4,Almond Delight,R,C,110,2,2,200,1.0,14.0,8,-1,25,3,1.0,0.75,34.384843


In [41]:
# you can think of each column as a list (or a 1D numpy array)
# in practice, these are called pandas Series (pd.Series)
# you can index into the dataframe with a string to get one column
df["name"]

0                     100% Bran
1             100% Natural Bran
2                      All-Bran
3     All-Bran with Extra Fiber
4                Almond Delight
                ...            
72                      Triples
73                         Trix
74                   Wheat Chex
75                     Wheaties
76          Wheaties Honey Gold
Name: name, Length: 77, dtype: object

In [42]:
type(df["name"])

pandas.core.series.Series

## Pandas Series vs Numpy Arrays

In [43]:
# There are many similarities between pd.Series and np.ndarray
# for example:
df["carbo"].mean()

14.597402597402597

In [44]:
# In fact, we can turn pd.Series into a numpy array
# again, this returns a numpy array -- df["carbo"] doesn't change.
df["carbo"].to_numpy()

array([ 5. ,  8. ,  7. ,  8. , 14. , 10.5, 11. , 18. , 15. , 13. , 12. ,
       17. , 13. , 13. , 12. , 22. , 21. , 13. , 12. , 10. , 21. , 21. ,
       11. , 18. , 11. , 14. , 14. , 12. , 14. , 13. , 11. , 15. , 15. ,
       17. , 13. , 12. , 11.5, 14. , 17. , 20. , 21. , 12. , 12. , 16. ,
       16. , 16. , 17. , 15. , 15. , 21. , 18. , 13.5, 11. , 20. , 13. ,
       10. , 14. , -1. , 14. , 10.5, 15. , 23. , 22. , 16. , 19. , 20. ,
        9. , 16. , 15. , 21. , 15. , 16. , 21. , 13. , 17. , 17. , 16. ])

In [45]:
# The key difference is that Series are indexed
# See the 0, 1, ... 76 on the left? That is the index of each item.
# Right now they are just positions, but theoretically they can be any unique identifier for the row
# Think: ID, username, etc
df["carbo"].index

RangeIndex(start=0, stop=77, step=1)

## Indexing into DataFrames and Series

In [46]:
df.loc[:,"shelf":"rating"]

Unnamed: 0,shelf,weight,cups,rating
0,3,1.0,0.33,68.402973
1,3,1.0,1.00,33.983679
2,3,1.0,0.33,59.425505
3,3,1.0,0.50,93.704912
4,3,1.0,0.75,34.384843
...,...,...,...,...
72,3,1.0,0.75,39.106174
73,2,1.0,1.00,27.753301
74,1,1.0,0.67,49.787445
75,1,1.0,1.00,51.592193


In [47]:
# get the row
df.iloc[:2]

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating
0,100% Bran,N,C,70,4,1,130,10.0,5.0,6,280,25,3,1.0,0.33,68.402973
1,100% Natural Bran,Q,C,120,3,5,15,2.0,8.0,8,135,0,3,1.0,1.0,33.983679


In [48]:
# get the row
df.iloc[-5:]

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating
72,Triples,G,C,110,2,1,250,0.0,21.0,3,60,25,3,1.0,0.75,39.106174
73,Trix,G,C,110,1,1,140,0.0,13.0,12,25,25,2,1.0,1.0,27.753301
74,Wheat Chex,R,C,100,3,1,230,3.0,17.0,3,115,25,1,1.0,0.67,49.787445
75,Wheaties,G,C,100,3,1,200,3.0,17.0,3,110,25,1,1.0,1.0,51.592193
76,Wheaties Honey Gold,G,C,110,2,1,200,1.0,16.0,8,60,25,1,1.0,0.75,36.187559


In [49]:
# caveat: remember that pandas doesn't require zero-indexing. indices can be anything.
# this means slicing might not work all the time (what would df.loc["asdf":"hjkl"] even mean?)
# in the cases that you actually want to index by row number, you can always do that with .iloc[]
# again, this will behave the same as .loc[] with our dataset because our data is 0-indexed
df.loc[[1,2,3]]

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating
1,100% Natural Bran,Q,C,120,3,5,15,2.0,8.0,8,135,0,3,1.0,1.0,33.983679
2,All-Bran,K,C,70,4,1,260,9.0,7.0,5,320,25,3,1.0,0.33,59.425505
3,All-Bran with Extra Fiber,K,C,50,4,0,140,14.0,8.0,0,330,25,3,1.0,0.5,93.704912


In [50]:
# This is powerful because we can also make comparisons with Series and values.
df["protein"] > 3

0      True
1     False
2      True
3      True
4     False
      ...  
72    False
73    False
74    False
75    False
76    False
Name: protein, Length: 77, dtype: bool

In [51]:
# Combining these two things, we have a very expressive way of filtering.
# This gives us all the rows in which the protein is greater than 3.
df[df["protein"] > 3]

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating
0,100% Bran,N,C,70,4,1,130,10.0,5.0,6,280,25,3,1.0,0.33,68.402973
2,All-Bran,K,C,70,4,1,260,9.0,7.0,5,320,25,3,1.0,0.33,59.425505
3,All-Bran with Extra Fiber,K,C,50,4,0,140,14.0,8.0,0,330,25,3,1.0,0.5,93.704912
11,Cheerios,G,C,110,6,2,290,2.0,17.0,1,105,25,1,1.0,1.25,50.764999
41,Life,Q,C,100,4,2,150,2.0,12.0,6,95,25,2,1.0,0.67,45.328074
43,Maypo,A,H,100,4,1,0,0.0,16.0,3,95,25,2,1.0,1.0,54.850917
44,Muesli Raisins; Dates; & Almonds,R,C,150,4,3,95,3.0,16.0,11,170,25,3,1.0,1.0,37.136863
45,Muesli Raisins; Peaches; & Pecans,R,C,150,4,3,150,3.0,16.0,11,170,25,3,1.0,1.0,34.139765
56,Quaker Oat Squares,Q,C,100,4,1,135,2.0,14.0,6,110,25,3,1.0,0.5,49.511874
57,Quaker Oatmeal,Q,H,100,5,2,0,2.7,-1.0,-1,110,0,1,1.0,0.67,50.828392


## Manipulating Series

Often when we're preprocessing data, we want to make uniform changes to a specific column. We can do this by applying functions.

In [52]:
# Suppose we want to make the cereals more appetizing.
# Let's add "Delicious " to the beginning of every name.

# The pattern is we define a function for a single entry
def make_delicious(name):
    return "Delicious " + name

# and then call apply on the series to apply the function to each element in the series
df["name"].apply(make_delicious)

0                     Delicious 100% Bran
1             Delicious 100% Natural Bran
2                      Delicious All-Bran
3     Delicious All-Bran with Extra Fiber
4                Delicious Almond Delight
                     ...                 
72                      Delicious Triples
73                         Delicious Trix
74                   Delicious Wheat Chex
75                     Delicious Wheaties
76          Delicious Wheaties Honey Gold
Name: name, Length: 77, dtype: object

In [53]:
# this returns the changes, but doesn't apply them in place.
# that means on our original dataframe, the cereals are still bland
df.head()

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating
0,100% Bran,N,C,70,4,1,130,10.0,5.0,6,280,25,3,1.0,0.33,68.402973
1,100% Natural Bran,Q,C,120,3,5,15,2.0,8.0,8,135,0,3,1.0,1.0,33.983679
2,All-Bran,K,C,70,4,1,260,9.0,7.0,5,320,25,3,1.0,0.33,59.425505
3,All-Bran with Extra Fiber,K,C,50,4,0,140,14.0,8.0,0,330,25,3,1.0,0.5,93.704912
4,Almond Delight,R,C,110,2,2,200,1.0,14.0,8,-1,25,3,1.0,0.75,34.384843


In [54]:
# we can fix this by assigning the new names to the column.
df["name"] = df["name"].apply(make_delicious)
df.head()

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating
0,Delicious 100% Bran,N,C,70,4,1,130,10.0,5.0,6,280,25,3,1.0,0.33,68.402973
1,Delicious 100% Natural Bran,Q,C,120,3,5,15,2.0,8.0,8,135,0,3,1.0,1.0,33.983679
2,Delicious All-Bran,K,C,70,4,1,260,9.0,7.0,5,320,25,3,1.0,0.33,59.425505
3,Delicious All-Bran with Extra Fiber,K,C,50,4,0,140,14.0,8.0,0,330,25,3,1.0,0.5,93.704912
4,Delicious Almond Delight,R,C,110,2,2,200,1.0,14.0,8,-1,25,3,1.0,0.75,34.384843


In [55]:
# here's another example.
# Jackson is a skeptic and doesn't believe calling things "Delicious" makes them taste better.
# But he does think adding sugar will make them taste better.
# How can we add 10 grams of sugar to every cereal?
df["sugars"] = df["sugars"] +10
df.head()

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating
0,Delicious 100% Bran,N,C,70,4,1,130,10.0,5.0,16,280,25,3,1.0,0.33,68.402973
1,Delicious 100% Natural Bran,Q,C,120,3,5,15,2.0,8.0,18,135,0,3,1.0,1.0,33.983679
2,Delicious All-Bran,K,C,70,4,1,260,9.0,7.0,15,320,25,3,1.0,0.33,59.425505
3,Delicious All-Bran with Extra Fiber,K,C,50,4,0,140,14.0,8.0,10,330,25,3,1.0,0.5,93.704912
4,Delicious Almond Delight,R,C,110,2,2,200,1.0,14.0,18,-1,25,3,1.0,0.75,34.384843


## Groups and Aggregates

When we have lots and lots of data, it's more useful to look at aggregate statistics like the mean or median. But sometimes we lose too much detail aggregating across the whole dataset.

The solution is to aggregate across groups. For example, maybe we're less interested in the mean calorie count of all cereals and more interested in the mean for each manufacturer.

In [56]:
# First, we can see how many (and which) unique manufacturers there are
# Note: this gives us a numpy array
df["mfr"].unique()

array(['N', 'Q', 'K', 'R', 'G', 'P', 'A'], dtype=object)

In [57]:
# Now let's group by the manufacturers
# This gives us a groupby object across the dataframe
mfrs = df.groupby("mfr")["calories"].mean()
mfrs

mfr
A    100.000000
G    111.363636
K    108.695652
N     86.666667
P    108.888889
Q     95.000000
R    115.000000
Name: calories, dtype: float64

In [58]:
# what happens if we try to access the calories column?
mfrs["calories"]

KeyError: 'calories'

In [59]:
# now let's try to get the mean
mfrs["calories"].mean()

KeyError: 'calories'

In [60]:
# we can also aggregate across multiple columns, and even use different aggregations
# let's get the average calorie count but the maximum protein
mfrs[["calories", "protein"]].agg({"calories": "mean", "protein": "max"})

KeyError: "None of [Index(['calories', 'protein'], dtype='object', name='mfr')] are in the [index]"

# Exercises

Unless otherwise noted, these should be one line of code.

In [61]:
# here is a Python list:

a = [1, 2, 3, 4, 5, 6]

# get a list containing the last 3 elements of a

#a[-3:]

# reverse the list
#a[::-1]

# get a list where each entry in a is cubed (so the new list is [1, 4, 9, 16, 25, 36])
z = [i*i for i in a]
z

[1, 4, 9, 16, 25, 36]

In [62]:
# create a numpy array from this list
b = np.array(a)
b

array([1, 2, 3, 4, 5, 6])

In [63]:
# find the mean of b
b.mean()

3.5

In [64]:
# change b from a length-6 list to a 2x3 matrix
temp = b.reshape(2,3)

In [65]:
# find the mean value of each row
np.mean(temp,axis = 1)

array([2., 5.])

In [66]:
# find the mean value of each column
np.mean(temp, axis = 0)

array([2.5, 3.5, 4.5])

In [67]:
# find the third column of b
temp[0:2,2]

array([3, 6])

In [68]:
# get a list where each entry in b is cubed (so the new numpy array is [1, 4, 9, 16, 25, 36])
# use a different (numpy-specific) approach


In [105]:
# load in the "starbucks.csv" dataset
df_temp = pd.read_csv("./starbucks.csv")

In [70]:
# this is nutritional info for starbucks items
# let's see if we can answer some questions
# what is the average # calories across all items?
df_temp["Calories"].mean()

193.87190082644628

In [140]:
# how many different categories of beverages are there?
df_temp["Beverage"].unique() 

array(['Brewed Coffee', 'Caffè Latte',
       'Caffè Mocha (Without Whipped Cream)',
       'Vanilla Latte (Or Other Flavoured Latte)', 'Caffè Americano',
       'Cappuccino', 'Espresso', 'Skinny Latte (Any Flavour)',
       'Caramel Macchiato',
       'White Chocolate Mocha (Without Whipped Cream)',
       'Hot Chocolate (Without Whipped Cream)',
       'Caramel Apple Spice (Without Whipped Cream)', 'Tazo® Tea',
       'Tazo® Chai Tea Latte', 'Tazo® Green Tea Latte',
       'Tazo® Full-Leaf Tea Latte',
       'Tazo® Full-Leaf Red Tea Latte (Vanilla Rooibos)',
       'Iced Brewed Coffee (With Classic Syrup)',
       'Iced Brewed Coffee (With Milk & Classic Syrup)',
       'Shaken Iced Tazo® Tea (With Classic Syrup)',
       'Shaken Iced Tazo® Tea Lemonade (With Classic Syrup)',
       'Banana Chocolate Smoothie', 'Orange Mango Banana Smoothie',
       'Strawberry Banana Smoothie', 'Coffee',
       'Mocha (Without Whipped Cream)', 'Caramel (Without Whipped Cream)',
       'Java Chip (Wi

In [71]:
# what is the average # calories for each beverage category?
temp =  df_temp.groupby("Beverage")
temp["Calories"].mean()

Beverage
Banana Chocolate Smoothie                              290.000000
Brewed Coffee                                            4.250000
Caffè Americano                                         13.750000
Caffè Latte                                            139.166667
Caffè Mocha (Without Whipped Cream)                    210.000000
Cappuccino                                              90.000000
Caramel                                                150.000000
Caramel (Without Whipped Cream)                        268.888889
Caramel Apple Spice (Without Whipped Cream)            247.500000
Caramel Macchiato                                      184.166667
Coffee                                                 210.000000
Espresso                                                 7.500000
Hot Chocolate (Without Whipped Cream)                  236.666667
Iced Brewed Coffee (With Classic Syrup)                 93.333333
Iced Brewed Coffee (With Milk & Classic Syrup)         122.222222
J

In [91]:
# what beverage preparation includes the most sugar?
a = df_temp.groupby("Beverage_prep")
a[[" Sugars (g)"]].max()

Unnamed: 0_level_0,Sugars (g)
Beverage_prep,Unnamed: 1_level_1
2% Milk,74
Doppio,0
Grande,65
Grande Nonfat Milk,62
Short,33
Short Nonfat Milk,29
Solo,0
Soymilk,80
Tall,49
Tall Nonfat Milk,45


In [107]:
df_temp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 242 entries, 0 to 241
Data columns (total 18 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Beverage_category          242 non-null    object 
 1   Beverage                   242 non-null    object 
 2   Beverage_prep              242 non-null    object 
 3   Calories                   242 non-null    int64  
 4    Total Fat (g)             242 non-null    object 
 5   Trans Fat (g)              242 non-null    float64
 6   Saturated Fat (g)          242 non-null    float64
 7    Sodium (mg)               242 non-null    int64  
 8    Total Carbohydrates (g)   242 non-null    int64  
 9   Cholesterol (mg)           242 non-null    int64  
 10   Dietary Fibre (g)         242 non-null    int64  
 11   Sugars (g)                242 non-null    int64  
 12   Protein (g)               242 non-null    float64
 13  Vitamin A (% DV)           242 non-null    object 

In [160]:
# what is the average % daily value calcium content for each beverage?
# HINT: make sure your columns have the datatypes you want
# (you can use more than one line for this one)
g =  df_temp
g[' Calcium (% DV) '] = (g[' Calcium (% DV) '].astype(str).str.rstrip('%').astype(int))
(g.groupby("Beverage")[' Calcium (% DV) '].mean()/100).round(3)

Beverage
Banana Chocolate Smoothie                              0.200
Brewed Coffee                                          0.005
Caffè Americano                                        0.015
Caffè Latte                                            0.350
Caffè Mocha (Without Whipped Cream)                    0.300
Cappuccino                                             0.225
Caramel                                                0.110
Caramel (Without Whipped Cream)                        0.120
Caramel Apple Spice (Without Whipped Cream)            0.000
Caramel Macchiato                                      0.283
Coffee                                                 0.123
Espresso                                               0.000
Hot Chocolate (Without Whipped Cream)                  0.350
Iced Brewed Coffee (With Classic Syrup)                0.000
Iced Brewed Coffee (With Milk & Classic Syrup)         0.080
Java Chip                                              0.117
Java Chip (With