# Data Files and Summary statistics

In [2]:
import csv

with open('mpg.csv') as csvfile:
    mpg = list(csv.DictReader(csvfile))

mpg[:3]

[OrderedDict([('', '1'),
              ('manufacturer', 'audi'),
              ('model', 'a4'),
              ('displ', '1.8'),
              ('year', '1999'),
              ('cyl', '4'),
              ('trans', 'auto(l5)'),
              ('drv', 'f'),
              ('cty', '18'),
              ('hwy', '29'),
              ('fl', 'p'),
              ('class', 'compact')]),
 OrderedDict([('', '2'),
              ('manufacturer', 'audi'),
              ('model', 'a4'),
              ('displ', '1.8'),
              ('year', '1999'),
              ('cyl', '4'),
              ('trans', 'manual(m5)'),
              ('drv', 'f'),
              ('cty', '21'),
              ('hwy', '29'),
              ('fl', 'p'),
              ('class', 'compact')]),
 OrderedDict([('', '3'),
              ('manufacturer', 'audi'),
              ('model', 'a4'),
              ('displ', '2'),
              ('year', '2008'),
              ('cyl', '4'),
              ('trans', 'manual(m6)'),
              ('drv',

In [3]:
len(mpg)

234

keys gives us the column names our csv

In [4]:
mpg[0].keys()

odict_keys(['', 'manufacturer', 'model', 'displ', 'year', 'cyl', 'trans', 'drv', 'cty', 'hwy', 'fl', 'class'])

This is how to find  the average cty fuel economy across all cars. All values in the dictionaries are strings, so we need to convert to float.

In [5]:
sum(float(d['cty']) for d in mpg) / len(mpg)

16.858974358974358

Similarly this is how to find the average hwy fuel economy across all cars.


In [6]:
sum(float(d['hwy']) for d in mpg) /len(mpg)

23.44017094017094

Use `set` to return the unique values for the number of cylinders the cars in our dataset have.

In [7]:
cylinders = set(d['cyl'] for d in mpg)
cylinders

{'4', '5', '6', '8'}

Here's a more complex example where we are grouping the cars by number of cylinder, and finding the average cty mpg for each group

In [8]:
CtyMpgByCyl = []

for c in cylinders: # iterate over all the cylinders levels
    summpg = 0
    cyltypecount = 0
    for d in mpg:
        if d['cyl'] == c: # if the cylinder level type matches,
            summpg += float(d['cty']) #add the cty mpg
            cyltypecount += 1 # increment the count
    CtyMpgByCyl.append((c, summpg / cyltypecount)) # append the tuple('cyliners', 'avg mpg')

CtyMpgByCyl.sort(key=lambda x: x[0])
CtyMpgByCyl

[('4', 21.012345679012345),
 ('5', 20.5),
 ('6', 16.21518987341772),
 ('8', 12.571428571428571)]

Use `set` to return the unique values for the class types in our dataset.

In [9]:
vehicleclass = set(d['class'] for d in mpg) # what are the class types
vehicleclass

{'2seater', 'compact', 'midsize', 'minivan', 'pickup', 'subcompact', 'suv'}

In [10]:
HwyMpgByClass = []

for t in vehicleclass:
    summpg = 0
    vclasscount = 0
    for d in mpg: # iterate over all dictionaries
        if d['class'] == t:
            summpg += float(d['hwy']) # add the hwy mpg
            vclasscount += 1 # increment the count
    HwyMpgByClass.append((t, summpg / vclasscount)) # append the tuple ('class', 'avg mpg')

HwyMpgByClass

[('midsize', 27.29268292682927),
 ('minivan', 22.363636363636363),
 ('pickup', 16.87878787878788),
 ('subcompact', 28.142857142857142),
 ('2seater', 24.8),
 ('suv', 18.129032258064516),
 ('compact', 28.29787234042553)]

# The Python Programming Language: Dates and Times

In [11]:
import datetime as dt
import time as tm

`time` returns the current time in seconds since the Epoch. (January 1st, 1970)

In [12]:
   tm.time()

1570020193.6258862

Convert the timestamp to datetime.

In [13]:
dtnow = dt.datetime.fromtimestamp(tm.time())
dtnow

datetime.datetime(2019, 10, 2, 20, 43, 13, 994928)

Handy datetime attributes:

In [14]:
dtnow.year, dtnow.month, dtnow.day, dtnow.hour, dtnow.minute, dtnow.second # get year month, day, hour, minute, second

(2019, 10, 2, 20, 43, 13)

`timedelta` is a duration expressing the difference between two dates.

In [15]:
delta = dt.timedelta(days = 100) # create a timedelta of 100 days
delta

datetime.timedelta(days=100)

`date.today` returns the current local date.

In [16]:
today = dt.date.today()

In [17]:
today - delta # the date 100 days ago

datetime.date(2019, 6, 24)

In [18]:
today > today - delta # compare dates

True

# The Python Programming Language: Object and map()

An example of a class in python:

In [19]:
class Person:
    department = 'School of information' # a class variable
    
    def set_name(self, new_name): # a method
        self.name = new_name
    def set_location(self, new_location):
        self.location = new_location

In [20]:
person = Person()
person.set_name("Christpher Brooks")
person.set_location("Ann Arbor, MI, USA")
print('{} live {} and works in the department {}'.format(person.name, person.location, person.department))

Christpher Brooks live Ann Arbor, MI, USA and works in the department School of information


Here's an example of mapping the `min` function between two lists.

In [21]:
store1 = [10.00, 11.00, 12.34, 2.34]
store2 = [9.00, 11.10, 12.34, 2.01]
cheapest = map(min, store1, store2)
cheapest

<map at 0x24105f08e10>

Now let's iterate through the map object to see the values.

In [22]:
for item in cheapest:
    print(item)

9.0
11.0
12.34
2.01


# The Python Programming Language: Lambda and List Comprehensions

Here's an example of lambda that takes in three parameters and adds the first two.

In [23]:
my_function = lambda a, b, c, : a + b

In [24]:
my_function(1, 2, 3)

3

Let's iterate from 0 to 999 and return the even numbers.

In [25]:
my_list = []
for number in range(0, 1000):
    if number % 2 == 0:
        my_list.append(number)
my_list

[0,
 2,
 4,
 6,
 8,
 10,
 12,
 14,
 16,
 18,
 20,
 22,
 24,
 26,
 28,
 30,
 32,
 34,
 36,
 38,
 40,
 42,
 44,
 46,
 48,
 50,
 52,
 54,
 56,
 58,
 60,
 62,
 64,
 66,
 68,
 70,
 72,
 74,
 76,
 78,
 80,
 82,
 84,
 86,
 88,
 90,
 92,
 94,
 96,
 98,
 100,
 102,
 104,
 106,
 108,
 110,
 112,
 114,
 116,
 118,
 120,
 122,
 124,
 126,
 128,
 130,
 132,
 134,
 136,
 138,
 140,
 142,
 144,
 146,
 148,
 150,
 152,
 154,
 156,
 158,
 160,
 162,
 164,
 166,
 168,
 170,
 172,
 174,
 176,
 178,
 180,
 182,
 184,
 186,
 188,
 190,
 192,
 194,
 196,
 198,
 200,
 202,
 204,
 206,
 208,
 210,
 212,
 214,
 216,
 218,
 220,
 222,
 224,
 226,
 228,
 230,
 232,
 234,
 236,
 238,
 240,
 242,
 244,
 246,
 248,
 250,
 252,
 254,
 256,
 258,
 260,
 262,
 264,
 266,
 268,
 270,
 272,
 274,
 276,
 278,
 280,
 282,
 284,
 286,
 288,
 290,
 292,
 294,
 296,
 298,
 300,
 302,
 304,
 306,
 308,
 310,
 312,
 314,
 316,
 318,
 320,
 322,
 324,
 326,
 328,
 330,
 332,
 334,
 336,
 338,
 340,
 342,
 344,
 346,
 348,
 350,

Now the same thing but with list comprehension

In [26]:
my_list = [number for number in range(0, 1000) if number % 2 == 0]
my_list 

[0,
 2,
 4,
 6,
 8,
 10,
 12,
 14,
 16,
 18,
 20,
 22,
 24,
 26,
 28,
 30,
 32,
 34,
 36,
 38,
 40,
 42,
 44,
 46,
 48,
 50,
 52,
 54,
 56,
 58,
 60,
 62,
 64,
 66,
 68,
 70,
 72,
 74,
 76,
 78,
 80,
 82,
 84,
 86,
 88,
 90,
 92,
 94,
 96,
 98,
 100,
 102,
 104,
 106,
 108,
 110,
 112,
 114,
 116,
 118,
 120,
 122,
 124,
 126,
 128,
 130,
 132,
 134,
 136,
 138,
 140,
 142,
 144,
 146,
 148,
 150,
 152,
 154,
 156,
 158,
 160,
 162,
 164,
 166,
 168,
 170,
 172,
 174,
 176,
 178,
 180,
 182,
 184,
 186,
 188,
 190,
 192,
 194,
 196,
 198,
 200,
 202,
 204,
 206,
 208,
 210,
 212,
 214,
 216,
 218,
 220,
 222,
 224,
 226,
 228,
 230,
 232,
 234,
 236,
 238,
 240,
 242,
 244,
 246,
 248,
 250,
 252,
 254,
 256,
 258,
 260,
 262,
 264,
 266,
 268,
 270,
 272,
 274,
 276,
 278,
 280,
 282,
 284,
 286,
 288,
 290,
 292,
 294,
 296,
 298,
 300,
 302,
 304,
 306,
 308,
 310,
 312,
 314,
 316,
 318,
 320,
 322,
 324,
 326,
 328,
 330,
 332,
 334,
 336,
 338,
 340,
 342,
 344,
 346,
 348,
 350,

# The python programming language: Numerical Python (Numpy)

In [27]:
import numpy as np

## Create Arrays

Create a list and convert it to a numpy array

In [28]:
mylist = [1, 2, 3]
x = np.array(mylist)
x

array([1, 2, 3])

Or just pass in a list directly

In [29]:
y = np.array([4, 5, 6])
y

array([4, 5, 6])

Pass in a list of lists to create a multidimensional array.

In [30]:
m = np.array([[7, 8, 9], [10, 11, 12]])
m

array([[ 7,  8,  9],
       [10, 11, 12]])

Use the shape method to find the dimensions of the array. (rows, columns)

In [31]:
m.shape

(2, 3)

`arange` returns evenly spaced values within a given interval.

In [32]:
n = np.arange(0, 30, 2) # Start at 0 and count up by 2, stop before 30
n

array([ 0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28])

`reshape` returns an array with the same data with a new shape.

In [33]:
n = n.reshape(3, 5) # reshape array to be 3 x 5
n

array([[ 0,  2,  4,  6,  8],
       [10, 12, 14, 16, 18],
       [20, 22, 24, 26, 28]])

`linspace` returns evenly sapced numbers over a specified interval.

In [34]:
o = np.linspace(0, 4, 9) # return 9 evenly spaced values from 0 to 4
o

array([0. , 0.5, 1. , 1.5, 2. , 2.5, 3. , 3.5, 4. ])

`resize` chanes the shape and size of array in-place

In [35]:
o.resize(3, 3)
o

array([[0. , 0.5, 1. ],
       [1.5, 2. , 2.5],
       [3. , 3.5, 4. ]])

`ones` returns a new array of given shape and type, filled with ones.

In [36]:
np.ones((3, 2))

array([[1., 1.],
       [1., 1.],
       [1., 1.]])

`zeros` returns a new array of given shape and type, filled with zeros.

In [37]:
np.zeros((2, 9))

array([[0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0.]])

`eye` returns a 2-D array with ones on the diagonal and zeros elsewhere.

In [38]:
np.eye(3)

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

`diag` extracts a diagonal or construct a diagonal array.

In [39]:
np.diag(y)

array([[4, 0, 0],
       [0, 5, 0],
       [0, 0, 6]])

Create an array using repeating list(or see `np.tile`)

In [40]:
np.repeat([1, 2, 3], 3)

array([1, 1, 1, 2, 2, 2, 3, 3, 3])

### Combining Arrays

In [41]:
p = np.ones([2, 3], int)
p

array([[1, 1, 1],
       [1, 1, 1]])

Use `vstack` to stack arrays in sequence vertically (row wise)

In [42]:
np.vstack([p, 2*p])

array([[1, 1, 1],
       [1, 1, 1],
       [2, 2, 2],
       [2, 2, 2]])

Use `hstack` to stack arrays in sequence horizontally (column wise)

In [43]:
np.hstack([p, 2*p])

array([[1, 1, 1, 2, 2, 2],
       [1, 1, 1, 2, 2, 2]])

## Operations

Use `+`, `-`, `/` and `**` to perform element wise addition, subtraction, multiplication, division and power

In [44]:
print(x + y) # elementwise addition [1 2 3] + [4 5 6] = [5 7 9]
print(x - y) # elementwise subtraction [1 2 3] - [4 5 6] = [-3 -3 -3]

[5 7 9]
[-3 -3 -3]


In [45]:
print(x * y) # elemenetwise multiplication [1 2 3] * [4 5 6] = [4 10 18]
print(x / y) # elementwise division [1 2 3] / [4 5 6] = [0.25 0.4 0.5]

[ 4 10 18]
[0.25 0.4  0.5 ]


In [46]:
y

array([4, 5, 6])

In [47]:
x

array([1, 2, 3])

In [48]:
print(x**2) # elementwise power [1 2 3] ^2 = [1 4 9]

[1 4 9]


**Dot Product:**  

$ \begin{bmatrix}x_1 \ x_2 \ x_3\end{bmatrix}
\cdot
\begin{bmatrix}y_1 \\ y_2 \\ y_3\end{bmatrix}
= x_1 y_1 + x_2 y_2 + x_3 y_3$

In [49]:
x.dot(y) # dot product 1*4 + 4*5 + 3*6

32

In [50]:
z = np.array([y, y**2])
print(len(z)) # numebr of rows of array

2


In [51]:
z

array([[ 4,  5,  6],
       [16, 25, 36]])

The shape of array `z` is `(2, 3)` before transposing.

In [52]:
z.shape

(2, 3)

Use `.T` to get the transpose.

In [53]:
z.T

array([[ 4, 16],
       [ 5, 25],
       [ 6, 36]])

The number of rows has swapped with the number of columns.

In [54]:
z.T.shape

(3, 2)

Use `.dtype` to see the data type of the elements in the array.

In [55]:
z.dtype

dtype('int32')

Use `.astype` to cast to a specific type.

In [56]:
z = z.astype('f')
z.dtype

dtype('float32')

In [57]:
z

array([[ 4.,  5.,  6.],
       [16., 25., 36.]], dtype=float32)

## Math functions

Numpy has many built in math functions that can be performed on arrays.

In [58]:
a = np.array([-4, -2, 1, 3, 5])

In [59]:
a.sum()

3

In [60]:
a.max()

5

In [61]:
a.min()

-4

In [62]:
a.mean()

0.6

In [63]:
a.std()

3.2619012860600183

`argmax` and `argmin` return the index of the maximum and minimum values in the array.

In [64]:
a.argmax()

4

In [65]:
a.argmin()

0

## Indexing / Slicing

In [66]:
s = np.arange(13)**2
s

array([  0,   1,   4,   9,  16,  25,  36,  49,  64,  81, 100, 121, 144],
      dtype=int32)

Use bracket notation to get the value at a specific index. Remember that indexing starts at 0.

In [67]:
s[0], s[4], s[-1]

(0, 16, 144)

Use `:` to indicate a range. `array[start:stop]`
Leaving `start` or `stop` empty will default to the beginning/end of the array.

In [68]:
s[1:5]

array([ 1,  4,  9, 16], dtype=int32)

Use negatives to count from the back.

In [69]:
s[-4:]

array([ 81, 100, 121, 144], dtype=int32)

A third `:` can be used to indicate step-size. `array[start:stop:stepsize]`
here we are starting 5th element from the end, and counting backwards by 2 until the beginning of the array is reached.

In [70]:
s[-5::-2]

array([64, 36, 16,  4,  0], dtype=int32)

Let's look at a multidimensional array.

In [71]:
r = np.arange(36)
r.resize((6, 6))
r

array([[ 0,  1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10, 11],
       [12, 13, 14, 15, 16, 17],
       [18, 19, 20, 21, 22, 23],
       [24, 25, 26, 27, 28, 29],
       [30, 31, 32, 33, 34, 35]])

Use barcket notation to slice: `array[row, column]`

In [72]:
r[2, 2]

14

In [73]:
r

array([[ 0,  1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10, 11],
       [12, 13, 14, 15, 16, 17],
       [18, 19, 20, 21, 22, 23],
       [24, 25, 26, 27, 28, 29],
       [30, 31, 32, 33, 34, 35]])

And use : to select a range of rows or columns

In [74]:
r[3, 3:6]

array([21, 22, 23])

Here we are selecting all the rows up to (and not including) row 2, and all the columns up to (and not including) the last column.

In [75]:
r[:2, :-1]

array([[ 0,  1,  2,  3,  4],
       [ 6,  7,  8,  9, 10]])

This is a slice of the row, and only every other element.

In [76]:
r[-1, ::2]

array([30, 32, 34])

We can also perform conditional indexing. Here we are selecting values from the array that are greater than 30. (Also see`np.where`)

In [77]:
r[r > 20]

array([21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35])

Here we are assigning all values in the array that are greater than 30 to the value of 30.

In [78]:
r[r > 30] = 30
r

array([[ 0,  1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10, 11],
       [12, 13, 14, 15, 16, 17],
       [18, 19, 20, 21, 22, 23],
       [24, 25, 26, 27, 28, 29],
       [30, 30, 30, 30, 30, 30]])

## Copying Data

Be careful with copying and modifying arrays in NumPy!
`r2` is a slice of `r`

In [79]:
r2 = r[:3, :3]
r2

array([[ 0,  1,  2],
       [ 6,  7,  8],
       [12, 13, 14]])

Set this slice's values to zero ([:] selects the entire array)

In [80]:
r2[:] = 0
r2

array([[0, 0, 0],
       [0, 0, 0],
       [0, 0, 0]])

`r` has also been changed!

In [81]:
r

array([[ 0,  0,  0,  3,  4,  5],
       [ 0,  0,  0,  9, 10, 11],
       [ 0,  0,  0, 15, 16, 17],
       [18, 19, 20, 21, 22, 23],
       [24, 25, 26, 27, 28, 29],
       [30, 30, 30, 30, 30, 30]])

To avoid this, use `r.copy` to create a copy that will not affect the original array

In [82]:
r_copy = r.copy()
r_copy

array([[ 0,  0,  0,  3,  4,  5],
       [ 0,  0,  0,  9, 10, 11],
       [ 0,  0,  0, 15, 16, 17],
       [18, 19, 20, 21, 22, 23],
       [24, 25, 26, 27, 28, 29],
       [30, 30, 30, 30, 30, 30]])

Now when r_copy is modified, r will not be changed

In [83]:
r_copy[:] = 10
print(r_copy, 'This is the r copy\n')
print(r)

[[10 10 10 10 10 10]
 [10 10 10 10 10 10]
 [10 10 10 10 10 10]
 [10 10 10 10 10 10]
 [10 10 10 10 10 10]
 [10 10 10 10 10 10]] This is the r copy

[[ 0  0  0  3  4  5]
 [ 0  0  0  9 10 11]
 [ 0  0  0 15 16 17]
 [18 19 20 21 22 23]
 [24 25 26 27 28 29]
 [30 30 30 30 30 30]]


### iterating Over Arrays
Let's create a new 4 by 3 array of random numbers 0-9.

In [89]:
test = np.random.randint(0, 10, (4, 3))
test

array([[5, 5, 8],
       [3, 8, 9],
       [8, 2, 1],
       [7, 9, 0]])

Iterate by row:

In [90]:
for row in test:
    print(row)

[5 5 8]
[3 8 9]
[8 2 1]
[7 9 0]


iterate by index:

In [91]:
for i in range(len(test)):
    print(test[i])

[5 5 8]
[3 8 9]
[8 2 1]
[7 9 0]


iterate by row and index:

In [92]:
for i, row in enumerate(test):
    print('row', i, 'is', row)

row 0 is [5 5 8]
row 1 is [3 8 9]
row 2 is [8 2 1]
row 3 is [7 9 0]


Use `zip` to iterate over multiple iterablas.

In [93]:
test2 = test.copy()
test2

array([[5, 5, 8],
       [3, 8, 9],
       [8, 2, 1],
       [7, 9, 0]])

In [94]:
test2 = test2**2
test2

array([[25, 25, 64],
       [ 9, 64, 81],
       [64,  4,  1],
       [49, 81,  0]], dtype=int32)

In [97]:
for i, j in zip(test, test2):
    print(i, '+', j, '=', i+j)

[5 5 8] + [25 25 64] = [30 30 72]
[3 8 9] + [ 9 64 81] = [12 72 90]
[8 2 1] + [64  4  1] = [72  6  2]
[7 9 0] + [49 81  0] = [56 90  0]
