# Introduction to Data Science in Python
## week 1 - Python Fundamentals 

### python unpacking

In [None]:
# sequence unpacking: one target name per element
# (a list such as ['a', 'b', 'c'] unpacks exactly the same way)
x = ('a', 'b', 'c')
fname, lname, email = x

### string placeholder <br>
You don't have to convert every variable with str(); str.format() takes care of converting each value to text.

In [3]:
# one sale, stored as a plain dict
sales_record = {'price': 3.24, 'num_items': 4, 'person': 'Chris'}
# positional placeholders are filled left to right by format()
sales_statement = '{} bought {} items at the price of {} each'
# pull the values out in placeholder order and splat them into format()
print(sales_statement.format(*(sales_record[key] for key in ('person', 'num_items', 'price'))))

Chris bought 4 items at the price of 3.24 each


### data file and statistics summary
<br>
Let's import our datafile mpg.csv, which contains fuel economy data for 234 cars.

* mpg : miles per gallon
* class : car classification
* cty : city mpg
* cyl : # of cylinders
* displ : engine displacement in liters
* drv : f = front-wheel drive, r = rear wheel drive, 4 = 4wd
* fl : fuel (e = ethanol E85, d = diesel, r = regular, p = premium, c = CNG)
* hwy : highway mpg
* manufacturer : automobile manufacturer
* model : model of car
* trans : type of transmission
* year : model year

In [8]:
import csv
% precision 2
with open('mpg.csv') as csvfile:
    mpg = list(csv.DictReader(csvfile))
mpg[:3]

[OrderedDict([('', '1'),
              ('manufacturer', 'audi'),
              ('model', 'a4'),
              ('displ', '1.8'),
              ('year', '1999'),
              ('cyl', '4'),
              ('trans', 'auto(l5)'),
              ('drv', 'f'),
              ('cty', '18'),
              ('hwy', '29'),
              ('fl', 'p'),
              ('class', 'compact')]),
 OrderedDict([('', '2'),
              ('manufacturer', 'audi'),
              ('model', 'a4'),
              ('displ', '1.8'),
              ('year', '1999'),
              ('cyl', '4'),
              ('trans', 'manual(m5)'),
              ('drv', 'f'),
              ('cty', '21'),
              ('hwy', '29'),
              ('fl', 'p'),
              ('class', 'compact')]),
 OrderedDict([('', '3'),
              ('manufacturer', 'audi'),
              ('model', 'a4'),
              ('displ', '2'),
              ('year', '2008'),
              ('cyl', '4'),
              ('trans', 'manual(m6)'),
              ('drv',

In [9]:
# every row is a dict whose keys are the column names
print(mpg[0].keys())
# mean city mpg: a generator expression feeds sum() one float at a time
# (values come out of the csv reader as strings, hence float())
print(sum(float(row['cty']) for row in mpg) / len(mpg))

odict_keys(['', 'manufacturer', 'model', 'displ', 'year', 'cyl', 'trans', 'drv', 'cty', 'hwy', 'fl', 'class'])
16.858974358974358


In [14]:
# average city mpg grouped by number of cylinders
# (pandas' groupby will make this much shorter later on)

# single pass over the rows: cylinder level -> [running cty total, row count]
totals = {}
for d in mpg:  # each d is one row dict
    acc = totals.setdefault(d['cyl'], [0.0, 0])
    acc[0] += float(d['cty'])  # add this car's city mpg
    acc[1] += 1                # one more car at this cylinder level

# the unique values for the number of cylinders the cars in the dataset have
cylinders = set(totals)

# list of (cylinder, average cty mpg) tuples; the cylinder labels are strings,
# so sort on their integer value to keep e.g. '10' after '8' instead of before '4'
CtyMpgByCyl = sorted(
    ((c, total / count) for c, (total, count) in totals.items()),
    key=lambda pair: int(pair[0]),
)
CtyMpgByCyl

[('4', 21.01), ('5', 20.50), ('6', 16.22), ('8', 12.57)]

city fuel economy appears to be decreasing as the number of cylinders increases

### Dates and Times

In [1]:
# time since the Epoch (1970-01-01) and its conversion to a datetime
import datetime as dt
import time as tm

# current time in seconds since the Epoch; read the clock once so that the
# number printed here and the datetime below describe the same instant
timestamp = tm.time()
print(timestamp)

# convert the timestamp to a (local time) datetime
dtnow = dt.datetime.fromtimestamp(timestamp)
print(dtnow)
# the individual calendar/clock fields are plain attributes
print(dtnow.year, dtnow.month, dtnow.day, dtnow.hour, dtnow.minute, dtnow.second)

1553045931.0339966
2019-03-19 21:38:51.033997
2019 3 19 21 38 51


In [3]:
# a timedelta is a duration: the difference between two dates or datetimes
delta = dt.timedelta(days=730)  # a span of 730 days

# commonly used in data science to build a sliding window
today = dt.date.today()
window = today - delta  # the date 730 days before today
print(window)

2017-03-19


### functional programming map()<br>
Applies the given function to each item of the given iterable(s) (list, tuple, etc.)  
and returns a lazy iterator (a map object in Python 3) over the results; wrap it in list() to get a list.

In [20]:
# prices of the same four items at two stores
store1 = [10.00, 11.00, 12.34, 2.34]
store2 = [9.00, 11.10, 12.34, 2.01]

# min is applied to the items of both lists pairwise: min(store1[k], store2[k])
cheapest = map(min, store1, store2)

# prints the map object itself, not the computed values
print(cheapest)

<map object at 0x000002570D936668>


In [21]:
 # map() is lazy: a value is only computed when the loop asks for it (memory friendly)
for price in cheapest:
    print(price)

9.0
11.0
12.34
2.01


In [1]:
# -------------------- map with a named function --------------------
def addition(n):
    """Return n added to itself, i.e. double n."""
    return n + n

# double every number by mapping the function over the tuple
numbers = (1, 2, 3, 4)
result = map(addition, numbers)
print([*result])  # unpacking the map object forces the evaluation

# -------------------- same result with map and a lambda ------------
numbers = (1, 2, 3, 4)
result = map(lambda x: x + x, numbers)
print([*result])

[2, 4, 6, 8]


### Python Lambda
<br>
A lambda creates an anonymous function <br>
lambda + a list of arguments + a colon + a single expression

In [7]:
people = ['Dr. Christopher Brooks', 'Dr. Kevyn Collins-Thompson', 'Dr. VG Vinod Vydiswaran', 'Dr. Daniel Romero']

def split_title_and_name(person):
    """Return the first and last whitespace-separated words of `person`,
    joined by a single space (e.g. 'Dr. Christopher Brooks' -> 'Dr. Brooks')."""
    words = person.split()
    return words[0] + ' ' + words[-1]

# option 1: build the lambda inline and call it right away with `person`;
# the lambda object has to be invoked with an argument to produce a value
for person in people:
    via_lambda = (lambda x: x.split()[0] + ' ' + x.split()[-1])(person)
    print(split_title_and_name(person) == via_lambda)

# option 2: hand the function / the lambda to map() and compare the full lists
list(map(split_title_and_name, people)) == list(map(lambda person: person.split()[0] + ' ' + person.split()[-1], people))

True
True
True
True


True

### list comprehension

In [34]:
# option 1: explicit loops
def times_tables():
    """Return the 100 products i*j for i and j in 0..9, with i varying slowest."""
    products = []
    for i in range(10):
        # one row of the table at a time: i*0, i*1, ..., i*9
        products.extend(i * j for j in range(10))
    return products

# option 2: the same table as a single list comprehension
times_tables() == [j*i for i in range(10) for j in range(10)]

True

In [24]:
# option 1: loop and append
my_list = []
for number in range(1000):
    if not number % 2:  # remainder 0 -> even
        my_list.append(number)

# option 2: the same filter as a list comprehension
my_list = [number for number in range(1000) if not number % 2]

### NumPy

In [9]:
import numpy as np

#### create array from list

In [14]:
# an array can be built from an existing list or straight from a list literal
mylist = [1, 2, 3]
x, y = np.array(mylist), np.array([4, 5, 6])

In [10]:
# arange: you choose the step (2); the stop value 30 itself is excluded
m = np.arange(0, 30, 2)
# linspace: you choose how many values (16); the spacing is derived and 30 is included
n = np.linspace(0, 30, num=16)
print(m)
print(n)

[ 0  2  4  6  8 10 12 14 16 18 20 22 24 26 28]
[ 0.  2.  4.  6.  8. 10. 12. 14. 16. 18. 20. 22. 24. 26. 28. 30.]


#### other functions

In [15]:
# resize works in place: n itself is changed to shape (4, 4), nothing is returned
n.resize(4,4)
# 3x2 array filled with ones
np.ones((3, 2))
# 2x3 array filled with zeros
np.zeros((2, 3))
# 3x3 identity matrix
np.eye(3)
# square matrix with the values of y on the diagonal;
# as the last expression, this is the only result the cell displays
np.diag(y)

array([[4, 0, 0],
       [0, 5, 0],
       [0, 0, 6]])

#### repeat an array

In [10]:
# `[1, 2, 3] * 3` repeats the Python list as a whole before the array is built
np.array([1, 2, 3] * 3) # output array([1, 2, 3, 1, 2, 3, 1, 2, 3])
# np.repeat repeats each element in turn instead
np.repeat([1, 2, 3], 3) # output array([1, 1, 1, 2, 2, 2, 3, 3, 3])

array([1, 2, 3, 1, 2, 3, 1, 2, 3])

In [11]:
# each element is repeated 3 times before moving on to the next one
np.repeat([1, 2, 3], 3)

array([1, 1, 1, 2, 2, 2, 3, 3, 3])

#### combine arrays

In [12]:
# 2x3 block of integer ones used as the building piece
p = np.ones((2, 3), dtype=int)
np.vstack((p, 2 * p))  # stack vertically: rows of 2*p go below the rows of p
np.hstack((p, 2 * p))  # stack horizontally: columns of 2*p go to the right of p

array([[1, 1, 1, 2, 2, 2],
       [1, 1, 1, 2, 2, 2]])

#### operations

In [16]:
# arithmetic operators act element by element on arrays of the same shape
print(x + y)   # addition:        [1 2 3] + [4 5 6] = [5 7 9]
print(x - y)   # subtraction:     [1 2 3] - [4 5 6] = [-3 -3 -3]
print(x * y)   # multiplication:  [1 2 3] * [4 5 6] = [4 10 18]
print(x / y)   # division:        [1 2 3] / [4 5 6] = [0.25 0.4 0.5]
print(x ** 2)  # power:           [1 2 3] ** 2      = [1 4 9]

[5 7 9]
[-3 -3 -3]
[ 4 10 18]
[0.25 0.4  0.5 ]
[1 4 9]


In [17]:
# dot product: 1*4 + 2*5 + 3*6
x.dot(y)
# two rows: y and its elementwise square
z = np.array([y, y ** 2])
# len() of a 2-D array is its number of rows
print(len(z))
# transpose (rows become columns)
z.T
# data type of the elements
z.dtype
# astype returns a converted copy; 'f' is the type code for single-precision float
z = z.astype('f')
z

2


array([[ 4.,  5.,  6.],
       [16., 25., 36.]], dtype=float32)

#### math functions

In [None]:
a = np.array([-4, -2, 1, 3, 5])
# reductions over all elements
a.sum()
a.max()
a.min()
a.mean()
a.std()
# positions of the largest / smallest value;
# on ties the first (smallest) index is returned
a.argmax()
a.argmin()

#### indexing and slicing

In [21]:
# squares of 0..12
s = np.arange(13) ** 2
# the last four elements
s[-4:]
# slice syntax is [start:stop:step]: start at the fifth element from the end
# and walk backwards two positions at a time
s[-5::-2]

array([64, 36, 16,  4,  0], dtype=int32)

In [21]:
# 0..35 laid out as a 6x6 grid (resize reshapes r in place)
r = np.arange(36)
r.resize((6, 6))
# rows 0 and 1 (row 2 excluded), every column except the last
r[:2, :-1]
# last row, every other element
r[-1, ::2]
# boolean mask: the values of r greater than 30 (see also np.where)
r[r > 30]
# same idea with another threshold: the values greater than 20.
# This only selects; r is not modified (an assignment such as `r[mask] = value` would modify it)
r[r > 20]

array([21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35])

#### copying data

In [24]:
# a basic slice is a view onto r's data, not an independent copy
r2 = r[:3, :3]
# writing through the view ...
r2[...] = 0
# ... changes the top-left 3x3 corner of r as well
r

array([[ 0,  0,  0,  3,  4,  5],
       [ 0,  0,  0,  9, 10, 11],
       [ 0,  0,  0, 15, 16, 17],
       [18, 19, 20, 21, 22, 23],
       [24, 25, 26, 27, 28, 29],
       [30, 31, 32, 33, 34, 35]])

To avoid this, use r.copy to create a copy that will not affect the original array

In [23]:
# copy() allocates new data, so edits to r_copy cannot reach r
r_copy = r.copy()
r_copy.fill(10)
print(r_copy,'\n')
# r is printed unchanged
print(r)

[[10 10 10 10 10 10]
 [10 10 10 10 10 10]
 [10 10 10 10 10 10]
 [10 10 10 10 10 10]
 [10 10 10 10 10 10]
 [10 10 10 10 10 10]] 

[[ 0  1  2  3  4  5]
 [ 6  7  8  9 10 11]
 [12 13 14 15 16 17]
 [18 19 20 21 22 23]
 [24 25 26 27 28 29]
 [30 31 32 33 34 35]]


#### iterating over arrays

In [27]:
# 4x3 array of random integers drawn from 0..9
test = np.random.randint(0, 10, (4, 3))

# iterating a 2-D array directly yields its rows
for row in test:
    print(row)

# iterating by index
# ... over the rows
for row_idx in range(test.shape[0]):
    print(test[row_idx])
# ... over the columns
for col in range(test.shape[1]):
    print(test[:, col])

# enumerate gives the index and the row together
for i, row in enumerate(test):
    print('row', i, 'is', row)

[3 8 1]
[8 0 2]
[9 4 5]
[3 6 2]
[3 8 1]
[8 0 2]
[9 4 5]
[3 6 2]
[3 8 9 3]
[8 0 4 6]
[1 2 5 2]
row 0 is [3 8 1]
row 1 is [8 0 2]
row 2 is [9 4 5]
row 3 is [3 6 2]


use zip to iterate over multiple iterables

In [35]:
# elementwise squares of test
test2 = test ** 2
# zip walks both arrays in lockstep: row k of test is paired with row k of test2
for i, j in zip(test, test2):
    print(i,'+',j,'=',i+j)
# not the same as the nested form `i+j for i in test for j in test2`,
# which pairs every row of test with every row of test2

[3 8 1] + [ 9 64  1] = [12 72  2]
[8 0 2] + [64  0  4] = [72  0  6]
[9 4 5] + [81 16 25] = [90 20 30]
[3 6 2] + [ 9 36  4] = [12 42  6]
<generator object <genexpr> at 0x000002D2C5436360>
