### 04 - Python Data Science Toolbox II
* **Iterators**
    * Intro to iterators *(iter(), next())*
    * Playing with iterators *(enumerate(), zip())*
    * Using iterators to load large files into memory *(for chunk in pd.read_csv(), count_entries())*
* **List comprehensions & generators**
    * List comprehensions *(creating codes, cubes, matrix)*
    * List comprehensions with conditions
    * Dictionary comprehensions
    * Intro to generator expressions
    * List comprehensions for time-stamped data

## Iterators

### Intro to iterators

In [1]:
# Two ways to walk a list: a plain for loop, and manual stepping with iter()/next()

dog_breeds = ['beagle', 'fox terrier', 'pug', 'basenji', 'whippet']

# The for loop visits every element in order
for breed in dog_breeds:
    print(breed)
print()

# By hand: create the iterator once, then pull the first four values from it
dogs = iter(dog_breeds)
for _ in range(4):
    print(next(dogs))

beagle
fox terrier
pug
basenji
whippet

beagle
fox terrier
pug
basenji


In [2]:
# Ranges can be stepped through with iter()/next() as well

# Take the first three values of 2010, 2012, 2014, 2016
years = iter(range(2010, 2018, 2))
for _ in range(3):
    print(next(years))
print()

# The stop value 2019 is exclusive, so 2018 is still printed
for year in range(2010, 2019, 2):
    print(year)
print()

# Only the first four even numbers are ever requested from this huge range
even = iter(range(2, 2 ** 100, 2))
for _ in range(4):
    print(next(even))

2010
2012
2014

2010
2012
2014
2016
2018

2
4
6
8


In [3]:
# Built-ins such as list() and sum() accept any iterable, including a range

range_of_years = range(2010, 2018)
list_of_years = list(range_of_years)  # every year, materialised into a list
sum_of_years = sum(range_of_years)    # the same range can be consumed again

# Show the range object itself, then the list, then the total
for result in (range_of_years, list_of_years, sum_of_years):
    print(result)

range(2010, 2018)
[2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017]
16108


### Playing with iterators

In [4]:
# enumerate() pairs every element with a running index

dog_breeds = ['beagle', 'fox terrier', 'pug', 'basenji', 'whippet']

# Turning the enumerate object into a list gives (index, value) tuples
breed_list = list(enumerate(dog_breeds))
print(breed_list)
print()

# Indices start at 0 unless told otherwise
for position, breed in enumerate(dog_breeds):
    print(position, breed)
print()

# The second argument is the starting index
for position, breed in enumerate(dog_breeds, 1):
    print(position, breed)

[(0, 'beagle'), (1, 'fox terrier'), (2, 'pug'), (3, 'basenji'), (4, 'whippet')]

0 beagle
1 fox terrier
2 pug
3 basenji
4 whippet

1 beagle
2 fox terrier
3 pug
4 basenji
5 whippet


In [5]:
# zip() stitches several iterables together element by element

dog_breeds = ['beagle', 'fox terrier', 'pug', 'basenji', 'whippet']
dog_names = ['Bella', 'Max', 'Rocky', 'Daisy', 'Maggie']
dog_ages = [4, 8, 1, 2, 5]

# Printing the zip object shows its repr, not the tuples it will produce
dog_zip = zip(dog_breeds, dog_names, dog_ages)
print(dog_zip)
print()

# Unpack each (breed, name, age) tuple directly in the loop header
for breed, name, age in dog_zip:
    print('{}: {}, {}yo'.format(breed, name, age))
print()

# The loop above exhausted dog_zip, so zip again to build the list
dog_data = list(zip(dog_breeds, dog_names, dog_ages))
print(dog_data)

<zip object at 0x10ef85ec8>

beagle: Bella, 4yo
fox terrier: Max, 8yo
pug: Rocky, 1yo
basenji: Daisy, 2yo
whippet: Maggie, 5yo

[('beagle', 'Bella', 4), ('fox terrier', 'Max', 8), ('pug', 'Rocky', 1), ('basenji', 'Daisy', 2), ('whippet', 'Maggie', 5)]


In [6]:
# Undoing a zip: unpack the pairs with * and zip them again

# * hands every (breed, name) pair to print() as a separate argument
z1 = zip(dog_breeds, dog_names)
print(*z1)
print()

# The print above exhausted z1, so rebuild it before unzipping
z1 = zip(dog_breeds, dog_names)
breeds, names = zip(*z1)
for unzipped in (breeds, names):
    print(unzipped)

('beagle', 'Bella') ('fox terrier', 'Max') ('pug', 'Rocky') ('basenji', 'Daisy') ('whippet', 'Maggie')

('beagle', 'fox terrier', 'pug', 'basenji', 'whippet')
('Bella', 'Max', 'Rocky', 'Daisy', 'Maggie')


### Using iterators to load large files into memory

In [10]:
# Tally the 'Origin' column of Dogs.csv while reading it ten rows at a time

import pandas as pd

dog_dict = {}
# With chunksize set, read_csv is iterated chunk by chunk instead of returning one frame
for chunk in pd.read_csv('Dogs.csv', delimiter=';', chunksize=10):
    for entry in chunk['Origin']:
        # Start from 0 the first time an origin is seen, then add one
        dog_dict[entry] = dog_dict.get(entry, 0) + 1

print(dog_dict)

{'Germany': 38, 'Afghanistan': 2, 'Morocco': 2, 'United Kingdom': 79, 'Turkey': 3, 'Japan': 12, 'Spain': 29, 'United States': 45, 'Albania': 1, 'Greece': 6, 'Austria': 5, 'Switzerland': 10, 'France': 49, 'Egypt': 1, 'Armenia': 1, 'Australia': 9, 'Mali': 1, 'India': 15, 'Democratic Republic of the Congo': 1, 'Belgium': 9, 'Italy': 14, 'Norway': 6, 'Russia': 15, 'South Africa': 1, 'Czech Republic': 4, 'Bosnia and Herzegovina': 2, 'Brazil': 4, 'Denmark': 3, 'Romania': 3, 'Pakistan': 1, 'Lebanon': 1, 'Canada': 11, 'Portugal': 10, 'Georgia': 2, 'Mexico': 2, 'Chile': 1, 'China': 11, 'Slovakia': 4, 'Argentina': 2, 'Madagascar': 1, 'Croatia': 6, 'Czechoslovakia': 1, 'Cuba': 2, 'Netherlands': 10, 'Sweden': 8, 'Estonia': 1, 'Scandinavia': 1, 'Finland': 7, 'Ireland': 9, 'Puerto Rico': 1, 'Greenland': 1, 'Guatemala': 1, 'Nepal': 1, 'Ukraine': 1, 'New Zealand': 3, 'Iceland': 1, 'Bulgaria': 1, 'Slovenia': 1, 'Indonesia': 1, 'Hungary': 9, 'Korea': 3, 'Tibet': 4, 'Lithuania': 1, 'Montenegro': 1, 'Vene

In [11]:
# Extracting information using function:

def count_entries(csv_file, delimiter, c_size, colname):
    """Count how often each value occurs in one column of a CSV file.

    The file is read in chunks of ``c_size`` rows rather than in one go.

    Args:
        csv_file: Path or file-like object handed to ``pd.read_csv``.
        delimiter: Field separator used in the file, e.g. ``';'``.
        c_size: Number of rows per chunk.
        colname: Name of the column whose values are counted.

    Returns:
        dict mapping each value found in ``colname`` to its number of
        occurrences.

    Raises:
        KeyError: If ``colname`` is not a column of the file.
    """
    counts_dict = {}
    # Pass the separator by keyword: pandas >= 2.0 accepts only the path
    # positionally, so read_csv(csv_file, delimiter, ...) raises TypeError.
    for chunk in pd.read_csv(csv_file, sep=delimiter, chunksize=c_size):
        for entry in chunk[colname]:
            counts_dict[entry] = counts_dict.get(entry, 0) + 1
    return counts_dict

# Count the values of the 'The Kennel Club' column, reading 10 rows per chunk
result_counts = count_entries(
    csv_file='Dogs.csv',
    delimiter=';',
    c_size=10,
    colname='The Kennel Club',
)
print(result_counts)

{'Toy': 20, 'Hound': 26, nan: 297, 'Terrier': 25, 'Utility': 23, 'Working': 25, 'Extinct': 30, 'Gundog': 29, 'Pastoral': 32, 'Terriers': 1, 'Sporting Dog': 1, 'Gun dog': 2, 'Toys': 1, 'Not recognised': 1, 'Toy Group': 1}


## List comprehensions and generators

### List comprehensions

In [13]:
# A list comprehension builds a new list from an existing one in one expression

dog_breeds = ['beagle', 'fox terrier', 'pug', 'basenji', 'whippet']
# Two-letter upper-case code per breed; as the cell's last expression it is displayed
[name[:2].upper() for name in dog_breeds]

['BE', 'FO', 'PU', 'BA', 'WH']

In [14]:
# Cubes of 0 through 7, built with a list comprehension

cubes = [n * n * n for n in range(8)]
print(cubes)

[0, 1, 8, 27, 64, 125, 216, 343]


In [15]:
# Nested list comprehension

from numpy import random

# Inner comprehension: one row of 5 random integers from 1 to 5
# (randint's upper bound, 6, is exclusive).
# Outer comprehension: repeat that 5 times; the counter itself is unused, hence `_`.
# int() turns NumPy integer scalars into plain Python ints so each row prints
# as [1, 2, ...] regardless of how NumPy chooses to repr its scalars.
# The generator is not seeded, so the values differ from run to run.
matrix = [[int(value) for value in random.randint(1, 6, 5)] for _ in range(5)]
for row in matrix:
    print(row)

[1, 1, 2, 1, 1]
[5, 5, 3, 2, 2]
[3, 2, 5, 1, 3]
[1, 5, 1, 1, 3]
[5, 3, 2, 3, 4]


### List comprehensions with conditions

In [16]:
# Filtering inside a comprehension: a trailing `if` drops elements

dog_breeds = ['welsh sheepdog', 'chinese imperial dog', 'pug', 'basenji', 'german spitz']

# Keep only the breed names that contain a space
national_breeds = [name for name in dog_breeds if ' ' in name]
print(national_breeds)

['welsh sheepdog', 'chinese imperial dog', 'german spitz']


In [17]:
# Conditional expression inside a comprehension: every element is kept,
# but names without a space are replaced by a placeholder

dog_breeds = ['welsh sheepdog', 'chinese imperial dog', 'pug', 'basenji', 'german spitz']
national_breeds = ['not national' if ' ' not in name else name for name in dog_breeds]
print(national_breeds)

['welsh sheepdog', 'chinese imperial dog', 'not national', 'not national', 'german spitz']


### Dictionary comprehensions

In [18]:
# Dictionary comprehension: map each dog name to its length

dog_names = ['Bella', 'Max', 'Annabelle', 'Daisy', 'Maggie']
names_dict = {pet: len(pet) for pet in dog_names}
print(names_dict)

{'Bella': 5, 'Max': 3, 'Annabelle': 9, 'Daisy': 5, 'Maggie': 6}


### Intro to generator expressions

In [19]:
# Generator expressions: like a comprehension, but values are produced on demand

dog_names = ['Bella', 'Max', 'Annabelle', 'Daisy', 'Maggie']

# Parentheses instead of brackets make this a generator, not a list
names = (pet for pet in dog_names)

# Pull the first three names by hand
for _ in range(3):
    print(next(names))
print('')

# The loop continues where next() stopped, so only the remaining names appear
for remaining in names:
    print(remaining)

Bella
Max
Annabelle

Daisy
Maggie


In [20]:
# A generator expression may transform each value before yielding it

dog_names = ['Bella', 'Max', 'Annabelle', 'Daisy', 'Maggie']

# Yields "<first three letters> for <full name>" for every dog
short_name = (pet[:3] + ' for ' + pet for pet in dog_names)

for line in short_name:
    print(line)

Bel for Bella
Max for Max
Ann for Annabelle
Dai for Daisy
Mag for Maggie


In [21]:
# A generator function: `yield` hands back one value at a time

dog_names = ['Bella', 'Max', 'Annabelle', 'Daisy', 'Maggie']

def get_short_names(input_list):
    """Yield the first three characters of every string in input_list, in order."""
    for full_name in input_list:
        abbreviated = full_name[:3]
        yield abbreviated

# Calling the function returns a generator; the for loop drives it
for short_name in get_short_names(dog_names):
    print(short_name)

Bel
Max
Ann
Dai
Mag


### List comprehensions for time-stamped data

In [23]:
# Pulling the year out of birthday strings with list comprehensions

dog_birthdays = pd.read_csv('my_dogs.csv', index_col='Name')['Birthday']
df = dog_birthdays.to_frame().join(
    pd.read_csv('my_dogs.csv', index_col='Name').drop(columns='Birthday')
) if False else pd.read_csv('my_dogs.csv', index_col='Name')
dog_birthdays = df['Birthday']

# The last four characters of each birthday string are taken as the year
years = [stamp[-4:] for stamp in dog_birthdays]
print(years)

# Age is measured against the year 2018
ages = [2018 - int(year) for year in years]
print(ages)

['2014', '2010', '2017', '2016', '2013']
[4, 8, 1, 2, 5]


In [24]:
# Conditional list comprehensions for time-stamped data

# Extract the Birthday column from df
df = pd.read_csv('my_dogs.csv', index_col='Name')
dog_birthdays = df['Birthday']
print(dog_birthdays)
print()

# Birthdays are 'DD-MM-YYYY' strings, so characters 3-4 hold the month.
# Keep only the birthdays that fall in June, July or August.
summer_birthdays = [birthday for birthday in dog_birthdays if 6 <= int(birthday[3:5]) <= 8]
print(summer_birthdays)

Name
Bella        22-03-2014
Max          14-01-2010
Annabelle    06-11-2017
Daisy        05-05-2016
Maggie       17-08-2013
Name: Birthday, dtype: object

['17-08-2013']
