In [92]:
import pandas as pd

# Introduction to Iterators

* An iterable is an object that has an associated `__iter__()` method, so it can be passed to `iter()`.
* An iterator is an object that has an associated `__next__()` method, so it can be passed to `next()`.
* Create an iterator from an iterable using the `iter()` function.

Summary
* An iterable is an object that can return an iterator 
* An iterator is an object that keeps state and produces the next value when you call next()

In [93]:
phrase = 'Howdy my name is Servin'
variable = iter(phrase)

# Consume (and discard) the first three characters: 'H', 'o', 'w'.
for skipped in range(3):
    next(variable)

# Unpacking drains whatever the iterator has left, so only the remainder is printed.
print(*variable)

d y   m y   n a m e   i s   S e r v i n


## Iterating through Dictionaries

In [94]:
info = {
    'first_name': 'Jose',
    'last_name': 'Servin',
    'age': 25,
    'Goal': 'Become a Full Data Scientist/Analyst in 2022',
}

# .items() yields (key, value) pairs; the f-string also formats non-string values such as the integer age.
for key, value in info.items():
    print(f'The attribute is: {key}')
    print(f'The value is: {value}')

The attribute is: first_name
The value is: Jose
The attribute is: last_name
The value is: Servin
The attribute is: age
The value is: 25
The attribute is: Goal
The value is: Become a Full Data Scientist/Analyst in 2022


# Enumerate Function

* allows us to add a counter to any iterable 

In [95]:
names = ['Baker','Bella','Camila','Claudia','Mayra','Melissa']

for index, value in enumerate(names):
    print(index, value)

0 Baker
1 Bella
2 Camila
3 Claudia
4 Mayra
5 Melissa


In [96]:
names = ['Baker','Bella','Camila','Claudia','Mayra','Melissa']

for index, value in enumerate(names, start=1):
    print(index, value)

1 Baker
2 Bella
3 Camila
4 Claudia
5 Mayra
6 Melissa


## using enumerate

In [97]:
# Create a list of strings: mutants
mutants = [
    'charles xavier',
    'bobby drake',
    'kurt wagner',
    'max eisenhardt',
    'kitty pryde',
]

# Create a list of (index, name) tuples: mutant_list
mutant_list = [*enumerate(mutants)]

# Print the list of tuples
print(mutant_list)

# Print only the name held in each (index, name) tuple
for position, name in mutant_list:
    print(name)

# Unpack and print the tuple pairs, first counting from 0 and then from 1
for first_index in (0, 1):
    for index1, value1 in enumerate(mutants, start=first_index):
        print(index1, value1)

[(0, 'charles xavier'), (1, 'bobby drake'), (2, 'kurt wagner'), (3, 'max eisenhardt'), (4, 'kitty pryde')]
charles xavier
bobby drake
kurt wagner
max eisenhardt
kitty pryde
0 charles xavier
1 bobby drake
2 kurt wagner
3 max eisenhardt
4 kitty pryde
1 charles xavier
2 bobby drake
3 kurt wagner
4 max eisenhardt
5 kitty pryde


# zip function

* allows us to stitch together any number of iterables 
* accepts an arbitrary number of iterables and returns an iterator of tuples

In [98]:
breeds = ['dog', 'dog', 'cat', 'dog']
names = ['Baker', 'Lilo', 'Bella', 'Camila']

# zip() pairs the items by position; each pair is unpacked into the two loop variables.
for pet_name, breed in zip(names, breeds):
    print(pet_name, breed)

Baker dog
Lilo dog
Bella cat
Camila dog


In [99]:
# zip() returns a lazy, single-use iterator; list() consumes it so the pairs can be displayed.
# (A bare type(zip_data) in the middle of the cell was dropped: only the last expression
# of a cell is displayed, so its result was silently discarded.)
zip_data = zip(breeds, names)
list(zip_data)

[('dog', 'Baker'), ('dog', 'Lilo'), ('cat', 'Bella'), ('dog', 'Camila')]

In [100]:
names_dict = {'Jose':'Servin', 'Baker':'Servin','Camila':'Servin'}
address_dict = {'Jose':'Dallas','Baker':'Houston','Camila':'Houston'}

In [101]:
# Iterating over a dict yields its keys, so zip() pairs key with key here; the dict values are not used.
zip_dict = zip(names_dict, address_dict)
list(zip_dict)

[('Jose', 'Jose'), ('Baker', 'Baker'), ('Camila', 'Camila')]

# Using iterators to load large files into memory

## load data in chunks using an iterator

In [102]:
# Basic use-case for an iterator: with chunksize set, read_csv yields the file 12 rows at a time.
# The built-in sum() propagates NaN, so a chunk containing a missing value shows up as nan.
MLY_CLDD_BASE45 = [
    sum(chunk['MLY-CLDD-BASE45'])
    for chunk in pd.read_csv('/Users/joseservin/DataCamp/Courses/Intro_Matplotlib/seattle_weather.csv', chunksize=12)
]

print(MLY_CLDD_BASE45)

[2614, 2813, 3257, 3504, 3055, nan, 3301, 3395, nan, 2602, 3408, 1973, 3030, 3048, 2963, 3098, 3465]


In [103]:
# Running total over the whole file, accumulated one 12-row chunk at a time.
# Series.sum() is vectorized and skips NaN by default (skipna=True), so neither a
# per-element Python sum() nor an explicit fillna(0) is needed.
total = 0
for chunk in pd.read_csv('/Users/joseservin/DataCamp/Courses/Intro_Matplotlib/seattle_weather.csv', chunksize=12):
    total += chunk['MLY-CLDD-BASE45'].sum()
print(total)

45526.0


# Processing large amounts of Twitter Data

## Using chunksize 

In [104]:
counts_dict = {}

# Tally the values of the 'lang' column, reading the CSV ten rows at a time.
for chunk in pd.read_csv('/Users/joseservin/DataCamp/Courses/Python_Toolbox/tweets.csv', chunksize=10):
    for entry in chunk['lang']:
        # .get() supplies 0 the first time a value is seen.
        counts_dict[entry] = counts_dict.get(entry, 0) + 1
print(counts_dict)


{'en': 97, 'et': 1, 'und': 2}


# Counting entries per column value using chunksize

## Using a function and chunksize

In [105]:
def count_entries(csv_name, chunk_size, col_name):
    """Count how often each distinct value occurs in one column of a CSV file.

    The file is read through ``pd.read_csv(..., chunksize=chunk_size)``, so it
    is processed ``chunk_size`` rows at a time.

    Args:
        csv_name: Path (or anything ``pd.read_csv`` accepts) of the CSV file.
        chunk_size: Number of rows per chunk.
        col_name: Name of the column whose values are counted.

    Returns:
        dict mapping each value found in ``col_name`` to its number of
        occurrences, in first-seen order.
    """
    tally = {}
    for frame in pd.read_csv(csv_name, chunksize=chunk_size):
        for item in frame[col_name]:
            # .get() supplies 0 the first time a value is seen.
            tally[item] = tally.get(item, 0) + 1
    return tally
    

In [106]:
results = count_entries('/Users/joseservin/DataCamp/Courses/Python_Toolbox/tweets.csv', 10, 'lang')
print(results)

{'en': 97, 'et': 1, 'und': 2}


# List Comprehensions 

* Collapse for-loops for building lists into a single line
* components
    * iterable 
    * iterator variable (represents each member of the iterable)
    * output expression
    

In [107]:
nums = [1,2,3,4,5,6,7]
new_nums = [i + 1 for i in nums]
print(new_nums)

[2, 3, 4, 5, 6, 7, 8]


## Matrix building

In [108]:
# Five independent rows, each holding the column indices 0 through 4.
matrix = [list(range(5)) for _ in range(5)]

In [109]:
for row in matrix:
    print(row)

[0, 1, 2, 3, 4]
[0, 1, 2, 3, 4]
[0, 1, 2, 3, 4]
[0, 1, 2, 3, 4]
[0, 1, 2, 3, 4]


## Conditionals in Comprehensions 

In [110]:
[num ** 2 for num in range(0,11) if num % 2 == 0]

[0, 4, 16, 36, 64, 100]

## Conditionals on the output expression

In [111]:
[num ** 2 if num % 2 == 0 else 0 for num in range(10)]

[0, 0, 4, 0, 16, 0, 36, 0, 64, 0]

## Dictionary Comprehension

In [112]:
{num : -num for num in range(11)}

{0: 0, 1: -1, 2: -2, 3: -3, 4: -4, 5: -5, 6: -6, 7: -7, 8: -8, 9: -9, 10: -10}

# Introduction to generator expressions

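* Like a list comprehension, except that the resulting list is not built and stored in memory.
* A generator expression returns a generator object, which produces its values lazily, one at a time.
* Created by using parentheses `()` instead of the square brackets `[]` of a list comprehension.
* Both list comprehensions and generators can be iterated over.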

In [113]:
results  = (i for i in range(6))

In [114]:
type(results)

generator

In [115]:
for i in results:
    print(i)

0
1
2
3
4
5


In [116]:
results  = (i for i in range(6))

In [117]:
list(results)

[0, 1, 2, 3, 4, 5]

## Conditionals in Generator Expressions

In [118]:
even_nums = (i for i in range(11) if i % 2 == 0)

In [119]:
type(even_nums)

generator

In [120]:
list(even_nums)

[0, 2, 4, 6, 8, 10]

## Generator Functions

* produce generator objects when called
* Yields a sequence of values instead of returning a single value 
* Generates a value with 'yield' keyword

In [121]:
def num_sequence(n):
    """Yield the numbers 0, 1, 2, ... up to and including n.

    Calling the function returns a generator; nothing is yielded when n is
    below 0.
    """
    current = 0
    while True:
        if not (current <= n):
            return
        # Yield before incrementing so that the starting value 0 is produced too.
        yield current
        current += 1

In [122]:
return_var = num_sequence(8)

In [123]:
type(return_var)

generator

In [124]:
list(return_var)

[0, 1, 2, 3, 4, 5, 6, 7, 8]

In [125]:
def return_even(n):
    """Yield the even numbers from 0 up to and including n, in ascending order.

    Args:
        n: Upper bound (inclusive). Nothing is yielded when n is below 0.

    Yields:
        0, 2, 4, ... for as long as the value does not exceed n.
    """
    i = 0
    while i <= n:
        yield i
        # Starting at 0 and stepping by 2 visits only even numbers, so no
        # parity test (and no empty else branch) is needed.
        i += 2



In [126]:
return_var = return_even(30)

In [127]:
list(return_var)

[0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]

In [128]:
# Create a list of strings
lannister = ['cersei', 'jaime', 'tywin', 'tyrion', 'joffrey']

# Define generator function get_lengths
def get_lengths(input_list):
    """Generator function producing len(item) for each item of input_list, in order."""
    # map() is lazy, so each length is computed only when the caller asks for it.
    yield from map(len, input_list)

# Print the values generated by get_lengths()
for value in get_lengths(lannister):
    print(value)

6
5
5
6
7


# Twitter Data Analysis

In [129]:
df = pd.read_csv('/Users/joseservin/DataCamp/Courses/Python_Toolbox/tweets.csv')

In [130]:
df.head()

Unnamed: 0,contributors,coordinates,created_at,entities,extended_entities,favorite_count,favorited,filter_level,geo,id,...,quoted_status_id,quoted_status_id_str,retweet_count,retweeted,retweeted_status,source,text,timestamp_ms,truncated,user
0,,,Tue Mar 29 23:40:17 +0000 2016,"{'hashtags': [], 'user_mentions': [{'screen_na...","{'media': [{'sizes': {'large': {'w': 1024, 'h'...",0,False,low,,714960401759387648,...,,,0,False,"{'retweeted': False, 'text': "".@krollbondratin...","<a href=""http://twitter.com"" rel=""nofollow"">Tw...",RT @bpolitics: .@krollbondrating's Christopher...,1459294817758,False,"{'utc_offset': 3600, 'profile_image_url_https'..."
1,,,Tue Mar 29 23:40:17 +0000 2016,"{'hashtags': [{'text': 'cruzsexscandal', 'indi...","{'media': [{'sizes': {'large': {'w': 500, 'h':...",0,False,low,,714960401977319424,...,,,0,False,"{'retweeted': False, 'text': '@dmartosko Cruz ...","<a href=""http://twitter.com"" rel=""nofollow"">Tw...",RT @HeidiAlpine: @dmartosko Cruz video found.....,1459294817810,False,"{'utc_offset': None, 'profile_image_url_https'..."
2,,,Tue Mar 29 23:40:17 +0000 2016,"{'hashtags': [], 'user_mentions': [], 'symbols...",,0,False,low,,714960402426236928,...,,,0,False,,"<a href=""http://www.facebook.com/twitter"" rel=...",Njihuni me Zonjën Trump !!! | Ekskluzive https...,1459294817917,False,"{'utc_offset': 7200, 'profile_image_url_https'..."
3,,,Tue Mar 29 23:40:17 +0000 2016,"{'hashtags': [], 'user_mentions': [], 'symbols...",,0,False,low,,714960402367561730,...,7.149239e+17,7.149239e+17,0,False,,"<a href=""http://twitter.com/download/android"" ...",Your an idiot she shouldn't have tried to grab...,1459294817903,False,"{'utc_offset': None, 'profile_image_url_https'..."
4,,,Tue Mar 29 23:40:17 +0000 2016,"{'hashtags': [], 'user_mentions': [{'screen_na...",,0,False,low,,714960402149416960,...,,,0,False,"{'retweeted': False, 'text': 'The anti-America...","<a href=""http://twitter.com/download/iphone"" r...",RT @AlanLohner: The anti-American D.C. elites ...,1459294817851,False,"{'utc_offset': -18000, 'profile_image_url_http..."


In [131]:
tweet_time = df['created_at']

In [132]:
# Characters 11-18 of a created_at string such as 'Tue Mar 29 23:40:17 +0000 2016' hold the HH:MM:SS clock time.
tweet_clock_time = [entry[11:19] for entry in tweet_time]

In [133]:
tweet_clock_time

['23:40:17',
 '23:40:17',
 '23:40:17',
 '23:40:17',
 '23:40:17',
 '23:40:17',
 '23:40:18',
 '23:40:17',
 '23:40:18',
 '23:40:18',
 '23:40:18',
 '23:40:17',
 '23:40:18',
 '23:40:18',
 '23:40:17',
 '23:40:18',
 '23:40:18',
 '23:40:17',
 '23:40:18',
 '23:40:17',
 '23:40:18',
 '23:40:18',
 '23:40:18',
 '23:40:18',
 '23:40:17',
 '23:40:18',
 '23:40:18',
 '23:40:17',
 '23:40:18',
 '23:40:18',
 '23:40:18',
 '23:40:18',
 '23:40:18',
 '23:40:18',
 '23:40:18',
 '23:40:18',
 '23:40:18',
 '23:40:18',
 '23:40:18',
 '23:40:18',
 '23:40:18',
 '23:40:18',
 '23:40:18',
 '23:40:18',
 '23:40:18',
 '23:40:18',
 '23:40:18',
 '23:40:18',
 '23:40:18',
 '23:40:18',
 '23:40:18',
 '23:40:18',
 '23:40:18',
 '23:40:18',
 '23:40:18',
 '23:40:18',
 '23:40:18',
 '23:40:18',
 '23:40:18',
 '23:40:18',
 '23:40:19',
 '23:40:18',
 '23:40:18',
 '23:40:18',
 '23:40:19',
 '23:40:19',
 '23:40:19',
 '23:40:18',
 '23:40:19',
 '23:40:19',
 '23:40:19',
 '23:40:18',
 '23:40:19',
 '23:40:19',
 '23:40:19',
 '23:40:18',
 '23:40:19',

In [134]:
tweet_time = df['created_at']

In [135]:
# Keep only the tweets whose seconds field (characters 17-18 of created_at) equals '19'; output is the HH:MM:SS part.
tweet_clock_time = [entry[11:19] for entry in tweet_time if entry[17:19] == '19']

In [136]:
tweet_clock_time

['23:40:19',
 '23:40:19',
 '23:40:19',
 '23:40:19',
 '23:40:19',
 '23:40:19',
 '23:40:19',
 '23:40:19',
 '23:40:19',
 '23:40:19',
 '23:40:19',
 '23:40:19',
 '23:40:19',
 '23:40:19',
 '23:40:19',
 '23:40:19',
 '23:40:19',
 '23:40:19',
 '23:40:19',
 '23:40:19',
 '23:40:19',
 '23:40:19',
 '23:40:19',
 '23:40:19',
 '23:40:19',
 '23:40:19',
 '23:40:19',
 '23:40:19',
 '23:40:19',
 '23:40:19',
 '23:40:19',
 '23:40:19',
 '23:40:19',
 '23:40:19']