
# Python Data Science Toolbox (Part 1)

In [1]:
# Read the tweets CSV (relative path) into the DataFrame `df`.
import pandas as pd
df = pd.read_csv("../datasets/tweets.csv")
# Last expression of the cell: display the first three rows.
df.head(3)

Unnamed: 0,contributors,coordinates,created_at,entities,extended_entities,favorite_count,favorited,filter_level,geo,id,...,quoted_status_id,quoted_status_id_str,retweet_count,retweeted,retweeted_status,source,text,timestamp_ms,truncated,user
0,,,Tue Mar 29 23:40:17 +0000 2016,"{'hashtags': [], 'user_mentions': [{'screen_na...","{'media': [{'sizes': {'large': {'w': 1024, 'h'...",0,False,low,,714960401759387648,...,,,0,False,"{'retweeted': False, 'text': "".@krollbondratin...","<a href=""http://twitter.com"" rel=""nofollow"">Tw...",RT @bpolitics: .@krollbondrating's Christopher...,1459294817758,False,"{'utc_offset': 3600, 'profile_image_url_https'..."
1,,,Tue Mar 29 23:40:17 +0000 2016,"{'hashtags': [{'text': 'cruzsexscandal', 'indi...","{'media': [{'sizes': {'large': {'w': 500, 'h':...",0,False,low,,714960401977319424,...,,,0,False,"{'retweeted': False, 'text': '@dmartosko Cruz ...","<a href=""http://twitter.com"" rel=""nofollow"">Tw...",RT @HeidiAlpine: @dmartosko Cruz video found.....,1459294817810,False,"{'utc_offset': None, 'profile_image_url_https'..."
2,,,Tue Mar 29 23:40:17 +0000 2016,"{'hashtags': [], 'user_mentions': [], 'symbols...",,0,False,low,,714960402426236928,...,,,0,False,,"<a href=""http://www.facebook.com/twitter"" rel=...",Njihuni me ZonjÃ«n Trump !!! | Ekskluzive http...,1459294817917,False,"{'utc_offset': 7200, 'profile_image_url_https'..."


## 1. Writing your own functions

### User-defined functions

In [1]:
def shout(word):
    """Return ``word`` with three exclamation marks appended.

    Nothing is printed here; the caller decides what to do with the result.
    """
    return word + '!!!'

# Call shout() and print the string it returns.
yell = shout('congratulations')
print(yell)

congratulations!!!


### Multiple parameters and return values

In [2]:
def shout(word1, word2):
    """Return both words, each followed by '!!!', joined into one string.

    Example: ``shout('a', 'b')`` gives ``'a!!!b!!!'``.
    """
    bangs = '!!!'
    return (word1 + bangs) + (word2 + bangs)

# Call the two-parameter shout() and print the combined string.
yell = shout('congratulations','you')
print(yell)

congratulations!!!you!!!


In [3]:
def shout_all(word1, word2):
    """Return a 2-tuple holding each word followed by '!!!'.

    The tuple can be unpacked by the caller to receive both values.
    """
    return word1 + '!!!', word2 + '!!!'

# Unpack the returned tuple into two names and print each one.
yell1, yell2 = shout_all('congratulations','you')
print(yell1)
print(yell2)

congratulations!!!
you!!!


### Bringing it all together

In [8]:
def count_entries(df, col_name):
    """Count how often each value occurs in one column.

    Args:
        df: object indexable by ``col_name`` (a pandas DataFrame in this
            notebook).
        col_name: label of the column to tally.

    Returns:
        dict mapping each distinct entry to its number of occurrences,
        in order of first appearance.
    """
    tally = {}
    for value in df[col_name]:
        # dict.get supplies 0 the first time a value is seen.
        tally[value] = tally.get(value, 0) + 1
    return tally

# Tally the values of the 'lang' column of `df` and print the resulting dict.
result = count_entries(df,'lang')
print(result)

{'en': 97, 'et': 1, 'und': 2}


### Congratulations!

## 2. Default arguments, variable-length arguments and scope

### Scope and user-defined functions

In [10]:
# Module-level (global) name that change_team() rebinds.
team = "teen titans"

def change_team():
    """Change the value of the global variable ``team``."""
    # `global` makes the assignment below rebind the module-level name
    # instead of creating a new local variable.
    global team
    team = "justice league"
    
# Value before the call.
print(team)

# After the call the global name refers to the new string.
change_team()
print(team)

teen titans
justice league


### Nested functions

In [11]:
def three_shouts(word1, word2, word3):
    """Return a 3-tuple with '!!!' appended to each of the given words."""

    def add_bangs(text):
        """Append '!!!' to a single string."""
        return text + '!!!'

    # Apply the nested helper to the three arguments in order.
    return tuple(add_bangs(item) for item in (word1, word2, word3))

# Print the tuple returned for three one-letter words.
print(three_shouts('a', 'b', 'c'))

('a!!!', 'b!!!', 'c!!!')


In [12]:
def echo_shout(word):
    """Print ``word`` doubled, then print it again with '!!!' appended.

    The suffix is added by a nested function that rebinds this function's
    local variable through ``nonlocal``. Returns None.
    """
    doubled = word + word
    print(doubled)

    def add_bangs():
        """Rebind ``doubled`` in the enclosing scope with '!!!' appended."""
        nonlocal doubled
        doubled = doubled + '!!!'

    add_bangs()
    # The change made inside the nested function is visible here.
    print(doubled)

# Prints the doubled word twice: before and after the nested function runs.
echo_shout('hello')

hellohello
hellohello!!!


### Default and flexible arguments

In [2]:
def shout_echo(word1, echo=1, intense=False):
    """Repeat ``word1`` ``echo`` times and append '!!!'.

    When ``intense`` is exactly ``True`` the repeated text is upper-cased
    before the exclamation marks are added; any other value leaves the
    case untouched.
    """
    repeated = word1 * echo
    if intense is True:
        repeated = repeated.upper()
    return repeated + '!!!'


# Positional arguments for all three parameters.
with_big_echo = shout_echo("Hey",5,True)
# Keyword argument for `intense`; `echo` keeps its default of 1.
big_no_echo = shout_echo("Mayu",intense=True)

print(with_big_echo)
print(big_no_echo)

HEYHEYHEYHEYHEY!!!
MAYU!!!


In [3]:
# (*args)
# args is a tuple

def gibberish(*args):
    """Concatenate strings in *args together.

    Args:
        *args: any number of strings (collected into a tuple).

    Returns:
        A single string made of all arguments in call order; the empty
        string when called without arguments.

    Raises:
        TypeError: if any argument is not a string.
    """
    # str.join builds the result in one pass instead of creating a new
    # intermediate string for every argument.
    return "".join(args)


# *args accepts one argument ...
one_word = gibberish("luke")
# ... or several.
many_words = gibberish("luke", "leia", "han", "obi", "darth")

print(one_word)
print(many_words)

luke
lukeleiahanobidarth


In [4]:
# (**kwargs)
# kwargs is a dictionary

def report_status(**kwargs):
    """Print out the status of a movie character.

    Each keyword argument is printed on its own line as ``key: value``,
    framed by a BEGIN and an END banner.

    Args:
        **kwargs: attribute names mapped to their values. Values may be of
            any type; they are rendered with ``str.format``.

    Returns:
        None. The report is written to standard output.
    """
    print("\nBEGIN: REPORT\n")
    for key, value in kwargs.items():
        # format() converts non-string values (numbers, booleans, None)
        # instead of failing on str + non-str concatenation.
        print("{}: {}".format(key, value))

    print("\nEND REPORT")

# Each call prints one report built from its keyword arguments.
report_status(name="luke", affiliation="jedi",status="missing")
report_status(name="anakin", affiliation="sith lord", status="deceased")


BEGIN: REPORT

name: luke
affiliation: jedi
status: missing

END REPORT

BEGIN: REPORT

name: anakin
affiliation: sith lord
status: deceased

END REPORT


### Bringing it all together

In [8]:
def count_entries(df, col_name="lang"):
    """Tally the distinct values found in one column.

    Args:
        df: object indexable by ``col_name`` (a pandas DataFrame in this
            notebook).
        col_name: label of the column to tally; defaults to ``"lang"``.

    Returns:
        dict mapping each distinct entry to the number of times it occurs.
    """
    counts = {}
    for value in df[col_name]:
        if value not in counts:
            # First sighting: start this value's counter at zero.
            counts[value] = 0
        counts[value] += 1
    return counts

# No column given: the default col_name ("lang") is used.
result1 = count_entries(df)
# Explicit column name overrides the default.
result2 = count_entries(df,'source')

print(result1)
print(result2)

{'en': 97, 'et': 1, 'und': 2}
{'<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>': 24, '<a href="http://www.facebook.com/twitter" rel="nofollow">Facebook</a>': 1, '<a href="http://twitter.com/download/android" rel="nofollow">Twitter for Android</a>': 26, '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>': 33, '<a href="http://www.twitter.com" rel="nofollow">Twitter for BlackBerry</a>': 2, '<a href="http://www.google.com/" rel="nofollow">Google</a>': 2, '<a href="http://twitter.com/#!/download/ipad" rel="nofollow">Twitter for iPad</a>': 6, '<a href="http://linkis.com" rel="nofollow">Linkis.com</a>': 2, '<a href="http://rutracker.org/forum/viewforum.php?f=93" rel="nofollow">newzlasz</a>': 2, '<a href="http://ifttt.com" rel="nofollow">IFTTT</a>': 1, '<a href="http://www.myplume.com/" rel="nofollow">PlumeÂ forÂ Android</a>': 1}


In [11]:
# Define count_entries()
def count_entries(df, *args):
    """Tally the values of any number of columns into one dictionary.

    Args:
        df: object indexable by each label in ``args`` (a pandas DataFrame
            in this notebook).
        *args: column labels to tally, processed in the order given.

    Returns:
        A single dict mapping each distinct entry to its total number of
        occurrences across all requested columns; empty when no column
        label is passed.
    """
    counts = {}
    for label in args:
        for value in df[label]:
            # All columns share the same dict, so equal values found in
            # different columns are added together.
            counts[value] = counts.get(value, 0) + 1
    return counts

# One column label ...
result1 = count_entries(df, 'lang')
# ... and two: both columns are tallied into the same dictionary.
result2 = count_entries(df, 'lang', 'source')

print(result1)
print('----------')
print(result2)

{'en': 97, 'et': 1, 'und': 2}
----------
{'en': 97, 'et': 1, 'und': 2, '<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>': 24, '<a href="http://www.facebook.com/twitter" rel="nofollow">Facebook</a>': 1, '<a href="http://twitter.com/download/android" rel="nofollow">Twitter for Android</a>': 26, '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>': 33, '<a href="http://www.twitter.com" rel="nofollow">Twitter for BlackBerry</a>': 2, '<a href="http://www.google.com/" rel="nofollow">Google</a>': 2, '<a href="http://twitter.com/#!/download/ipad" rel="nofollow">Twitter for iPad</a>': 6, '<a href="http://linkis.com" rel="nofollow">Linkis.com</a>': 2, '<a href="http://rutracker.org/forum/viewforum.php?f=93" rel="nofollow">newzlasz</a>': 2, '<a href="http://ifttt.com" rel="nofollow">IFTTT</a>': 1, '<a href="http://www.myplume.com/" rel="nofollow">PlumeÂ forÂ Android</a>': 1}


## 3. Lambda functions and error-handling

### Lambda functions


In [1]:
# Bind a one-argument lambda that appends '!!!' to its input.
add_bangs = (lambda a: a + '!!!')
# Last expression of the cell: its value is shown as the cell output.
add_bangs("MAYU")

'MAYU!!!'

In [3]:
# Two-argument lambda: repeats word1 `echo` times via the * operator.
echo_word = (lambda word1,echo: word1 * echo)
result = echo_word('heY',5)
print(result)

heYheYheYheYheY


In [4]:
# BEFORE: mapping with a named function
def square(n):
    """Return n multiplied by itself."""
    return n*n
my_list = [2,3,4,5,6,7,8,9]
updated_list = map(square, my_list)
# Printing the map object itself shows only its repr, not the values.
print(updated_list)
# list() consumes the map object and yields the squared values.
print(list(updated_list))

<map object at 0x000001AD8E3F5A60>
[4, 9, 16, 25, 36, 49, 64, 81]


In [5]:
# NOW WITH LAMBDA ------ MAP
spells = ["protego", "accio", "expecto patronum", "legilimens"]
# The lambda appends '!!!' to every element of `spells`.
shout_spells = map(lambda item: item + '!!!' , spells)
# Convert the map object to a list so the values can be printed.
shout_spells_list = list(shout_spells)
print(shout_spells_list)

['protego!!!', 'accio!!!', 'expecto patronum!!!', 'legilimens!!!']


In [6]:
# FILTER ()
fellowship = ['frodo', 'samwise', 'merry', 'pippin', 'aragorn', 
              'boromir', 'legolas', 'gimli', 'gandalf']
# Keep only the names longer than six characters.
result = filter( lambda a: len(a)>6, fellowship)
# Convert the filter object to a list so the values can be printed.
result_list = list(result)
print(result_list)

['samwise', 'aragorn', 'boromir', 'legolas', 'gandalf']


In [7]:
# REDUCE
from functools import reduce 

stark = ['robb', 'sansa', 'arya', 'brandon', 'rickon']
# Fold the list pairwise with +, which concatenates the names into one string.
result = reduce(lambda item1,item2: item1+item2, stark)
print(result)

robbsansaaryabrandonrickon


### Introduction to error handling



In [8]:
def shout_echo(word1, echo=1):
    """Concatenate echo copies of word1 and three
    exclamation marks at the end of the string.

    Args:
        word1: the string to repeat.
        echo: how many copies of ``word1`` to concatenate (default 1).

    Returns:
        The repeated word followed by '!!!'. When the argument types are
        incompatible, a message is printed and the empty string is
        returned instead.
    """
    shout_words = ''
    try:
        shout_words = echo * word1 + '!!!'
    except TypeError:
        # Only a type mismatch is reported this way; any other exception
        # (including KeyboardInterrupt) propagates instead of being
        # mislabelled as a bad-argument problem.
        print("word1 must be a string and echo must be an integer.")
    return shout_words

# A string passed as `echo` cannot be multiplied with word1: the message is
# printed and the empty string is returned.
shout_echo("particle", echo="accelerator")

word1 must be a string and echo must be an integer.


''

In [11]:
# ERROR HANDLING WITH RAISE

def shout_echo(word1, echo=1):
    """Repeat ``word1`` ``echo`` times and finish with '!!!'.

    Args:
        word1: the string to repeat.
        echo: number of copies to concatenate (default 1).

    Returns:
        The repeated word followed by three exclamation marks.

    Raises:
        ValueError: if ``echo`` is negative.
    """
    # Reject negative repeat counts before doing any work.
    if echo < 0:
        raise ValueError('echo must be greater than or equal to 0')

    return word1 * echo + '!!!'

# echo is non-negative here, so the ValueError branch is not taken.
shout_echo("particle", echo=5)

'particleparticleparticleparticleparticle!!!'

### Bringing it all together


In [13]:
# Keep the entries of df['text'] whose first two characters are "RT".
result = filter(lambda x: x[0:2]=="RT", df['text'])

# Convert the filter object to a list, then print one entry per iteration.
res_list = list(result)
for tweet in res_list:
    print(tweet)

RT @bpolitics: .@krollbondrating's Christopher Whalen says Clinton is the weakest Dem candidate in 50 years https://t.co/pLk7rvoRSn https:/â€¦
RT @HeidiAlpine: @dmartosko Cruz video found.....racing from the scene.... #cruzsexscandal https://t.co/zuAPZfQDk3
RT @AlanLohner: The anti-American D.C. elites despise Trump for his America-first foreign policy. Trump threatens their gravy train. https:â€¦
RT @BIackPplTweets: Young Donald trump meets his neighbor  https://t.co/RFlu17Z1eE
RT @trumpresearch: @WaitingInBagdad @thehill Trump supporters have selective amnisia.
RT @HouseCracka: 29,000+ PEOPLE WATCHING TRUMP LIVE ON ONE STREAM!!!

https://t.co/7QCFz9ehNe
RT @urfavandtrump: RT for Brendon Urie
Fav for Donald Trump https://t.co/PZ5vS94lOg
RT @trapgrampa: This is how I see #Trump every time he speaks. https://t.co/fYSiHNS0nT
RT @trumpresearch: @WaitingInBagdad @thehill Trump supporters have selective amnisia.
RT @Pjw20161951: NO KIDDING: #SleazyDonald just attacked Scott Walker for NO

In [16]:
def count_entries(df, col_name='lang'):
    """Return a dictionary with counts of
    occurrences as value for each key.

    Args:
        df: pandas DataFrame to read from.
        col_name: label of the column to tally; defaults to ``'lang'``.

    Returns:
        dict mapping each distinct entry of the column to its number of
        occurrences, or None (after printing a message) when the column
        does not exist.
    """
    # Only the column lookup is guarded: a KeyError here means the column
    # is missing. Errors raised while counting are unrelated to that and
    # are allowed to propagate instead of being reported as a missing
    # column.
    try:
        col = df[col_name]
    except KeyError:
        # format() also copes with non-string column labels.
        print('The DataFrame does not have a {} column.'.format(col_name))
        return None

    cols_count = {}
    for entry in col:
        cols_count[entry] = cols_count.get(entry, 0) + 1
    return cols_count

# 'lang' is looked up in `df`; the resulting counts are printed.
result1 = count_entries(df, 'lang')

print(result1)

{'en': 97, 'et': 1, 'und': 2}


In [18]:
def count_entries(df, col_name='lang'):
    """Return a dictionary with counts of
    occurrences as value for each key.

    Args:
        df: pandas DataFrame to read from.
        col_name: label of the column to tally; defaults to ``'lang'``.

    Returns:
        dict mapping each distinct entry of the column to its number of
        occurrences.

    Raises:
        ValueError: if ``col_name`` is not one of ``df.columns``.
    """
    if col_name not in df.columns:
        # format() keeps this a ValueError even when col_name is not a
        # string (concatenating with + would raise TypeError instead).
        raise ValueError(
            'The DataFrame does not have a {} column.'.format(col_name))

    cols_count = {}
    for entry in df[col_name]:
        cols_count[entry] = cols_count.get(entry, 0) + 1
    return cols_count

# 'lang' is present in `df`, so no ValueError is raised and the counts are printed.
result1=count_entries(df,'lang')
print(result1)

{'en': 97, 'et': 1, 'und': 2}


In [2]:
def count_entries(df, col_name='lang'):
    """Return a dictionary with counts of
    occurrences as value for each key.

    Args:
        df: pandas DataFrame to read from.
        col_name: label of the column to tally; defaults to ``'lang'``.

    Returns:
        dict mapping each distinct entry of the column to its number of
        occurrences.

    Raises:
        ValueError: if ``col_name`` is not one of ``df.columns``.
    """
    if col_name not in df.columns:
        # format() keeps this a ValueError even when col_name is not a
        # string (concatenating with + would raise TypeError instead).
        raise ValueError(
            'The DataFrame does not have a {} column.'.format(col_name))

    # The membership check above already handles a missing column, so the
    # counting loop needs no catch-all handler: one would only hide
    # unrelated errors behind a misleading "missing column" message.
    cols_count = {}
    for entry in df[col_name]:
        cols_count[entry] = cols_count.get(entry, 0) + 1
    return cols_count


# 'lang' is present in `df`, so the counts are returned and printed.
result1=count_entries(df,'lang')
print(result1)

{'en': 97, 'et': 1, 'und': 2}


### Congratulations!