# The Series Data Structure

In [None]:
import pandas as pd
pd.Series?

In [None]:
#How to create a series
animals = ['Tiger', 'Bear', 'Moose']
pd.Series(animals)

In [None]:
numbers = [1, 2, 3]
pd.Series(numbers)

In [None]:
#How series handles missing values
animals = ['Tiger', 'Bear', None]
pd.Series(animals)

In [None]:
numbers = [1, 2, None]
pd.Series(numbers)

In [None]:
#None is not nan
import numpy as np
np.nan == None

In [None]:
np.nan == np.nan

In [None]:
#how to test for the presence of not a number
np.isnan(np.nan)

In [None]:
#Creating a series from a dictionary, keys become indices
sports = {'Archery': 'Bhutan',
          'Golf': 'Scotland',
          'Sumo': 'Japan',
          'Taekwondo': 'South Korea'}
s = pd.Series(sports)
s

In [None]:
s.index

In [None]:
#You could also separate your index creation from the data by passing in the index as a list explicitly to the series.
s = pd.Series(['Tiger', 'Bear', 'Moose'], index=['India', 'America', 'Canada'])
s

In [None]:
'''if your list of values in the index object are not aligned with the keys in your dictionary for creating the series,
pandas overrides the automatic creation to favor only and all of the indices values that you provided.'''

# Of the requested labels only 'Golf' and 'Sumo' are keys of the dictionary; 'Hockey' is not,
# so that entry of the resulting series holds a missing value.
sports = dict(Archery='Bhutan', Golf='Scotland', Sumo='Japan', Taekwondo='South Korea')
s = pd.Series(data=sports, index=['Golf', 'Sumo', 'Hockey'])
s

# Querying a Series

In [None]:
#A pandas.Series can be queried, either by the index position or the index label.
sports = {'Archery': 'Bhutan',
          'Golf': 'Scotland',
          'Sumo': 'Japan',
          'Taekwondo': 'South Korea'}
s = pd.Series(sports)
s

In [None]:
#To query by numeric location, starting at zero, use the iloc attribute.
s.iloc[3]

In [None]:
#To query by the index label, you can use the loc attribute.
s.loc['Golf']

In [None]:
'''Pandas tries to make our code a bit more readable and lets you use the indexing operator directly on the 
series itself. If you pass in a label (for example s['Golf']), it will query as if you wanted to use the label based 
loc attribute. Older pandas versions also treated an integer passed to a series with a non-integer index as a position, 
as if you had used iloc, but that fallback is deprecated since pandas 2.1, so query by position through iloc explicitly.'''
s.iloc[3]

In [None]:
s['Golf']

In [None]:
'''So what happens if your index is a list of integers? This is a bit complicated, and Pandas can't determine automatically 
whether you're intending to query by index position or index label. '''

sports = {99: 'Bhutan',
          100: 'Scotland',
          101: 'Japan',
          102: 'South Korea'}
s = pd.Series(sports)

In [None]:
#s[0] #This won't call s.iloc[0] as one might expect, it generates an error instead

In [None]:
#Working with data
s = pd.Series([100.00, 120.00, 101.00, 3.00])
s

In [None]:
#A typical programmatic approach to this would be to iterate over all the items in the series......slow
total = 0
for item in s:
    total+=item
print(total)

In [None]:
#A faster way is to use the built-in functions in Pandas and Numpy
total = np.sum(s)
print(total)

In [None]:
#this creates a big series of random numbers
s = pd.Series(np.random.randint(0,1000,10000))
s.head()

In [None]:
len(s)

In [None]:
%%timeit -n 100
summary = 0
for item in s:
    summary+=item

In [None]:
%%timeit -n 100
summary = np.sum(s)

In [None]:
%%timeit -n 10
s = pd.Series(np.random.randint(0,1000,1000))
# Series.iteritems() was removed in pandas 2.0; Series.items() yields the same (label, value) pairs.
# Updating one label at a time is deliberately the slow approach being timed here.
for label, value in s.items():
    s.loc[label]= value+2

In [None]:
%%timeit -n 10
s = pd.Series(np.random.randint(0,1000,10000))
s+=2


In [None]:
#The .loc attribute lets you not only modify data in place, but also add new data as well
s = pd.Series([1, 2, 3])
s.loc['Animal'] = 'Bears' #We see that mixed types for data values or index labels are no problem for Pandas
s

In [None]:
#Pandas series can work with index values that are not unique
original_sports = pd.Series({'Archery': 'Bhutan',
                             'Golf': 'Scotland',
                             'Sumo': 'Japan',
                             'Taekwondo': 'South Korea'})
cricket_loving_countries = pd.Series(['Australia',
                                      'Barbados',
                                      'Pakistan',
                                      'England'], 
                                   index=['Cricket',
                                          'Cricket',
                                          'Cricket',
                                          'Cricket'])
# Series.append() was removed in pandas 2.0. pd.concat() likewise returns a new series made of the
# first series' entries followed by the second's, and leaves both input series unchanged.
all_countries = pd.concat([original_sports, cricket_loving_countries])

In [None]:
original_sports

In [None]:
cricket_loving_countries

In [None]:
all_countries

In [None]:
all_countries.loc['Cricket']

# The DataFrame Data Structure

In [None]:
'''The DataFrame is conceptually a two-dimensional series object, where there's an index and multiple 
columns of content, with each column having a label'''

# Each purchase is a series; its labels ('Name', 'Item Purchased', 'Cost') become the DataFrame's columns.
purchase_1 = pd.Series({'Name': 'Chris', 'Item Purchased': 'Dog Food', 'Cost': 22.50})
purchase_2 = pd.Series({'Name': 'Kevyn', 'Item Purchased': 'Kitty Litter', 'Cost': 2.50})
purchase_3 = pd.Series({'Name': 'Vinod', 'Item Purchased': 'Bird Seed', 'Cost': 5.00})

# One row per purchase, labelled by store; 'Store 1' is used twice, so the row index is not unique.
df = pd.DataFrame(
    data=[purchase_1, purchase_2, purchase_3],
    index=['Store 1', 'Store 1', 'Store 2'],
)
df.head()

In [None]:
#Similar to the series, we can extract data using the iloc and loc attributes
df.loc['Store 2']

In [None]:
#DataFrame is two-dimensional, passing a single value to the loc operator will return a series if there's only one row to return
type(df.loc['Store 2'])

In [None]:
df.loc['Store 1']

In [None]:
#can select data based on multiple axes
df.loc['Store 1', 'Cost']

In [None]:
df.loc['Store 1']['Cost']

In [None]:
df.T

In [None]:
'''Since iloc and loc are used for row selection, the Panda's developers reserved indexing operator directly on 
the DataFrame for column selection'''
df['Cost']

#### Quiz: For the purchase records from the pet store, how would you get a list of all items which had been purchased (regardless of where they might have been purchased, or by whom)?

In [None]:
#Showing specific columns
df.loc[:,['Name', 'Cost']]

In [None]:
#It's easy to delete data in series and DataFrames, and we can use the drop function to do so.
df.drop('Store 1')

In [None]:
'''The drop function doesn't change the DataFrame by default. And instead, returns to you a copy of the DataFrame 
with the given rows removed'''
df

In [None]:
copy_df = df.copy()
copy_df = copy_df.drop('Store 1')
copy_df

In [None]:
copy_df.drop?

In [None]:
#There is a second way to drop a column. And that's directly through the use of the indexing operator, using the del keyword.
del copy_df['Name']
copy_df

In [None]:
#Adding a new column to the DataFrame
df['Location'] = None
df

#### Quiz: For the purchase records from the pet store, how would you update the DataFrame, applying a discount of 20% across
all the values in the 'Cost' column?

# Dataframe Indexing and Loading

In [None]:
''' Panda's toolkit tries to give you views on a DataFrame. This is much faster than copying data and much more memory 
efficient too'''

costs = df['Cost']
costs

In [None]:
'''But it does mean that if you're manipulating the data you have to be aware that any changes to the DataFrame you're working 
on may have an impact on the base data frame you used originally.'''

# NOTE(review): whether the in-place += below also changes df depends on the pandas version. With
# Copy-on-Write (opt-in in pandas 2.x, the default from pandas 3.0) costs is expected to stop sharing
# data with df on this write, leaving df unchanged. Confirm against the installed pandas version.
costs+=2
costs

In [None]:
df

In [None]:
'''Pandas has built-in support for delimited files such as CSV files as well as a variety of other data formats including 
Excel, and HTML

olympics.csv, which has data from Wikipedia that contains a summary list of the medal various countries have won 
at the Olympics'''

df = pd.read_csv('olympics.csv')
df.head()

In [None]:
'''Let's re-import that data, setting the index to column 0 (the first column) and skipping the first row of the file 
so that the column headers are read from the second row of data.'''

df = pd.read_csv('olympics.csv', index_col = 0, skiprows=1)
df.head()

In [None]:
#pandas stores a list of all of the columns in the .columns attribute.
df.columns

In [None]:
#We can change the values of the column names by mapping every existing name through a helper and renaming once
def _readable_medal_column(col):
    """Return a readable name for one raw column label.

    Labels starting with '01', '02' or '03' get 'Gold', 'Silver' or 'Bronze' in place of their first
    four characters, a leading '№' is replaced by '#', and any other label is returned unchanged.
    """
    medal_prefixes = {'01': 'Gold', '02': 'Silver', '03': 'Bronze'}
    if col[:2] in medal_prefixes:
        # Everything after the four-character prefix is kept as-is.
        return medal_prefixes[col[:2]] + col[4:]
    if col[:1]=='№':
        return '#' + col[1:]
    return col

# A single rename that returns a new frame replaces one in-place rename per matching column.
df = df.rename(columns=_readable_medal_column)

df.head()

#### Quiz: Suppose we have a CSV file exercise.csv that looks like this:

Exercise CSV			
![exercisecsv.png](exercisecsv.png)
Write code that would return a DataFrame with the columns = ['Activity Type', 'Activity Duration', 'Calories'] and index = [125, 126, 127, 128] with the name 'Activity ID'?

# Querying a DataFrame

In [None]:
#Boolean masking
df['Gold'] > 0

In [None]:
only_gold = df.where(df['Gold'] > 0)
only_gold.head()

In [None]:
only_gold['Gold'].count()

In [None]:
only_gold = only_gold.dropna()
only_gold.head()

In [None]:
only_gold = df[df['Gold'] > 0]
only_gold.head()

In [None]:
len(df[(df['Gold'] > 0) | (df['Gold.1'] > 0)])

In [None]:
df[(df['Gold.1'] > 0) & (df['Gold'] == 0)]

# Indexing Dataframes

In [None]:
df.head()

In [None]:
df['country'] = df.index
df = df.set_index('Gold')
df.head()

In [None]:
df = df.reset_index()
df.head()

In [None]:
df = pd.read_csv('census.csv')
df.head()

In [None]:
df['SUMLEV'].unique()

In [None]:
df=df[df['SUMLEV'] == 50]
df.head()

In [None]:
columns_to_keep = ['STNAME',
                   'CTYNAME',
                   'BIRTHS2010',
                   'BIRTHS2011',
                   'BIRTHS2012',
                   'BIRTHS2013',
                   'BIRTHS2014',
                   'BIRTHS2015',
                   'POPESTIMATE2010',
                   'POPESTIMATE2011',
                   'POPESTIMATE2012',
                   'POPESTIMATE2013',
                   'POPESTIMATE2014',
                   'POPESTIMATE2015']
df = df[columns_to_keep]
df.head()

In [None]:
df = df.set_index(['STNAME', 'CTYNAME'])
df.head()

In [None]:
df.loc['Michigan', 'Washtenaw County']

In [None]:
df.loc[ [('Michigan', 'Washtenaw County'),
         ('Michigan', 'Wayne County')] ]

# Missing values

In [None]:
df = pd.read_csv('log.csv')
df

In [None]:
df.fillna?

In [None]:
df = df.set_index('time')
df = df.sort_index()
df

In [None]:
df = df.reset_index()
df = df.set_index(['time', 'user'])
df

In [None]:
# DataFrame.fillna(method='ffill') is deprecated since pandas 2.1; DataFrame.ffill() performs the same
# forward fill, carrying the last valid value in each column into the missing entries that follow it.
df = df.ffill()
df.head()