In [1]:
#---The Series Data Structure---
import pandas as pd

In [2]:
#a series can be built from any array-like object, for example a plain list
students = ['Alice', 'Jack', 'Molly']
pd.Series(data=students)

0    Alice
1     Jack
2    Molly
dtype: object

In [3]:
#the result above is a series object

#a list of numbers can be turned into a series in the same way
numbers = [1, 2, 3]
pd.Series(data=numbers)

0    1
1    2
2    3
dtype: int64

In [4]:
#the same student list, but with None as the last entry
students = ['Alice', 'Jack', None]
pd.Series(data=students)

0    Alice
1     Jack
2     None
dtype: object

In [5]:
#a list of numbers that contains a None value
numbers = [1, 2, None]
pd.Series(data=numbers)

0    1.0
1    2.0
2    NaN
dtype: float64

In [6]:
#pandas represents NaN as a floating point number. If a series built from a list of integers comes back as float,
#it's probably because there is some missing data.

#Note that None and NaN serve the same purpose but are not the same
import numpy as np

np.nan == None

False

In [7]:
#funnily enough, an equality test between NaN and itself is also False
np.nan == np.nan

False

In [8]:
#because of this, we need to use special functions to test for the presence of NaN
np.isnan(np.nan)

True

In [10]:
#we often have labelled data that we want to manipulate. this is where a series comes in handy
#a series can be created directly from dictionary data.
#the dictionary keys automatically become the index (the first column of the output)
students_scores = dict(Alice='Physics', Jack='Chemistry', Molly='English')
s = pd.Series(data=students_scores)
s

Alice      Physics
Jack     Chemistry
Molly      English
dtype: object

In [11]:
#the index of a series built from a dictionary holds the dictionary keys
s.index

Index(['Alice', 'Jack', 'Molly'], dtype='object')

In [12]:
#the object dtype is also used for arbitrary objects. here's a list of tuples
students = [
    ("Alice", "Brown"),
    ("Jack", "White"),
    ("Molly", "Green"),
]
pd.Series(data=students)

0    (Alice, Brown)
1     (Jack, White)
2    (Molly, Green)
dtype: object

In [14]:
#the index can also be created separately from the data by passing it explicitly, as a list, to the series
s = pd.Series(
    data=['Physics', 'Chemistry', 'English'],
    index=['Alice', 'Jack', 'Molly'],
)
s

Alice      Physics
Jack     Chemistry
Molly      English
dtype: object

In [17]:
#an explicit index overrides the automatic one: pandas keeps only, and all of, the index values you provide.
#dictionary keys that are not in the index are ignored, and index values that are missing from the
#dictionary get None/NaN type values
#for instance:
students_scores = dict(Alice='Physics', Jack='Chemistry', Molly='English')
#Jack is left out on purpose, and a third student (Sam) who has no entry is requested
s = pd.Series(data=students_scores, index=['Alice', 'Molly', 'Sam'])
s

Alice    Physics
Molly    English
Sam          NaN
dtype: object

In [18]:
#---Querying a Series---

#to query by numeric location, starting from zero, use the iloc attribute.
#to query by the index label, use the loc attribute

import pandas as pd
#one class per student; the student names become the index labels
students_classes = dict(
    Alice='Physics',
    Jack='Chemistry',
    Molly='English',
    Sam='History',
)
s = pd.Series(data=students_classes)
s

Alice      Physics
Jack     Chemistry
Molly      English
Sam        History
dtype: object

In [19]:
#to see the fourth entry (position 3, because positions start from zero)
s.iloc[3]

'History'

In [20]:
#to see Molly's class, query by her index label
s.loc['Molly']

'English'

In [21]:
#iloc and loc are not methods, but attributes.
#we don't use parentheses to query them, but [], which is called the indexing operator.

#the indexing operator can also be applied to the series directly, but integer keys are ambiguous:
#older pandas versions treated s[3] on a non-integer index as a positional lookup. that fallback was
#deprecated in pandas 2.1 and newer versions treat integer keys as labels, so s[3] fails here.
#to query by position, be explicit and use iloc
s.iloc[3]

'History'

In [22]:
#if you pass in a label, such as a string, it will query as if you wanted to use the label based loc attribute
s['Molly']

'English'

In [23]:
#when the index labels are themselves integers, the safer option is to use iloc or loc, to avoid confusion
class_code = {
    99: 'Physics',
    100: 'Chemistry',
    101: 'English',
    102: 'History',
}
s = pd.Series(data=class_code)

In [24]:
#to get the first item, s[0] isn't wise (0 is not one of the index labels), we'll use this instead
s.iloc[0]

'Physics'

In [25]:
#working with data
#one approach is to iterate over all the items in the series
#and apply the operation one is interested in to each of them.
#for instance, we could create a series of integers representing student grades
#and try to get the average grade
grades = pd.Series(data=[90, 80, 70, 60])

total = 0
for grade in grades:
    total = total + grade
average_grade = total/len(grades)
print(average_grade)

75.0


In [26]:
#the explicit loop above is a slow method.
#we can use the numpy sum function instead
import numpy as np

grades = pd.Series([90, 80, 70, 60])

total = np.sum(grades)
print(total/len(grades))

75.0


In [27]:
#when demonstrating techniques with pandas, we can measure the speed of some code with a magic function
#first, a series of 10000 random integers to work on
numbers = pd.Series(np.random.randint(low=0, high=1000, size=10000))

#to see the first five items
numbers.head()

0    892
1    400
2    681
3    845
4    672
dtype: int64

In [28]:
#to verify the length of the series (the number of items it holds)
len(numbers)

10000

In [None]:
#we'll use timeit to run the code a few times to determine on average how long it takes
#-n 100 asks for 100 loops in each timing run
#NOTE: %%timeit must be the first line in a cell, if not, it will print out an error

In [31]:
%%timeit -n 100
#plain python loop: add up the values one at a time, then divide by the number of values
total = 0
for number in numbers:
    total+=number
total/len(numbers)

1.25 ms ± 36.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
#let's try with vectorization

In [32]:
%%timeit -n 100
#vectorized version: the sum is computed with a single numpy call instead of a python loop
total = np.sum(numbers)
total/len(numbers)

67.2 µs ± 3.24 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [33]:
#this difference in speed demonstrates why one should be aware of vectorized operations.

#Broadcasting is a numpy and pandas feature with which you can apply an operation to every value in the series, changing the series
#for instance, if we wanted to increase every random value by 2, we could do so quickly using the += operator
#directly on the series object
numbers.head()

0    892
1    400
2    681
3    845
4    672
dtype: int64

In [34]:
#the += 2 is applied to every value in the series, changing the series
numbers+=2
numbers.head()

0    894
1    402
2    683
3    847
4    674
dtype: int64

In [35]:
#iteration
#items() returns a (label, value) pair for every entry in the series
#(it replaces iteritems(), which was removed in pandas 2.0)
for label, value in numbers.items():
    #the at accessor sets a single value by its label
    #(it replaces set_value(), which was removed in pandas 1.0)
    numbers.at[label] = value+2
numbers.head()

0    896
1    404
2    685
3    849
4    676
dtype: int64

In [None]:
#speed comparisons
#first, 10 loops per timing run using the iterative approach

In [37]:
%%timeit -n 10

#a fresh series of 1000 random integers, updated one entry at a time
s = pd.Series(np.random.randint(0,1000,1000))
#items() yields (label, value) pairs; it replaces iteritems(), which was removed in pandas 2.0
for label, value in s.items():
    s.loc[label] = value+2

139 ms ± 1.43 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [38]:
#with the broadcasting method

In [39]:
%%timeit -n 10
#a fresh series of 1000 random integers, updated in one broadcast operation
s = pd.Series(np.random.randint(0,1000,1000))
s+=2 #broadcasting with +=

257 µs ± 34.9 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [40]:
#the loc attribute lets you not only modify data in place, but also add new data.
#if the label you pass in doesn't exist in the index, a new entry is added.
#keep in mind that indices can have mixed types.
#while it's important to be aware of the typing going on underneath, pandas will automatically
#change the underlying numpy types as appropriate
#for instance
s = pd.Series(data=[1, 2, 3])
s.loc['History'] = 102 #a new entry, added under a label that was not in the index
s

0            1
1            2
2            3
History    102
dtype: int64

In [41]:
#mixed types for data values or index labels are no issue

In [43]:
#An example where index values are not unique.
#we start from a series with one class per student, so every label is still unique here
students_classes = pd.Series(
    data=['Physics', 'Chemistry', 'English', 'History'],
    index=['Alice', 'Jack', 'Molly', 'Sam'],
)
students_classes

Alice      Physics
Jack     Chemistry
Molly      English
Sam        History
dtype: object

In [44]:
#a series listing all of the courses that one student, Kelly, has taken
#every entry carries the same index label
kelly_classes = pd.Series(
    data=['Philosophy', 'Arts', 'Math'],
    index=['Kelly'] * 3,
)
kelly_classes

Kelly    Philosophy
Kelly          Arts
Kelly          Math
dtype: object

In [45]:
#appending the new series to the first one
#Series.append() was removed in pandas 2.0; pd.concat() joins the series end to end instead
all_students_classes = pd.concat([students_classes, kelly_classes])

all_students_classes

Alice       Physics
Jack      Chemistry
Molly       English
Sam         History
Kelly    Philosophy
Kelly          Arts
Kelly          Math
dtype: object

In [46]:
#pandas will take the series and try to infer the best data types to use
#combining the two series doesn't change the underlying series objects, it returns a new series
students_classes

Alice      Physics
Jack     Chemistry
Molly      English
Sam        History
dtype: object

In [47]:
#querying a label that is not unique returns every matching entry, as a series
all_students_classes.loc['Kelly']

Kelly    Philosophy
Kelly          Arts
Kelly          Math
dtype: object