# Series

Series are a cross between a list and a dictionary.

The items are all stored in an order and there are labels with which you can retrieve them.

In [1]:
# Import pandas
import pandas as pd

In [3]:
# We can create a series by passing in a list of values
# When we do this, Pandas automatically assigns an index starting with
# zero and sets the name of the series to None

# One of the easiest ways to create a series is to use an array-like
# object, like a list

students = ['Alice', 'Jack', 'Molly']
pd.Series(students)

0    Alice
1     Jack
2    Molly
dtype: object

In [4]:
# The result is a series object
# Pandas has automatically identified the type of data in this Series as
# "object" and set the dtype parameter as appropriate.
# The values are indexed with integers starting at zero

In [5]:
# We don't have to use strings. If we passed in a list of whole numbers,
# we could see that pandas sets the type to int64. Underneath, pandas stores
# series values in a typed array using the NumPy library. This offers a
# significant speedup when processing data vs. traditional Python lists

# List of integers
numbers = [1, 2, 3]
pd.Series(numbers)

0    1
1    2
2    3
dtype: int64

In [6]:
# On my architecture the resulting dtype is int64

In [8]:
# Another important thing to know: how NumPy and pandas handle missing data

# In Python, we have the None type to indicate a lack of data

# Underneath, pandas does some type conversion. If we create a list of
# strings and one of the elements is None, pandas inserts it as
# a None and uses the type object for the underlying array

# Let's do an example
students2 = ['Alice', 'Jack', None]
pd.Series(students2)

0    Alice
1     Jack
2     None
dtype: object

In [9]:
# However, if we create a list of numbers, integers or floats, and put
# in the None type, pandas automatically converts this to a special
# floating point value designated as NaN, which stands for "Not a Number"

numbers2 = [1, 2, None]
pd.Series(numbers2)

0    1.0
1    2.0
2    NaN
dtype: float64

In [10]:
# A couple of things:
# 1) NaN is a different value from None
# 2) Pandas set the dtype of this series to floating point numbers instead
# of object or ints. Underneath, pandas represents NaN as a floating point
# number, and because integers can be typecast to floats, pandas went and
# converted our integers to floats. 

# So, when you're wondering why the list of integers you put into a Series
# is now floats, it's probably because there is some missing data

In [11]:
# None and NaN might be used by the data scientist in the same way, to
# denote missing data, but underneath these are not represented by pandas
# in the same way.

# NaN is NOT equivalent to None and when we try an equality test, the
# result is false

import numpy as np
np.nan == None

False

In [13]:
# It turns out that we actually can't do an equality test of NaN to
# itself. When we do, the answer is always false

np.nan == np.nan

False

In [14]:
# Instead, we need to use special functions to test for the presence of
# not a number, such as NumPy's isnan() function

np.isnan(np.nan)

True

In [15]:
# So keep in mind when we see NaN, its meaning is similar to None, but
# it's a numeric value and treated differently for efficiency reasons

In [16]:
# While creating pandas Series from lists is a common way to do it, we
# often have labeled data that we want to manipulate.
# A series can be created directly from dictionary data. If we do this,
# the index is automatically assigned to the keys of the dictionary that
# we provided and not just incrementing integers

students_scores = {'Alice' : 'Physics',
                  'Jack' : 'Chemistry',
                   'Molly' : 'English'}

s = pd.Series(students_scores)
s

Alice      Physics
Jack     Chemistry
Molly      English
dtype: object

In [17]:
# Since it was string data, pandas set the data type to object
# The index, the first column, is also a list of strings

In [18]:
# Once the series has been created, we can get the index object using
# the index attribute

s.index

Index(['Alice', 'Jack', 'Molly'], dtype='object')

In [19]:
# A lot of things are implemented in pandas as numpy arrays and have the
# dtype value set. This is true of indices, and here pandas inferred that
# we were using objects for the index

In [21]:
# The dtype object is not just for strings, but for arbitrary objects
students3 = [('Alice', 'Brown'), ('Jack', 'White'), ('Molly', 'Green')]
pd.Series(students3)

0    (Alice, Brown)
1     (Jack, White)
2    (Molly, Green)
dtype: object

In [22]:
# Tuples are also stored as object

In [23]:
# We can also separate our index from the data by passing in the index as
# a list explicitly to the series

s2 = pd.Series(['Physics', 'Chemistry', 'English'], index = ['Alice', 'Jack', 'Molly'])
s2

Alice      Physics
Jack     Chemistry
Molly      English
dtype: object

In [25]:
# If the list of index values is not aligned with the keys in the dictionary,
# pandas overrides the automatic creation to favor only and all of the
# index values that we provided. So, it will ignore from our dict all
# keys which are not in our index, and pandas will add None or NaN type
# values for any index value we provide which is not in our dict key list

# Example
students_scores2 = {'Alice' : ' Physics',
                   'Jack' : 'Chemistry',
                   'Molly' : 'English'}

# We will ask for an index with three students, and exclude Jack
s3 = pd.Series(students_scores2, index = ['Alice', 'Molly', 'Sam'])
s3

Alice     Physics
Molly     English
Sam           NaN
dtype: object

In [None]:
# The result is that the Series object doesn't have Jack in it, even
# though he was in our original dataset. However, it explicitly does have
# Sam as a missing value