In [8]:
import pandas as pd
import numpy as np
from pandas import Series, DataFrame


# 5.1 Introduction to pandas Data Structures

### Series

In [9]:
# Build a Series from a plain list; no index is passed, so pandas assigns the default integer labels.
obj = pd.Series(data=[4, 7, -5, 3])

In [10]:
obj

0    4
1    7
2   -5
3    3
dtype: int64

And so on...

In [11]:
obj.array

<NumpyExtensionArray>
[np.int64(4), np.int64(7), np.int64(-5), np.int64(3)]
Length: 4, dtype: int64

In [12]:
obj.index

RangeIndex(start=0, stop=4, step=1)

In [13]:
# Series whose entries are addressed by explicit string labels rather than default integers.
obj2 = pd.Series(data=[4, 7, -5, 3], index=list("dbac"))

In [14]:
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [15]:
obj2.index

Index(['d', 'b', 'a', 'c'], dtype='object')

In [16]:
obj2["a"]

np.int64(-5)

In [17]:
# Assigning to a label that is not yet in the index adds a new entry:
# obj (not obj2) gains a fifth element labelled "d".
obj["d"] = 6

In [18]:
obj

0    4
1    7
2   -5
3    3
d    6
dtype: int64

In [19]:
obj2[["c", "a", "d"]]

c    3
a   -5
d    4
dtype: int64

In [20]:
obj2[obj2 > 0]

d    4
b    7
c    3
dtype: int64

In [21]:
obj2 * 2

d     8
b    14
a   -10
c     6
dtype: int64

In [22]:
import numpy as np

In [23]:
np.exp(obj2)

d      54.598150
b    1096.633158
a       0.006738
c      20.085537
dtype: float64

Another way to think about a Series is as a fixed-length, ordered dictionary, as it is a mapping of index values to data values. It can be used in many contexts where you might use a dictionary:

In [24]:
"b" in obj2

True

In [25]:
"e" in obj2

False

In [26]:
# Mapping of state name -> value, used below to build a Series from a dictionary.
sdata = dict(Ohio=35000, Texas=71000, Oregon=16000, Utah=5000)

In [27]:
obj3 = pd.Series(sdata)

In [28]:
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [29]:
sdata

{'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}

You can also convert a Series back to a dictionary with its to_dict method:

In [30]:
obj3.to_dict()

{'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}

In [31]:
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [32]:
states = ["California", "Ohio", "Oregon", "Texas"]

In [33]:
states


['California', 'Ohio', 'Oregon', 'Texas']

In [34]:
obj4 = pd.Series(sdata, index=states)

In [35]:
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

The isna and notna functions in pandas should be used to detect missing data:

In [36]:
pd.isna(obj4)

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [37]:
pd.notna(obj4)

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

Series also has these as instance methods:

In [38]:
obj4.isna()

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [39]:
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [40]:
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [41]:
obj3 + obj4

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

Both the Series object itself and its index have a name attribute, which integrates with other areas of pandas functionality:

In [42]:
obj4.name = "population"

In [43]:
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64

In [44]:
obj4.index.name = "state"

In [45]:
obj4

state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64

A Series’s index can be altered in place by assignment:

In [46]:
obj

0    4
1    7
2   -5
3    3
d    6
dtype: int64

In [47]:
# obj holds five entries at this point (a "d" label was appended earlier), so
# the replacement index must supply exactly five labels; a shorter list raises
# "ValueError: Length mismatch".
obj.index = ["Bob", "Steve", "Jeff", "Ryan", "Dana"]

There are many ways to construct a DataFrame, though one of the most common is from a dictionary of equal-length lists or NumPy arrays:

In [48]:
# Column-oriented input: each key becomes a DataFrame column and each list holds that column's values.
data = {
    "state": ["Ohio"] * 3 + ["Nevada"] * 3,
    "year": [2000, 2001, 2002, 2001, 2002, 2003],
    "pop": [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}
frame = pd.DataFrame(data=data)

The resulting DataFrame will have its index assigned automatically, as with Series, and the columns are placed according to the order of the keys in data (which depends on their insertion order in the dictionary):

In [49]:
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [50]:
data

{'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
 'year': [2000, 2001, 2002, 2001, 2002, 2003],
 'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}

For large DataFrames, the head method selects only the first five rows:

In [51]:
frame.head()

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


tail returns the last five rows:

In [52]:
frame.tail()

Unnamed: 0,state,year,pop
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


If you specify a sequence of columns, the DataFrame’s columns will be arranged in that order:



In [54]:
pd.DataFrame(data, columns=["year", "state", "pop"])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


If you pass a column that isn’t contained in the dictionary, it will appear with missing values in the result:

In [55]:
frame2 = pd.DataFrame(data, columns=["year", "state", "pop", "debt"])

In [56]:
frame2

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,
1,2001,Ohio,1.7,
2,2002,Ohio,3.6,
3,2001,Nevada,2.4,
4,2002,Nevada,2.9,
5,2003,Nevada,3.2,


In [57]:
frame.columns

Index(['state', 'year', 'pop'], dtype='object')

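A column in a DataFrame can be retrieved as a Series either by dictionary-like notation or by using the dot attribute notation. Attribute notation only works when the column name is a valid Python identifier and does not conflict with an existing DataFrame attribute or method name (such as pop):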

In [58]:
frame["state"]

0      Ohio
1      Ohio
2      Ohio
3    Nevada
4    Nevada
5    Nevada
Name: state, dtype: object

In [59]:
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [60]:
frame.year

0    2000
1    2001
2    2002
3    2001
4    2002
5    2003
Name: year, dtype: int64

In [61]:
# Attribute access only reaches a column when its name does not collide with a
# DataFrame attribute: `frame.pop` resolves to the DataFrame.pop method, so the
# "pop" column has to be selected with bracket notation.
frame["pop"]

The returned Series have the same index as the DataFrame, and their name attribute has been appropriately set.

Rows can also be retrieved by position or name with the special iloc and loc attributes:

In [62]:
frame2.loc[1]

year     2001
state    Ohio
pop       1.7
debt      NaN
Name: 1, dtype: object

In [63]:
frame2.iloc[2]

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: 2, dtype: object

Columns can be modified by assignment. For example, the empty debt column could be assigned a scalar value or an array of values:

In [64]:
frame2["debt"] = 16.5

In [65]:
frame2

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,16.5
1,2001,Ohio,1.7,16.5
2,2002,Ohio,3.6,16.5
3,2001,Nevada,2.4,16.5
4,2002,Nevada,2.9,16.5
5,2003,Nevada,3.2,16.5


In [67]:
frame2["debt"] = np.arange(6.)

In [68]:
frame2

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,0.0
1,2001,Ohio,1.7,1.0
2,2002,Ohio,3.6,2.0
3,2001,Nevada,2.4,3.0
4,2002,Nevada,2.9,4.0
5,2003,Nevada,3.2,5.0


When you are assigning lists or arrays to a column, the value’s length must match the length of the DataFrame. If you assign a Series, its labels will be realigned exactly to the DataFrame’s index, inserting missing values in any index values not present:



In [69]:
# Sparse set of values keyed by row label; only labels 2, 4 and 5 are supplied.
val = pd.Series({2: -1.2, 4: -1.5, 5: 1.7})

In [70]:
val

2   -1.2
4   -1.5
5    1.7
dtype: float64

In [71]:
frame2["debt"] = val

In [73]:
frame2

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,
1,2001,Ohio,1.7,
2,2002,Ohio,3.6,-1.2
3,2001,Nevada,2.4,
4,2002,Nevada,2.9,-1.5
5,2003,Nevada,3.2,1.7


Assigning a column that doesn’t exist will create a new column.

The del keyword will delete columns like with a dictionary. As an example, I first add a new column of Boolean values where the state column equals "Ohio":

In [74]:
# Boolean flag column: True on rows whose state is "Ohio", False elsewhere.
frame2["eastern"] = frame2["state"].eq("Ohio")

In [75]:
frame2

Unnamed: 0,year,state,pop,debt,eastern
0,2000,Ohio,1.5,,True
1,2001,Ohio,1.7,,True
2,2002,Ohio,3.6,-1.2,True
3,2001,Nevada,2.4,,False
4,2002,Nevada,2.9,-1.5,False
5,2003,Nevada,3.2,1.7,False


In [77]:
frame2.eastern

0     True
1     True
2     True
3    False
4    False
5    False
Name: eastern, dtype: bool

In [78]:
del frame2["eastern"]

In [79]:
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [80]:
# Nested mapping: outer keys are state names, inner keys are years mapped to a value for that year.
populations = {
    "Ohio": {2000: 1.5, 2001: 1.7, 2002: 3.6},
    "Nevada": {2001: 2.4, 2002: 2.9},
}

In [81]:
populations

{'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}, 'Nevada': {2001: 2.4, 2002: 2.9}}

If the nested dictionary is passed to the DataFrame, pandas will interpret the outer dictionary keys as the columns, and the inner keys as the row indices:

In [82]:
frame3 = pd.DataFrame(populations)

In [83]:
frame3

Unnamed: 0,Ohio,Nevada
2000,1.5,
2001,1.7,2.4
2002,3.6,2.9


You can transpose the DataFrame (swap rows and columns) with similar syntax to a NumPy array:

In [84]:
frame3.T

Unnamed: 0,2000,2001,2002
Ohio,1.5,1.7,3.6
Nevada,,2.4,2.9


The keys in the inner dictionaries are combined to form the index in the result. This isn’t true if an explicit index is specified:

In [85]:
pd.DataFrame(populations, index=[2001, 2002,2003])

Unnamed: 0,Ohio,Nevada
2001,1.7,2.4
2002,3.6,2.9
2003,,


Dictionaries of Series are treated in much the same way:



In [86]:
# Dictionary of Series: take positional slices of two frame3 columns
# (all but the last Ohio row, and the first two Nevada rows).
ohio_col = frame3["Ohio"]
nevada_col = frame3["Nevada"]
pdata = {"Ohio": ohio_col.iloc[:-1], "Nevada": nevada_col.iloc[:2]}

In [87]:
pd.DataFrame(pdata)

Unnamed: 0,Ohio,Nevada
2000,1.5,
2001,1.7,2.4


In [88]:
frame3.index.name = "year"

In [89]:
frame3.columns.name = "state"

In [90]:
frame3

state,Ohio,Nevada
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,1.5,
2001,1.7,2.4
2002,3.6,2.9


Unlike Series, DataFrame does not have a name attribute. DataFrame's to_numpy method returns the data contained in the DataFrame as a two-dimensional ndarray:

In [91]:
frame3.to_numpy()

array([[1.5, nan],
       [1.7, 2.4],
       [3.6, 2.9]])

If the DataFrame’s columns are different data types, the data type of the returned array will be chosen to accommodate all of the columns:

In [92]:
frame2.to_numpy()

array([[2000, 'Ohio', 1.5, nan],
       [2001, 'Ohio', 1.7, nan],
       [2002, 'Ohio', 3.6, -1.2],
       [2001, 'Nevada', 2.4, nan],
       [2002, 'Nevada', 2.9, -1.5],
       [2003, 'Nevada', 3.2, 1.7]], dtype=object)

pandas’s Index objects are responsible for holding the axis labels (including a DataFrame's column names) and other metadata (like the axis name or names). Any array or other sequence of labels you use when constructing a Series or DataFrame is internally converted to an Index:

In [93]:
# Three-element Series labelled "a", "b", "c"; its index is inspected in the following cells.
obj = pd.Series(data=np.arange(3), index=list("abc"))

In [94]:
index = obj.index

In [95]:
index

Index(['a', 'b', 'c'], dtype='object')

In [99]:
index[1:]

Index(['b', 'c'], dtype='object')

Index objects are immutable and thus can’t be modified by the user:



In [100]:
# Index objects are immutable, so item assignment raises TypeError. Catch and
# print it so the demonstration does not abort a Restart & Run All.
try:
    index[1] = "d"
except TypeError as exc:
    print(f"{type(exc).__name__}: {exc}")

TypeError: Index does not support mutable operations

Immutability makes it safer to share Index objects among data structures:



In [101]:
labels = pd.Index(np.arange(3))

In [102]:
labels

Index([0, 1, 2], dtype='int64')

In [103]:
obj2 = pd.Series([1.5, -2.5, 0], index=labels)

In [104]:
obj2

0    1.5
1   -2.5
2    0.0
dtype: float64

In [105]:
obj2.index is not labels

False

In [106]:
obj2.index is labels

True