In [1]:
# Data Manipulation with Pandas
# DataFrames are essentially multidimensional arrays with attached row and column labels,
# often with heterogeneous types and/or missing data, as well as a convenient storage interface

# NumPy's ndarray data structure provides essential features for the type of clean,
# well-organized data seen in numerical computing tasks. Its limitations show when we need more
# flexibility and when operations do not map well to element-wise broadcasting, each
# of which is an important piece of analyzing the less structured data available in many forms
# in the world around us. Pandas, and in particular its Series and DataFrame objects,
# builds on the NumPy array structure to provide that flexibility.

# This chapter will focus on the mechanics of using Series, DataFrame, and related structures
# effectively

In [2]:
# Import pandas under its full name and display the installed version
import pandas
pandas.__version__

'1.3.4'

In [3]:
import pandas as pd

In [4]:
import numpy as np

In [5]:
# A Pandas Series is a one-dimensional array of indexed data which can be created from a list or
# array as follows:
data = pd.Series([0.25, 0.5, 0.75, 1.0])
data

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

In [6]:
# The Series wraps both a sequence of values and a sequence of indices,
# which we can access with the values and index attributes.
# The values are returned as a NumPy array:
data.values

array([0.25, 0.5 , 0.75, 1.  ])

In [7]:
# The index is an array-like object of type pd.Index
data.index

RangeIndex(start=0, stop=4, step=1)

In [8]:
# As with a NumPy array, data can be accessed by the associated index via the familiar
# Python square-bracket notation:
data[1]

0.5

In [9]:
# Slicing works as well and returns a Series that keeps the matching index labels
data[1:3]

1    0.50
2    0.75
dtype: float64

In [10]:
# The Pandas Series is much more general and flexible than the one-dimensional NumPy array

In [11]:
# Series as a generalized NumPy array
# It may look like the Series object is interchangeable with a one-dimensional NumPy array.
# The essential difference is the presence of the index: while the NumPy array has an
# implicitly defined integer index used to access the values, the Pandas Series has an
# explicitly defined index associated with the values.

# This gives the Series additional capabilities:
# the index does not need to be an integer, and can consist of values of any desired type:
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=['a', 'b', 'c', 'd'])
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [12]:
# And item access by label works as expected
data['b']

0.5

In [13]:
# We can even use noncontiguous or nonsequential indices:
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=[2, 5, 3, 7])

In [14]:
# The lookup is by index label: 5 is the label attached to the second value
data[5]

0.5

In [15]:
# Series as a specialized dictionary:
# You can think of a Pandas Series a bit like a specialization of a Python dictionary.
# Just as the type-specific compiled code behind a NumPy array makes it more
# efficient than a Python list for certain operations, the type information of a Pandas Series
# makes it much more efficient than Python dictionaries for certain operations.

# The Series-as-dictionary analogy becomes even clearer when a Series is constructed
# directly from a Python dictionary (state name -> population):
population_dict = {
    'California': 38332521,
    'Texas': 26448193,
    'New York': 19651127,
    'Florida': 19552860,
    'Illinois': 12882135,
}
population = pd.Series(data=population_dict)
population

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

In [16]:
# By default a Series will be created where the index is drawn from the dictionary keys
# (kept in the dictionary's own order; they are not sorted).
# From here, typical dictionary-style item access can be performed:
population['California']

38332521

In [17]:
# Unlike a dictionary, the Series also supports array-style operations such as slicing.
# Note that this label-based slice includes its end point ('Illinois'):
population['California':'Illinois']

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

In [18]:
# Constructing Series Objects:
# We've already seen a few ways of constructing a Pandas Series from scratch;
# all of them are some version of the following:
# >>> pd.Series(data, index=index)
# where index is an optional argument, and data can be one of many entities.
# For example, data can be a list or NumPy array, in which case index defaults to an integer sequence:

pd.Series([2, 4, 6])

0    2
1    4
2    6
dtype: int64

In [19]:
# data can be a scalar, which is repeated to fill the specified index:
pd.Series(5, index=[100, 200, 300])

100    5
200    5
300    5
dtype: int64

In [20]:
# data can be a dictionary, in which case the index defaults to the dictionary keys
# (kept in the order given, not sorted):
pd.Series({2:'a', 1:'b', 3:'c'})

2    a
1    b
3    c
dtype: object

In [21]:
# In each case, the index can be explicitly set if a different result is preferred;
# the result follows the order of the labels passed as `index`:
pd.Series({2:'a', 1:'b', 3:'c'}, index=[3, 2])

3    c
2    a
dtype: object

In [22]:
# In this case, the Series is populated only with the explicitly identified keys

In [23]:
# The Pandas DataFrame Object:
# Like the Series object discussed in the previous section, the DataFrame can be thought of
# either as a generalization of a NumPy array, or as a specialization of a Python dictionary.

In [24]:
# DataFrame as a generalized NumPy array

# If a Series is an analog of a one-dimensional array with flexible indices,
# a DataFrame is an analog of a two-dimensional array with both flexible row indices
# and flexible column names.
# You can think of a DataFrame as a sequence of aligned Series objects
# (aligned meaning they share the same index).

# State name -> area, to be paired with a second Series later on
area_dict = {
    'California': 423967,
    'Texas': 695662,
    'New York': 141297,
    'Florida': 170312,
    'Illinois': 149995,
}
area = pd.Series(data=area_dict)
area

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
dtype: int64

In [25]:
# We can use a dictionary of Series to construct a single two-dimensional object;
# each dictionary key becomes a column name:
states = pd.DataFrame({'population': population, 'area':area})

In [26]:
# Display the combined DataFrame
states

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [27]:
# Like the Series, the DataFrame has an index attribute that gives access to the index labels:
states.index

Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')

In [28]:
# The DataFrame also has a columns attribute, which is an Index object holding the column labels:
states.columns

Index(['population', 'area'], dtype='object')

In [29]:
# The DataFrame can be thought of as a generalization of a two-dimensional NumPy array, where both
# the rows and columns have a generalized index for accessing the data.

In [30]:
# DataFrame as specialized dictionary

# We can think of a DataFrame as a specialization of a dictionary. Where a dictionary maps a key
# to a value, a DataFrame maps a column name to a Series of column data.
# For example, asking for the 'area' key returns the Series object containing the areas
# we saw earlier:
states['area']

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [31]:
# A potential point of confusion: in a two-dimensional NumPy array,
# data[0] will return the first row.
# For a DataFrame, data['col0'] will return the first column.
# It is therefore better to think of DataFrames as generalized dictionaries rather than
# generalized arrays

In [32]:
# Constructing DataFrame objects
# A Pandas DataFrame can be constructed in a variety of ways. Here we'll give several examples.

# From a single Series object.
# A DataFrame is a collection of Series objects, and a single-column DataFrame can be constructed
# from a single Series:
pd.DataFrame(population, columns=['population'])

Unnamed: 0,population
California,38332521
Texas,26448193
New York,19651127
Florida,19552860
Illinois,12882135


In [33]:
# From a list of dicts: build one dictionary per row, then pass the list to DataFrame
data = [
    {'a': n, 'b': n * 2}
    for n in range(3)
]
pd.DataFrame(data)

Unnamed: 0,a,b
0,0,0
1,1,2
2,2,4


In [34]:
# NOTE: this call does NOT pass a list of dictionaries. The second dictionary is a separate
# positional argument (taken as the `index`), so its keys ('b', 'c') become the row labels,
# while the scalars of the first dictionary fill every row. No NaN values are produced here.
# Wrapping both dictionaries in a list, pd.DataFrame([{...}, {...}]), gives the
# NaN-filling behavior instead.
pd.DataFrame({'a':1, 'b':2}, {'b':3, 'c':4})

Unnamed: 0,a,b
b,1,2
c,1,2


In [35]:
# Even if some keys are missing from some of the dictionaries in the list, Pandas will fill
# the gaps with NaN (i.e., "not a number") values:
pd.DataFrame([{'a':1, 'b':2}, {'b':3, 'c':4}])

Unnamed: 0,a,b,c
0,1.0,2,
1,,3,4.0


In [36]:
# From a dictionary of Series objects.
# As we saw before, a DataFrame can be constructed from a dictionary of Series objects as well:
pd.DataFrame({'population': population, 'area': area})

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [37]:
# From a two-dimensional NumPy array
# Given a two-dimensional array of data, we can create a DataFrame with any specified column
# and index names. If omitted, an integer index will be used for each:
pd.DataFrame(np.random.rand(3, 2),
             columns=['foo', 'bar'],
             index=['a', 'b', 'c'])

In [38]:
# From a two-dimensional NumPy array
# Given a two-dimensional array of data, we can create a DataFrame with any specified column
# and index names. If omitted, an integer index will be used for each:
pd.DataFrame(np.random.rand(3, 2),
            columns=['foo', 'bar'],
            index=['a', 'b', 'c'])

Unnamed: 0,foo,bar
a,0.274916,0.397227
b,0.627093,0.486894
c,0.618092,0.436013


In [40]:
# From a NumPy structured array
# A Pandas DataFrame operates much like a structured array, and can be created directly from one.
# Start with three zero-filled records, each with an 8-byte integer field 'A'
# and an 8-byte float field 'B':

A = np.zeros(
    shape=3,
    dtype=[('A', 'i8'), ('B', 'f8')],
)
A

array([(0, 0.), (0, 0.), (0, 0.)], dtype=[('A', '<i8'), ('B', '<f8')])

In [41]:
# The field names of the structured array become the column names
pd.DataFrame(A)

Unnamed: 0,A,B
0,0,0.0
1,0,0.0
2,0,0.0


In [42]:
# The Pandas Index Object

# Both Series and DataFrame objects contain an explicit index that lets you reference
# and modify data. The Index object is an interesting structure in itself, and can be thought
# of either as an immutable array or as an ordered set.
# As a simple example, construct an Index from a list of integers:

ind = pd.Index([2,3,5,7,11])
ind

Int64Index([2, 3, 5, 7, 11], dtype='int64')

In [43]:
# Index as immutable array

# The Index object in many ways operates like an array. For example,
# we can use standard Python indexing notation to retrieve values or slices:

ind[1]


3

In [44]:
# A strided slice (every second element) returns another Index
ind[::2]

Int64Index([2, 5, 11], dtype='int64')

In [45]:
# Index objects also have many of the attributes familiar from NumPy arrays
# (size, shape, number of dimensions, and dtype):
print(ind.size, ind.shape, ind.ndim, ind.dtype)

5 (5,) 1 int64


In [47]:
# One difference between Index objects and NumPy arrays is that indices are immutable:
# they cannot be modified through item assignment. The assignment below raises a TypeError,
# which is caught and printed so that the notebook still runs from top to bottom.
try:
    ind[1] = 0
except TypeError as err:
    print(f"{type(err).__name__}: {err}")

TypeError: Index does not support mutable operations

In [48]:
# The immutability makes it safer to share indices between multiple DataFrames and arrays,
# without the potential for side effects from inadvertent index modification.

# Index as ordered set
# Pandas objects are designed to facilitate operations such as joins across datasets,
# which depend on many aspects of set arithmetic.
# The Index object follows many of the conventions used by Python's built-in data structures,
# so that unions, intersections, differences, and other combinations can be computed in a familiar way:
indA = pd.Index([1, 3, 5, 7, 9])
indB = pd.Index([2, 3, 5, 7, 11])

In [49]:
# Intersection: labels present in both indices.
# The named method is used because `&` as a set operation on Index objects is deprecated.
indA.intersection(indB)


Int64Index([3, 5, 7], dtype='int64')

In [51]:
# Union: labels present in either index.
# The named method is used because `|` as a set operation on Index objects is deprecated.
indA.union(indB)


Int64Index([1, 2, 3, 5, 7, 9, 11], dtype='int64')

In [52]:
# Symmetric difference: labels present in exactly one of the two indices.
# The named method is used because `^` as a set operation on Index objects is deprecated.
indA.symmetric_difference(indB)


Int64Index([1, 2, 9, 11], dtype='int64')

In [53]:
# These operations may also be accessed via object methods - for example, indA.intersection(indB)