# Getting started with Pandas

In [2]:
import numpy as np 
import pandas as pd 

In [8]:
# Series is one-dimentional array-like object containing a sequence of values of the same type and associated array of data labels (index)
# the simplest series is formed from only an array of data
obj = pd.Series([4,7,-5,3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [12]:
# we can get array representation and index object of the Series via "Index" and "Array" methods
print(obj.array)
print(obj.index)

<NumpyExtensionArray>
[4, 7, -5, 3]
Length: 4, dtype: int64
RangeIndex(start=0, stop=4, step=1)


In [14]:
# often we will want to create a series with an index identifying each data point with a label
obj2 = pd.Series([4,7,-5,3], index=["d","b","a","c"])
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [18]:
obj2.index # as we can see, it's no longer RangeIndex -> now its Index

Index(['d', 'b', 'a', 'c'], dtype='object')

In [26]:
# now we can use labels in the index when we want to access elements
print(obj2["a"])
print(obj2[["c", "d", "b"]])

-5
c    3
d    4
b    7
dtype: int64


In [28]:
# using NumPy/NumPy-like operations, such as filtering with a Boolean array or scalar multiplication will preserve the index-value link
obj2[obj2>0]

d    4
b    7
c    3
dtype: int64

In [32]:
# scalar multiplication example
obj2 * 2

d     8
b    14
a   -10
c     6
dtype: int64

In [34]:
np.exp(obj2)

d      54.598150
b    1096.633158
a       0.006738
c      20.085537
dtype: float64

In [40]:
# another way to think about a Series is a fixed length, ordered dictionary, as it is a mapping of index values to data values
print("b" in obj2)
print("e" in obj2)

True
False


In [44]:
# should you have a python dict, we can create a pandas series by passing it as an argument 
sdata = {"Ohio": 35000, "Texas": 71000, "Oregon": 16000, "Utah": 5000}
obj3 = pd.Series(sdata)
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [48]:
# a series can be converted back to python dict with the use of "to_dict" method
converted_series = obj3.to_dict()
converted_series

{'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}

In [50]:
states = ["California", "Ohio", "Oregon", "Texas"]

obj4 = pd.Series(sdata, index=states)
obj4 # three values found in sdata were placed in appropriate locations

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [54]:
# "isna" and "notna" functions should be used to detect missing data
print(pd.isna(obj4))
print()
print(pd.notna(obj4))

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool


In [56]:
# Series also has these as instance methods 
print(obj4.isna())
print()
print(obj4.notna())

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool


In [62]:
# a useful Series future is that it automatically alignes by index label in arthmetic operations
print(obj3)
print(obj4)
print()
obj3 + obj4 # as we see this is somewhat similar to database join operations

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64



California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

In [66]:
# both the Series object and its Index have attribute "name", which itegrates with other pandas functionalities
obj4.name = "Population"
obj4.index.name = "State"

In [70]:
print(obj4.name)
print(obj4.index.name)

Population
State


In [74]:
obj4 # NOTE name AT THE BOTTOM

State
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: Population, dtype: float64

In [78]:
# A Series index can be altered in place by assignment
print(obj)
print()
obj.index = ["Piotr", "Michał", "Mikołaj", "Karol"]
obj

0    4
1    7
2   -5
3    3
dtype: int64



Piotr      4
Michał     7
Mikołaj   -5
Karol      3
dtype: int64

# DataFrame

In [83]:
# Df represents a rectangular table of data and contains an ordered, named collection of columns, each of which can be of different value type
# (numeric, string, boolean, etc.)
# DataFrame have both row and column index; it can be thought of as dictionary of Series all sharing the same index

In [87]:
# there are many ways to construct a df but the most common is probably constructing from a dict of equal length lists / numpy arrays
data = {"state": ["Ohio", "Ohio", "Ohio", "Nevada", "Nevada", "Nevada"],
        "year": [2000, 2001, 2002, 2001, 2002, 2003],
        "pop": [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}

frame = pd.DataFrame(data)
frame.head()

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [89]:
# if we specify a sequence of columns, the dataframe columns will be arranged in that order
frame = pd.DataFrame(data, columns=["pop", "year", "state"])
frame.tail() # in order to get 5 last elements

Unnamed: 0,pop,year,state
1,1.7,2001,Ohio
2,3.6,2002,Ohio
3,2.4,2001,Nevada
4,2.9,2002,Nevada
5,3.2,2003,Nevada


In [91]:
# if we pass a column that isn't contained in the dictionary, it will be displayed as NaNs
frame = pd.DataFrame(data, columns=["pop", "year", "state", "debt"])
frame.head()

Unnamed: 0,pop,year,state,debt
0,1.5,2000,Ohio,
1,1.7,2001,Ohio,
2,3.6,2002,Ohio,
3,2.4,2001,Nevada,
4,2.9,2002,Nevada,


In [93]:
frame.columns

Index(['pop', 'year', 'state', 'debt'], dtype='object')

In [105]:
# a column in dataframe can be accessed EITHER BY dictionary-like notation df["sth"] or by "dot notation" df.sth
print(frame["year"])
print()
print(frame.year) # NOTE: index is the same as in our df AND Name: atribute was set with the name of the column

0    2000
1    2001
2    2002
3    2001
4    2002
5    2003
Name: year, dtype: int64

0    2000
1    2001
2    2002
3    2001
4    2002
5    2003
Name: year, dtype: int64


In [113]:
# rows can also be retreived by position or name with "iloc" and "loc" attributes
print(frame.loc[1])
print()
print(frame.iloc[2])

pop       1.7
year     2001
state    Ohio
debt      NaN
Name: 1, dtype: object

pop       3.6
year     2002
state    Ohio
debt      NaN
Name: 2, dtype: object


In [115]:
# Columns can be modified by assignment. We could for example assign "debt" a scalar value or an entire array of values
frame.debt = 16.5
frame

Unnamed: 0,pop,year,state,debt
0,1.5,2000,Ohio,16.5
1,1.7,2001,Ohio,16.5
2,3.6,2002,Ohio,16.5
3,2.4,2001,Nevada,16.5
4,2.9,2002,Nevada,16.5
5,3.2,2003,Nevada,16.5


In [123]:
frame.debt = np.arange(len(frame))
frame

Unnamed: 0,pop,year,state,debt
0,1.5,2000,Ohio,0
1,1.7,2001,Ohio,1
2,3.6,2002,Ohio,2
3,2.4,2001,Nevada,3
4,2.9,2002,Nevada,4
5,3.2,2003,Nevada,5


In [131]:
# If you assign a Series, its labels will be realigned exactly to the DataFrame’s index, inserting missing values in any index values not present
val = pd.Series([-1.2, -1.5, -1.7], index=[2,4,5])
frame['debt'] = val
frame

Unnamed: 0,pop,year,state,debt
0,1.5,2000,Ohio,
1,1.7,2001,Ohio,
2,3.6,2002,Ohio,-1.2
3,2.4,2001,Nevada,
4,2.9,2002,Nevada,-1.5
5,3.2,2003,Nevada,-1.7


In [161]:
# assigning a column that doesn't exist will create a new column
frame["eastern"] = frame["state"] == "Ohio"
frame #CAUTION: new columns cannot be created with dot attribute notation

Unnamed: 0,pop,year,state,debt,easter,eastern
0,1.5,2000,Ohio,,True,True
1,1.7,2001,Ohio,,True,True
2,3.6,2002,Ohio,-1.2,True,True
3,2.4,2001,Nevada,,False,False
4,2.9,2002,Nevada,-1.5,False,False
5,3.2,2003,Nevada,-1.7,False,False


In [165]:
# the del method can be used to remove a column
del frame["easter"]
frame

Unnamed: 0,pop,year,state,debt
0,1.5,2000,Ohio,
1,1.7,2001,Ohio,
2,3.6,2002,Ohio,-1.2
3,2.4,2001,Nevada,
4,2.9,2002,Nevada,-1.5
5,3.2,2003,Nevada,-1.7


In [167]:
frame.columns

Index(['pop', 'year', 'state', 'debt'], dtype='object')

In [169]:
# similarly to NumPy array, column accessed by dict-like notation is a VIEW and not a copy. Any inplace modification will be reflected
# the column can be explicitly copied with Seried copy method 

In [171]:
# another common form of data is nested dict. Dictionary of Dictionaries
In [72]: populations = {"Ohio": {2000: 1.5, 2001: 1.7, 2002: 3.6},
   ....:                "Nevada": {2001: 2.4, 2002: 2.9}}

In [173]:
# if nested dict is passed as an argument to pandas dataframe -> the outer keys will be treated as column labels and inner keys as row indices
frame3 = pd.DataFrame(populations)
frame3

Unnamed: 0,Ohio,Nevada
2000,1.5,
2001,1.7,2.4
2002,3.6,2.9


In [177]:
print(frame3.index)
print(frame3.columns)

Index([2000, 2001, 2002], dtype='int64')
Index(['Ohio', 'Nevada'], dtype='object')


In [179]:
# we can transpose dataframe (switch rows and columns) with the following syntax
frame3.T

Unnamed: 0,2000,2001,2002
Ohio,1.5,1.7,3.6
Nevada,,2.4,2.9


In [181]:
# the keys in the inner dictionaries are combined to form an Index -> this isn't the case when index is explicitly defined 
pd.DataFrame(populations, index=[2001, 2002, 2003])

Unnamed: 0,Ohio,Nevada
2001,1.7,2.4
2002,3.6,2.9
2003,,


In [193]:
# Dictionaries of Series are treated in much the same way
print(frame3["Ohio"][:-1].shape) # as we can see this is a pandas Series -> WE TAKE ENTIRE SERIES WITHOUT THE LAST ELEMENT
print()

pdata = {"Ohio": frame3["Ohio"][:-1],
        "Nevada": frame3["Nevada"][:2]}
pd.DataFrame(pdata)

(2,)



Unnamed: 0,Ohio,Nevada
2000,1.5,
2001,1.7,2.4


In [196]:
frame3.index.name = "Year"
frame3.columns.name = "State"
frame3 # NOTE: both names are displayed

State,Ohio,Nevada
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,1.5,
2001,1.7,2.4
2002,3.6,2.9


In [200]:
# unlike Series, Dataframe DOES NOT have "name" attribute!!!
# "to_numpy() method returns a 2D-array containing data contained in our dataframe
frame3.to_numpy()

array([[1.5, nan],
       [1.7, 2.4],
       [3.6, 2.9]])

In [206]:
# ATTENTION: If the dataframe columns are of different types, the data type of returned array will be chosen to accomodate all the columns. 
# dtype = object !!!!

# THIS IS AN EXCEPTION TO GENERAL NUMPY RULE THAT ALL OBJECTS STORED IN AN ARRAY SHOULD BE HOMOGENOUS
# we sacrifice performence in order to store all of these 
frame.to_numpy()

array([[1.5, 2000, 'Ohio', nan],
       [1.7, 2001, 'Ohio', nan],
       [3.6, 2002, 'Ohio', -1.2],
       [2.4, 2001, 'Nevada', nan],
       [2.9, 2002, 'Nevada', -1.5],
       [3.2, 2003, 'Nevada', -1.7]], dtype=object)

# Index Objects

In [217]:
# pandas Index objects are responsible for holding axis labels (including dataframe column names) and other metadata (like axis name or names)
# An array or other sequence of labels is internally converted into Index 
obj = pd.Series(np.arange(3), index=["a", "b", "c"])
print(obj.index)
print(obj.index[1:])

Index(['a', 'b', 'c'], dtype='object')
Index(['b', 'c'], dtype='object')


In [219]:
# Index objects are immutable, and therefore cannot be modified by the user
# obj.index[1] = "d" #TypeError

In [221]:
# Immutability makes it easier to share Index objects among data structures
labels = pd.Index(np.arange(6))

In [229]:
obj2 = pd.Series([1.5, -2.5, 6.6, 2.6, 6.4, 3.0], index = labels)
obj2

0    1.5
1   -2.5
2    6.6
3    2.6
4    6.4
5    3.0
dtype: float64

In [231]:
obj2.index is labels

True

In [233]:
frame3

State,Ohio,Nevada
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,1.5,
2001,1.7,2.4
2002,3.6,2.9


In [237]:
print(frame3.columns)
"Ohio" in frame3.columns

Index(['Ohio', 'Nevada'], dtype='object', name='State')


True

In [239]:
print(frame3.index)
2003 in frame3.index

Index([2000, 2001, 2002], dtype='int64', name='Year')


False

In [243]:
# Unlike python sets, pandas Index CAN contain duplicates
pd.Index(["foo", "foo", "bar", "bar"])

Index(['foo', 'foo', 'bar', 'bar'], dtype='object')

# 5.2 Essential Functionality

In [246]:
# this section will walk us through the fundamental mechanics of interacting with the data contained in Series/DataFrame

In [248]:
# Reindexing
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=["d", "b", "a", "c"])
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [252]:
# calling reindex method on this series rearanges the data according to the new index -> if missing values if some index values were not present
obj2 = obj.reindex(["a", "b", "c", "d", "e"])
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [254]:
# for ordered data like "time-series" we may want to do some interpolation or filling of values when reindexing
obj3 = pd.Series(["blue", "yellow", "orange"], index=[0,2,4])
obj3

0      blue
2    yellow
4    orange
dtype: object

In [256]:
# method parameter allows us to fill missing values -> in this case I used ffill, which stands for forward-fills
obj4 = obj3.reindex(np.arange(6), method="ffill") #bfill fills backwards
obj4

0      blue
1      blue
2    yellow
3    yellow
4    orange
5    orange
dtype: object

In [258]:
# with DataFrame "reindex" can alter the row index, columns or both
# when passed only a sequence, it reindexes the rows in the result
frame = pd.DataFrame(np.arange(9).reshape((3,3)), index = ["a", "c", "d"], columns=["Ohio", "Texas", "California"])
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [260]:
frame2 = frame.reindex(index=["a", "b", "c", "d"])
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [262]:
# the columns can be reindex with columns keyword
states = ["Texas", "Utah", "California"]
frame.reindex(columns=states) # because Ohio was not in "states" list, it was dropped 

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


In [266]:
# another way to reindex a particular axis is to pass the new axis labels as a positional argument and then specify the axis to reindex with the axis keyword
frame.reindex(states, axis="columns")

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


In [274]:
# Droping Entries from an axis 
obj = pd.Series(np.arange(5), index=["a", "b", "c", "d", "e"])
obj

a    0
b    1
c    2
d    3
e    4
dtype: int64

In [276]:
new_obj = obj.drop("c")
new_obj

a    0
b    1
d    3
e    4
dtype: int64

In [284]:
cos = obj.drop(["d", "c"])
cos

a    0
b    1
e    4
dtype: int64

In [288]:
# with DataFrame, index values can be deleted from either axis
data = pd.DataFrame(np.arange(16).reshape((4, 4)), index=["Ohio", "Colorado", "Utah", "Oregon"], columns=["one", "two", "three", "four"])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
Oregon,12,13,14,15


In [292]:
# calling "drop" with a sequence of labels will drop values from the row labels (axis 0)
print(data.drop(index=["Colorado", "Utah"]))


        one  two  three  four
Ohio      0    1      2     3
Oregon   12   13     14    15


In [294]:
# to drop labels from the columns, instead use "columns" keyword
print(data.drop(columns=["two"]))

          one  three  four
Ohio        0      2     3
Colorado    4      6     7
Utah        8     10    11
Oregon     12     14    15


In [296]:
# you can also drop values from the columns by passing axis=1 (which is like numpy) or axis="columns"
print(data.drop("two", axis=1))

          one  three  four
Ohio        0      2     3
Colorado    4      6     7
Utah        8     10    11
Oregon     12     14    15


In [300]:
print(data.drop(["two", "three"], axis="columns"))

          one  four
Ohio        0     3
Colorado    4     7
Utah        8    11
Oregon     12    15


## Indexing, Selection and Filtering

In [305]:
# Series indexing work analogously to NumPy indexing, except you can also use Series index values 
obj = pd.Series(np.arange(4.), index=["a", "b", "c", "d"])
obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [311]:
# now, how do index -> pretty straightforward
print(obj["b"])
print(obj[1]) # as we can see the result is exactly the same 
print(obj[2:4])

1.0
1.0
c    2.0
d    3.0
dtype: float64


  print(obj[1]) # as we can see the result is exactly the same


In [313]:
# order matters 
obj[["b", "c", "a"]]

b    1.0
c    2.0
a    0.0
dtype: float64

In [315]:
obj[[1,3]]

  obj[[1,3]]


b    1.0
d    3.0
dtype: float64

In [317]:
# while we can select data by label this way, the preffered way is to use loc operator
obj.loc[["b", "a", "c"]]

b    1.0
a    0.0
c    2.0
dtype: float64

In [321]:
# The reason to prefer loc is normal []-indexing, will treat integers as labels if index contains integers
# BEHAVIOUR DIFFERS DEPENDING ON THE DATA TYPE OF THE INDEX
obj1 = pd.Series(np.arange(3), index=[2,0,1])
obj2 = pd.Series(np.arange(3), index=["a", "b", "c"])

print(obj1)
print()
print(obj2)

2    0
0    1
1    2
dtype: int64

a    0
b    1
c    2
dtype: int64


In [329]:
print(obj1[[0,1,2]])

0    1
1    2
2    0
dtype: int64


In [325]:
obj2[[0,1,2]]

  obj2[[0,1,2]]


a    0
b    1
c    2
dtype: int64

In [335]:
# in order to access elements with integers -> we need to use iloc
obj1.iloc[[0,1,2]]

2    0
0    1
1    2
dtype: int64

In [337]:
obj2.iloc[[0,1,2]]

a    0
b    1
c    2
dtype: int64

In [339]:
obj2.loc[["a", "b"]]

a    0
b    1
dtype: int64

In [341]:
# CAUTION: we can also slice with labels (obj2["b":"c"]), but it works differently from Python slicing in that the last element is inclusive.

In [345]:
# assigning values using these methods modifies the corresponding section of the Series
obj2.loc["b":"c"] = 5
obj2

a    0
b    5
c    5
dtype: int64