# Getting started with Pandas

In [2]:
import numpy as np 
import pandas as pd 

In [8]:
# A Series is a one-dimensional, array-like object: a sequence of values sharing one dtype,
# paired with an array of data labels called its index.
# The simplest Series is built from the data alone; the labels then default to 0..n-1.
obj = pd.Series(data=[4, 7, -5, 3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [12]:
# The underlying values and the labels of a Series are exposed through its
# `array` and `index` attributes (plain attributes, not method calls).
print(obj.array, obj.index, sep="\n")

<NumpyExtensionArray>
[4, 7, -5, 3]
Length: 4, dtype: int64
RangeIndex(start=0, stop=4, step=1)


In [14]:
# Usually every data point should carry a meaningful label,
# so the labels are passed explicitly through the `index` argument.
obj2 = pd.Series(data=[4, 7, -5, 3], index=["d", "b", "a", "c"])
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [18]:
# Because the labels were given explicitly, the index is now a generic Index
# rather than the default RangeIndex.
obj2.index

Index(['d', 'b', 'a', 'c'], dtype='object')

In [26]:
# Index labels can be used to select a single value or, passed as a list,
# several values in whatever order the list specifies.
print(obj2["a"], obj2[["c", "d", "b"]], sep="\n")

-5
c    3
d    4
b    7
dtype: int64


In [28]:
# NumPy-style operations (Boolean-mask filtering, scalar multiplication, math functions)
# keep every value attached to its index label.
is_positive = obj2 > 0
obj2[is_positive]

d    4
b    7
c    3
dtype: int64

In [32]:
# Scalar multiplication: every value is doubled while the labels stay untouched.
2 * obj2

d     8
b    14
a   -10
c     6
dtype: int64

In [34]:
# NumPy functions such as np.exp accept a Series directly; the result is again a Series with the same labels.
np.exp(obj2)

d      54.598150
b    1096.633158
a       0.006738
c      20.085537
dtype: float64

In [40]:
# A Series can also be viewed as a fixed-length, ordered dictionary that maps index labels to data values,
# so the `in` operator checks whether a label is present in the index.
for label in ("b", "e"):
    print(label in obj2)

True
False


In [44]:
# A Python dict can be passed straight to the Series constructor:
# its keys become the index labels and its values become the data.
sdata = {
    "Ohio": 35000,
    "Texas": 71000,
    "Oregon": 16000,
    "Utah": 5000,
}
obj3 = pd.Series(data=sdata)
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [48]:
# The `to_dict` method converts a Series back into a plain Python dict (index label -> value).
converted_series = obj3.to_dict()
converted_series

{'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}

In [50]:
# An explicit index decides which dict keys are used and in what order:
# the three labels found in sdata receive their values, the label with no entry ("California") becomes NaN,
# and the dict key that is not listed in the index ("Utah") is left out.
states = ["California", "Ohio", "Oregon", "Texas"]
obj4 = pd.Series(data=sdata, index=states)
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [54]:
# Missing data should be detected with the top-level `isna` and `notna` functions.
print(pd.isna(obj4), pd.notna(obj4), sep="\n\n")

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool


In [56]:
# The same two checks are also available as Series instance methods.
print(obj4.isna(), obj4.notna(), sep="\n\n")

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool


In [62]:
# A useful Series feature is that arithmetic operations automatically align the operands by index label.
print(obj3, obj4, sep="\n", end="\n\n")
# The result carries the union of both indexes, somewhat like a database join;
# labels present in only one operand ("California", "Utah") yield NaN.
obj3 + obj4

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64



California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

In [66]:
# Both the Series object and its Index have a `name` attribute, which integrates with other pandas functionality.
obj4.index.name = "State"
obj4.name = "Population"

In [70]:
# Read both names back: first the Series name, then the index name.
print(obj4.name, obj4.index.name, sep="\n")

Population
State


In [74]:
# Note the repr: the index name is printed above the labels and the Series name at the bottom.
obj4

State
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: Population, dtype: float64

In [78]:
# A Series index can be altered in place by assignment (here: four new labels for the four values).
# The Series is printed first so the labels before and after the change can be compared.
print(obj, end="\n\n")
obj.index = ["Piotr", "Michał", "Mikołaj", "Karol"]
obj

0    4
1    7
2   -5
3    3
dtype: int64



Piotr      4
Michał     7
Mikołaj   -5
Karol      3
dtype: int64

# DataFrame

In [83]:
# A DataFrame represents a rectangular table of data: an ordered, named collection of columns, each of which can hold a different value type
# (numeric, string, boolean, etc.).
# A DataFrame has both a row index and a column index; it can be thought of as a dictionary of Series that all share the same index.

In [87]:
# There are many ways to construct a DataFrame; the most common is probably
# a dict of equal-length lists or NumPy arrays, where each key becomes a column.
data = {
    "state": ["Ohio", "Ohio", "Ohio", "Nevada", "Nevada", "Nevada"],
    "year": [2000, 2001, 2002, 2001, 2002, 2003],
    "pop": [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}

frame = pd.DataFrame(data=data)
# head() shows only the first five rows
frame.head()

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [89]:
# When a sequence of columns is specified, the DataFrame's columns are arranged in exactly that order.
frame = pd.DataFrame(data=data, columns=["pop", "year", "state"])
# tail() shows the last five rows
frame.tail()

Unnamed: 0,pop,year,state
1,1.7,2001,Ohio
2,3.6,2002,Ohio
3,2.4,2001,Nevada
4,2.9,2002,Nevada
5,3.2,2003,Nevada


In [91]:
# A requested column that is not contained in the dict ("debt") is still created, filled entirely with NaN.
frame = pd.DataFrame(data=data, columns=["pop", "year", "state", "debt"])
frame.head()

Unnamed: 0,pop,year,state,debt
0,1.5,2000,Ohio,
1,1.7,2001,Ohio,
2,3.6,2002,Ohio,
3,2.4,2001,Nevada,
4,2.9,2002,Nevada,


In [93]:
# The column labels of a DataFrame are themselves stored as an Index.
frame.columns

Index(['pop', 'year', 'state', 'debt'], dtype='object')

In [105]:
# A column can be retrieved either with dict-like notation, frame["year"], or with attribute ("dot") notation, frame.year.
# Both return the same Series: it keeps the DataFrame's index, and its `name` attribute is set to the column label.
# NOTE: dot notation only works when the label is a valid Python identifier that does not clash with an existing
# DataFrame attribute or method -- e.g. frame.pop is the DataFrame.pop method, not the "pop" column.
print(frame["year"], frame.year, sep="\n\n")

0    2000
1    2001
2    2002
3    2001
4    2002
5    2003
Name: year, dtype: int64

0    2000
1    2001
2    2002
3    2001
4    2002
5    2003
Name: year, dtype: int64


In [113]:
# Rows can be retrieved by index label with the `loc` attribute, or by integer position with the `iloc` attribute.
# Each row comes back as a Series whose index holds the column labels.
print(frame.loc[1], frame.iloc[2], sep="\n\n")

pop       1.7
year     2001
state    Ohio
debt      NaN
Name: 1, dtype: object

pop       3.6
year     2002
state    Ohio
debt      NaN
Name: 2, dtype: object


In [115]:
# Columns can be modified by assignment: a scalar is broadcast to every row, an array supplies one value per row.
# Bracket notation is used for the assignment on purpose: `frame.debt = ...` only updates a column that
# already exists, and otherwise silently creates a plain Python attribute instead of a column.
frame["debt"] = 16.5
frame

Unnamed: 0,pop,year,state,debt
0,1.5,2000,Ohio,16.5
1,1.7,2001,Ohio,16.5
2,3.6,2002,Ohio,16.5
3,2.4,2001,Nevada,16.5
4,2.9,2002,Nevada,16.5
5,3.2,2003,Nevada,16.5


In [123]:
# Assigning an array gives each row its own value, so it is built with one element per row (len(frame)).
# Bracket notation is used instead of `frame.debt = ...`, which would not create the column if it were missing.
frame["debt"] = np.arange(len(frame))
frame

Unnamed: 0,pop,year,state,debt
0,1.5,2000,Ohio,0
1,1.7,2001,Ohio,1
2,3.6,2002,Ohio,2
3,2.4,2001,Nevada,3
4,2.9,2002,Nevada,4
5,3.2,2003,Nevada,5


In [131]:
# When a Series is assigned, its labels are aligned to the DataFrame's index:
# rows whose label is present in the Series (2, 4, 5) get its value, every other row gets a missing value.
val = pd.Series(data=[-1.2, -1.5, -1.7], index=[2, 4, 5])
frame["debt"] = val
frame

Unnamed: 0,pop,year,state,debt
0,1.5,2000,Ohio,
1,1.7,2001,Ohio,
2,3.6,2002,Ohio,-1.2
3,2.4,2001,Nevada,
4,2.9,2002,Nevada,-1.5
5,3.2,2003,Nevada,-1.7
