In [20]:
################################  5.1 INTODUCTION TO DATA STRUCTURES PANDAS #############################

#@@ OBJECT SERIES

import pandas as pd

obj = pd.Series([4, 7, -5, 3])
"""0    4
   1    7
   2   -5
   3    3"""

obj.array # <NumpyExtensionArray>[4, 7, -5, 3]   wrapper about numpy array
obj.index # RangeIndex(start=0, stop=4, step=1)

obj = pd.Series([4, 7, -5, 3], index=["d", "b", "a", "c"])
"""d    4
   b    7
   a   -5
   c    3"""
# here we use mark as index
obj["a"] # -5
obj[["b", "c", "d"]]
"""b    7
   c    3
   d    4"""

# as well as in NumPy
obj[obj > 0] 
"""
d    4
b    7
c    3"""

obj * 2

import numpy as np

np.exp(obj)

"b" in obj # True

sdata = {"Ohio":35000, "Texas":71000, "Oregon":16000, "Utah":5000} # dictionary
obj2 = pd.Series(sdata) # dictionary in Series
"""
Ohio      35000
Texax     71000
Oregon    16000
Utah       5000"""

obj2.to_dict() # {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}

# we can change the key order (Utah - not include, California - added with NaN)
order = ["California", "Texas", "Ohio", "Oregon"]
obj3 = pd.Series(sdata, index = order)
"""
California        NaN
Texas         71000.0
Ohio          35000.0
Oregon        16000.0 """

# looking for null
pd.isna(obj3)
"""
California     True
Texas         False
Ohio          False
Oregon        False"""

pd.notna(obj3)
"""
California    False
Texas          True
Ohio           True
Oregon         True"""

obj3.isna()
"""
California     True
Texas         False
Ohio          False
Oregon        False"""

obj2
"""
Ohio      35000
Texas     71000
Oregon    16000
Utah       5000"""

obj3
"""
California        NaN
Texas         71000.0
Ohio          35000.0
Oregon        16000.0"""

# full join (in SQL) + sum()
obj2 + obj3
"""
California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN"""

# atribut of Series - name
obj3.name = "Hello world!"
obj3.index.name = 'I love Python'
obj3
"""
I love Python
California        NaN
Texas         71000.0
Ohio          35000.0
Oregon        16000.0
Name: Hello world!, dtype: float64"""

# change index (instead get Serias from dictionary)
obj = pd.Series([4, 7, -5, 3])
"""0    4
   1    7
   2   -5
   3    3"""

obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan']
"""
Bob      4
Steve    7
Jeff    -5
Ryan     3"""


#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ OBJECT DATAFRAME @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@#

data = {'state':['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
        'year':[2000, 2001, 2002, 2001, 2002, 2003],
        'pop':[1.5, 1.7, 3.6, 2.4, 2.9, 3.2]
       }
frame = pd.DataFrame(data)
""" 	
   state 	year 	pop
0 	Ohio 	2000 	1.5
1 	Ohio 	2001 	1.7
2 	Ohio 	2002 	3.6
3 	Nevada 	2001 	2.4
4 	Nevada 	2002 	2.9
5 	Nevada 	2003 	3.2"""

frame.head() # as well as top 5 in SQL
"""state 	year 	pop
0 	Ohio 	2000 	1.5
1 	Ohio 	2001 	1.7
2 	Ohio 	2002 	3.6
3 	Nevada 	2001 	2.4
4 	Nevada 	2002 	2.9"""

frame.tail() # last 5 rows

# set column order ---> column name already exists in 'data'
pd.DataFrame(data, columns = ['year', 'state', 'pop'])
"""year 	state 	pop
0 	2000 	Ohio 	1.5
1 	2001 	Ohio 	1.7..."""

# if column name not exists in 'data' it's filling up by NaN
frame2 = pd.DataFrame(data, columns = ['state', 'pop', 'year', 'debt'])
"""state 	pop 	year 	debt
0 	Ohio 	1.5 	2000 	NaN
1 	Ohio 	1.7 	2001 	NaN..."""

# get COLUMN as Series-object
frame2['state']
"""
0      Ohio
1      Ohio
2      Ohio
3    Nevada
4    Nevada
5    Nevada"""

frame2.state # name must comply with the naming rules
"""
0      Ohio
1      Ohio
2      Ohio
3    Nevada
4    Nevada
5    Nevada"""

# get ROW
frame.loc[1]
"""
state    Ohio
year     2001
pop       1.7"""

frame.iloc[2]
"""
state    Ohio
year     2002
pop       3.6"""

# modify column values
frame2['debt'] = 16.5
"""
   state 	pop 	year 	debt
0 	Ohio 	1.5 	2000 	16.5
1 	Ohio 	1.7 	2001 	16.5
2 	Ohio 	3.6 	2002 	16.5"""

np.arange(6.) # array([0., 1., 2., 3., 4., 5.])
frame2['debt'] = np.arange(6.)  # lenght of arange must be equal rows count
"""state 	pop 	year 	debt
0 	Ohio 	1.5 	2000 	0.0
1 	Ohio 	1.7 	2001 	1.0
2 	Ohio 	3.6 	2002 	2.0..."""

val = pd.Series([1.2, -1.5, -1.7], index = ['two', 2, 5])  # lenght of Series NOT necessary be equal rows count
frame2['debt'] = val
"""state 	pop 	year  	debt
0 	Ohio 	1.5 	2000 	NaN
1 	Ohio 	1.7 	2001 	NaN
2 	Ohio 	3.6 	2002 	-1.5
3 	Nevada 	2.4 	2001 	NaN
4 	Nevada 	2.9 	2002 	NaN
5 	Nevada 	3.2 	2003 	-1.7"""

# create new column
frame2['new_column'] = val 
""" state 	pop 	year 	debt 	new_column
0 	Ohio 	1.5 	2000 	NaN 	NaN
1 	Ohio 	1.7 	2001 	NaN 	NaN
2 	Ohio 	3.6 	2002 	-1.5 	-1.5
3 	Nevada 	2.4 	2001 	NaN 	NaN
4 	Nevada 	2.9 	2002 	NaN 	NaN
5 	Nevada 	3.2 	2003 	-1.7 	-1.7"""

frame2['eastern'] = frame2['state'] == 'Ohio'
"""state 	pop 	year 	 dept 	new_column 	eastern
0 	Ohio 	1.5 	2000 	 NaN 	NaN 	True
1 	Ohio 	1.7 	2001 	 NaN 	NaN 	True
2 	Ohio 	3.6 	2002 	 -1.5 	-1.5 	True
3 	Nevada 	2.4 	2001 	 NaN 	NaN 	False
4 	Nevada 	2.9 	2002 	 NaN 	NaN 	False
5 	Nevada 	3.2 	2003 	 -1.7 	-1.7 	False"""

# frame2.new_column2 = val  # ERROR!

# delete column
del frame2['eastern']
frame2.columns # Index(['state', 'pop', 'year', 'debt', 'new_column']  ---> get column list

# Series object from DataFrame is the VIEW !
view_column = frame['pop']
"""
0    1.5
1    1.7
2    3.6
3    2.4
4    2.9
5    3.2"""

view_column[3] = 777.77
frame # it changed
"""
state 	year 	pop
0 	Ohio 	2000 	1.50
1 	Ohio 	2001 	1.70
2 	Ohio 	2002 	3.60
3 	Nevada 	2001 	777.77
4 	Nevada 	2002 	2.90
5 	Nevada 	2003 	3.20"""

# dictionary of dictionary
populations = {'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6},
               'Nevada': {2000: 2.4, 2002: 2.9}
              }
frame3 = pd.DataFrame(populations)
"""
        Ohio 	Nevada
2000 	1.5 	2.4
2001 	1.7 	NaN
2002 	3.6 	2.9"""

# transponition (pivot in SQL)  ----> we can lose type of columns if they differ !
frame3.T
""" 	2000 	2001 	2002
Ohio 	1.5 	1.7 	3.6
Nevada 	2.4 	NaN 	2.9"""

# index as mask for source dictionary
# analog operation with Series
pd.DataFrame(populations, index = [2000, 2002, 2003])
""" 	Ohio 	Nevada
2000 	1.5 	2.4
2002 	3.6 	2.9
2003 	NaN 	NaN"""

# designer DataFrame from source DataFrame through the dictionary
frame3
"""    Ohio 	Nevada
2000 	1.5 	2.4
2001 	1.7 	NaN
2002 	3.6 	2.9"""

frame3['Ohio'][:-1] # if :0 then all rows ---> :-1 without last row
"""
2000    1.5
2001    1.7"""

frame3['Nevada'][:2]
"""
2000    2.4
2001    NaN"""

pdata = {'Ohio': frame3['Ohio'][:-1],
         'Nevada': frame3['Nevada'][2:]
        }
pd.DataFrame(pdata) # insert by columns with row constraints
"""
       Ohio 	Nevada
2000 	1.5 	NaN
2001 	1.7 	NaN
2002 	NaN 	2.9"""

frame3
"""Ohio 	Nevada
2000 	1.5 	2.4
2001 	1.7 	NaN
2002 	3.6 	2.9"""

frame3.index.name = 'year:'
frame3.columns.name = 'state:'
frame3
"""state: 	Ohio 	Nevada
year: 		
2000 	1.5 	2.4
2001 	1.7 	NaN
2002 	3.6 	2.9"""

frame3.to_numpy() # return ndarray
"""array([[1.5, 2.4],
       [1.7, nan],
       [3.6, 2.9]])"""

frame2
"""state 	pop 	year 	debt 	new_column
0 	Ohio 	1.5 	2000 	NaN 	NaN
1 	Ohio 	1.7 	2001 	NaN 	NaN
2 	Ohio 	3.6 	2002 	-1.5 	-1.5
3 	Nevada 	2.4 	2001 	NaN 	NaN
4 	Nevada 	2.9 	2002 	NaN 	NaN
5 	Nevada 	3.2 	2003 	-1.7 	-1.7"""

frame2.to_numpy() # get the most common type = object
"""array([['Ohio', 1.5, 2000, nan, nan],
       ['Ohio', 1.7, 2001, nan, nan],
       ['Ohio', 3.6, 2002, -1.5, -1.5],
       ['Nevada', 2.4, 2001, nan, nan],
       ['Nevada', 2.9, 2002, nan, nan],
       ['Nevada', 3.2, 2003, -1.7, -1.7]], dtype=object)"""

#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@  INDEXING OBJECTS @@@@@@@@@@@@@@@@@@@@@@@@@@#

obj = pd.Series(np.arange(3), index = ['a', 'b', 'c'])
"""
a    0
b    1
c    2"""

index = obj.index # Index(['a', 'b', 'c'], dtype='object')
index[1:] # Index(['b', 'c'], dtype='object')
# index[1] = 'd' # ERROR! can't change indexing object

# create indexing object
labels = pd.Index(np.arange(3)) # Index([0, 1, 2], dtype='int64')

obj2 = pd.Series([1.5, -2.5, 0], index = labels)
"""
0    1.5
1   -2.5
2    0.0"""

obj2.index is labels # True

frame3
"""state: 	Ohio 	Nevada
year: 		
2000 	1.5 	2.4
2001 	1.7 	NaN
2002 	3.6 	2.9"""

frame3.columns # Index(['Ohio', 'Nevada'], dtype='object', name='state:')
'Ohio' in frame3.columns # True
2003 in frame3.index # False

# index can be not unique
pd.Index(['foo', 'foo', 'bar', 'bar']) # Index(['foo', 'foo', 'bar', 'bar'], dtype='object')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  view_column[3] = 777.77


Index(['foo', 'foo', 'bar', 'bar'], dtype='object')