In [2]:
import pandas as pd
import numpy as np

## Pandas Data Structure and Operation


### Series

In [4]:
#  The Series is the object of the pandas library designed to represent one-dimensional data structures

In [5]:
s = pd.Series([12,-4,7,9])

In [6]:
s

0    12
1    -4
2     7
3     9
dtype: int64

In [31]:
# If you do not specify any index during the definition of the Series, by default, pandas will assign 
# numerical values increasing from 0 as labels.

In [8]:
s = pd.Series([12,-4,7,9], index=['a','b','c','d'])

In [9]:
s

a    12
b    -4
c     7
d     9
dtype: int64

In [10]:
s.values

array([12, -4,  7,  9], dtype=int64)

In [11]:
s.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [14]:
# Position-based access: use .iloc explicitly. A bare integer key on a Series
# with string labels relies on a deprecated positional fallback.
s.iloc[2]

  s[2]


7

In [15]:
s['b']

-4

In [16]:
s[0:2]

a    12
b    -4
dtype: int64

In [17]:
s[['b','c']]

b   -4
c    7
dtype: int64

In [33]:
# How are indexes helpful?
# 1. Label-Based Access: With a meaningful index you can select data by label using .loc[]; .iloc[] is the position-based counterpart and ignores the labels.
# 2. Alignment of Data: Operations between two DataFrames or between a DataFrame and a Series automatically align based on the index
# 3. Efficient Merging and Joining: When merging or joining DataFrames, you can specify the index as the key for the operation, which can be more efficient than using regular columns.
# 4. Resampling and Frequency Conversion: The index plays a crucial role in time-related operations, such as resampling and frequency conversion. Pandas has dedicated functions like resample that work seamlessly with time-based indices.
# 5. Multi-level Indexing: Pandas supports multi-level indexing, allowing you to create hierarchical indices.

# While you can certainly perform aggregations on a regular column, using the index can often lead to more concise, readable, and efficient code, especially in the context of time-series data and when performing operations involving multiple DataFrames. It provides a structured and powerful way to organize and manipulate data in pandas.

In [18]:
# Position-based assignment: use .iloc explicitly. A bare integer key on a Series
# with string labels relies on a deprecated positional fallback.
s.iloc[1] = 0

  s[1] = 0


In [19]:
s

a    12
b     0
c     7
d     9
dtype: int64

In [20]:
s['b'] = 1

In [21]:
s

a    12
b     1
c     7
d     9
dtype: int64

In [22]:
#  You can define new Series starting with NumPy arrays or existing Series.

In [23]:
arr = np.array([1,2,3,4])

In [24]:
s3 = pd.Series(arr)

In [25]:
s3

0    1
1    2
2    3
3    4
dtype: int32

In [27]:
# Filtering the values
s[s>8]

a    12
d     9
dtype: int64

In [29]:
s/2

a    6.0
b    0.5
c    3.5
d    4.5
dtype: float64

In [30]:
np.log(s)

a    2.484907
b    0.000000
c    1.945910
d    2.197225
dtype: float64

In [34]:
serd = pd.Series([1,0,2,1,2,3], index=['white','white','blue','green','green','yellow'])

In [35]:
serd

white     1
white     0
blue      2
green     1
green     2
yellow    3
dtype: int64

In [36]:
serd.unique()

array([1, 0, 2, 3], dtype=int64)

In [37]:
serd.value_counts()

1    2
2    2
0    1
3    1
Name: count, dtype: int64

In [38]:
serd.isin([0,3])

white     False
white      True
blue      False
green     False
green     False
yellow     True
dtype: bool

In [40]:
# Filtering the rows
serd[serd.isin([0,3])]

white     0
yellow    3
dtype: int64

In [41]:
# NaN values -> NaN - Not a Number
# The special value NaN (Not a Number) is used within pandas data structures to mark a field
# that is empty or whose value cannot be defined numerically.

In [42]:
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
# The missing value forces the whole Series to float64.
s2 = pd.Series([5, -3, np.nan, 14])

In [43]:
s2

0     5.0
1    -3.0
2     NaN
3    14.0
dtype: float64

In [45]:
s2.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [46]:
s2[s2.notnull()]

0     5.0
1    -3.0
3    14.0
dtype: float64

In [47]:
mydict = {'red': 2000, 'blue': 1000, 'yellow': 500, 'orange': 1000}

In [48]:
myseries = pd.Series(mydict)

In [49]:
myseries

red       2000
blue      1000
yellow     500
orange    1000
dtype: int64

In [50]:
mydict2 = {'red':400,'yellow':1000,'black':700}

In [51]:
myseries2 = pd.Series(mydict2)

In [52]:
myseries2

red        400
yellow    1000
black      700
dtype: int64

In [53]:
myseries + myseries2

black        NaN
blue         NaN
orange       NaN
red       2400.0
yellow    1500.0
dtype: float64

### DataFrame

In [55]:
# The DataFrame is a tabular data structure, very similar to a spreadsheet.
# It is designed to extend the one-dimensional Series to two dimensions: labelled rows and labelled columns.

In [56]:
# Column-oriented input: each key becomes a column, each list holds that column's values.
data = {
    'color': ['blue', 'green', 'yellow', 'red', 'white'],
    'object': ['ball', 'pen', 'pencil', 'paper', 'mug'],
    'price': [1.2, 1.0, 0.6, 0.9, 1.7],
}

In [57]:
frame = pd.DataFrame(data)

In [58]:
frame

Unnamed: 0,color,object,price
0,blue,ball,1.2
1,green,pen,1.0
2,yellow,pencil,0.6
3,red,paper,0.9
4,white,mug,1.7


In [59]:
frame2 = pd.DataFrame(data, columns=['object','price'])

In [60]:
frame2

Unnamed: 0,object,price
0,ball,1.2
1,pen,1.0
2,pencil,0.6
3,paper,0.9
4,mug,1.7


In [61]:
frame2 = pd.DataFrame(data, index=['one','two','three','four','five'])

In [62]:
frame2

Unnamed: 0,color,object,price
one,blue,ball,1.2
two,green,pen,1.0
three,yellow,pencil,0.6
four,red,paper,0.9
five,white,mug,1.7


In [63]:
# 4x4 grid of the integers 0..15 with explicit row and column labels.
frame3 = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    index=['red', 'blue', 'yellow', 'white'],
    columns=['ball', 'pen', 'pencil', 'paper'],
)

In [64]:
frame3

Unnamed: 0,ball,pen,pencil,paper
red,0,1,2,3
blue,4,5,6,7
yellow,8,9,10,11
white,12,13,14,15


In [65]:
frame.columns

Index(['color', 'object', 'price'], dtype='object')

In [66]:
frame['price']

0    1.2
1    1.0
2    0.6
3    0.9
4    1.7
Name: price, dtype: float64

In [67]:
frame.price

0    1.2
1    1.0
2    0.6
3    0.9
4    1.7
Name: price, dtype: float64

In [69]:
frame.iloc[2]

color     yellow
object    pencil
price        0.6
Name: 2, dtype: object

In [79]:
# Returns the rows at positions 0 and 1 (the end of the slice is excluded), just like slicing a list
frame[0:2]

Unnamed: 0,color,object,price
0,blue,ball,1.2
1,green,pen,1.0


In [80]:
frame[1:4]

Unnamed: 0,color,object,price
1,green,pen,1.0
2,yellow,pencil,0.6
3,red,paper,0.9


In [81]:
# Single-step label lookup (row label 3, column 'object') instead of chained
# indexing, which first materialises the whole column and then indexes it.
frame.loc[3, 'object']

'paper'

In [83]:
frame.index.name = 'id'
frame.columns.name = 'item'

In [84]:
frame

item,color,object,price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,blue,ball,1.2
1,green,pen,1.0
2,yellow,pencil,0.6
3,red,paper,0.9
4,white,mug,1.7


In [85]:
frame['new'] = 12
# Careful with the initialization: assigning a single value sets every row of the column to that value

In [86]:
frame

item,color,object,price,new
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,blue,ball,1.2,12
1,green,pen,1.0,12
2,yellow,pencil,0.6,12
3,red,paper,0.9,12
4,white,mug,1.7,12


In [87]:
frame['new'] = [3.0,1.3,2.2,0.8,1.1]

In [88]:
frame

item,color,object,price,new
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,blue,ball,1.2,3.0
1,green,pen,1.0,1.3
2,yellow,pencil,0.6,2.2
3,red,paper,0.9,0.8
4,white,mug,1.7,1.1


In [89]:
ser = pd.Series(np.arange(5))

In [90]:
ser

0    0
1    1
2    2
3    3
4    4
dtype: int32

In [91]:
frame["new"] = ser

In [92]:
frame

item,color,object,price,new
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,blue,ball,1.2,0
1,green,pen,1.0,1
2,yellow,pencil,0.6,2
3,red,paper,0.9,3
4,white,mug,1.7,4


In [93]:
frame

item,color,object,price,new
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,blue,ball,1.2,0
1,green,pen,1.0,1
2,yellow,pencil,0.6,2
3,red,paper,0.9,3
4,white,mug,1.7,4


In [94]:
frame.isin([1.0,'pen'])

item,color,object,price,new
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,False,False,False,False
1,False,True,True,True
2,False,False,False,False
3,False,False,False,False
4,False,False,False,False


In [95]:
frame[frame.isin([1.0,'pen'])]

item,color,object,price,new
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,,,,
1,,pen,1.0,1.0
2,,,,
3,,,,
4,,,,


In [96]:
del frame['new']

In [97]:
frame

item,color,object,price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,blue,ball,1.2
1,green,pen,1.0
2,yellow,pencil,0.6
3,red,paper,0.9
4,white,mug,1.7


In [100]:
# There is a great deal more that can be done with the pandas methods.

### Other Functionalities on Indexes

In [101]:
ser = pd.Series([2,5,7,4], index=['one','two','three','four'])

In [103]:
ser

one      2
two      5
three    7
four     4
dtype: int64

In [107]:
ser.reindex(['three','four','five','one'])

three    7.0
four     4.0
five     NaN
one      2.0
dtype: float64

In [105]:
ser

one      2
two      5
three    7
four     4
dtype: int64

In [106]:
# As you can see from the value returned, the order of the labels has been completely rearranged. The 
# value corresponding to the label ‘two’ has been dropped and a new label ‘five’ is present in the Series.

In [110]:
ser = pd.Series(np.arange(4.), index=['red','blue','yellow','white'])

In [111]:
ser

red       0.0
blue      1.0
yellow    2.0
white     3.0
dtype: float64

In [112]:
ser.drop('yellow')

red      0.0
blue     1.0
white    3.0
dtype: float64

In [113]:
ser.drop(['blue','white'])

red       0.0
yellow    2.0
dtype: float64

In [114]:
# Rebuild the 4x4 demo frame (integers 0..15) used by the drop examples.
frame = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    index=['red', 'blue', 'yellow', 'white'],
    columns=['ball', 'pen', 'pencil', 'paper'],
)

In [115]:
frame

Unnamed: 0,ball,pen,pencil,paper
red,0,1,2,3
blue,4,5,6,7
yellow,8,9,10,11
white,12,13,14,15


In [116]:
frame.drop(['blue','yellow'])

Unnamed: 0,ball,pen,pencil,paper
red,0,1,2,3
white,12,13,14,15


In [117]:
frame.drop(['pen','pencil'],axis=1)

Unnamed: 0,ball,paper
red,0,3
blue,4,7
yellow,8,11
white,12,15


In [118]:
# Labels are passed via the explicit index= keyword, as in the other cells.
s1 = pd.Series([3, 2, 5, 1], index=['white', 'yellow', 'green', 'blue'])

In [119]:
# Second operand for the alignment demo; its labels only partly overlap with s1.
s2 = pd.Series([1, 4, 7, 2, 1], index=['white', 'yellow', 'black', 'blue', 'brown'])

In [121]:
s1

white     3
yellow    2
green     5
blue      1
dtype: int64

In [122]:
s2

white     1
yellow    4
black     7
blue      2
brown     1
dtype: int64

In [120]:
s1 + s2

black     NaN
blue      3.0
brown     NaN
green     NaN
white     4.0
yellow    6.0
dtype: float64

In [123]:
# Left operand for the DataFrame alignment demo: 4x4 grid of 0..15.
frame1 = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    index=['red', 'blue', 'yellow', 'white'],
    columns=['ball', 'pen', 'pencil', 'paper'],
)

In [124]:
# Right operand for the alignment demo: 4x3 grid of 0..11 whose row and column
# labels only partly overlap with frame1.
frame2 = pd.DataFrame(
    np.arange(12).reshape(4, 3),
    index=['blue', 'green', 'white', 'yellow'],
    columns=['mug', 'pen', 'ball'],
)

In [125]:
frame1

Unnamed: 0,ball,pen,pencil,paper
red,0,1,2,3
blue,4,5,6,7
yellow,8,9,10,11
white,12,13,14,15


In [126]:
frame2

Unnamed: 0,mug,pen,ball
blue,0,1,2
green,3,4,5
white,6,7,8
yellow,9,10,11


In [127]:
frame1 + frame2

Unnamed: 0,ball,mug,paper,pen,pencil
blue,6.0,,,6.0,
green,,,,,
red,,,,,
white,20.0,,,20.0,
yellow,19.0,,,19.0,


In [128]:
frame1.add(frame2)

Unnamed: 0,ball,mug,paper,pen,pencil
blue,6.0,,,6.0,
green,,,,,
red,,,,,
white,20.0,,,20.0,
yellow,19.0,,,19.0,


In [134]:
# Operations between DataFrame and Series

In [129]:
# 4x4 demo frame (0..15) for the DataFrame-minus-Series example.
frame = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    index=['red', 'blue', 'yellow', 'white'],
    columns=['ball', 'pen', 'pencil', 'paper'],
)

In [130]:
frame

Unnamed: 0,ball,pen,pencil,paper
red,0,1,2,3
blue,4,5,6,7
yellow,8,9,10,11
white,12,13,14,15


In [131]:
ser = pd.Series(np.arange(4), index=['ball','pen','pencil','paper'])

In [132]:
ser

ball      0
pen       1
pencil    2
paper     3
dtype: int32

In [133]:
frame - ser

Unnamed: 0,ball,pen,pencil,paper
red,0,0,0,0
blue,4,4,4,4
yellow,8,8,8,8
white,12,12,12,12


### Function Application and Mapping

In [135]:
#  Functions by Element

In [137]:
# 4x4 demo frame (0..15) used for the function-application examples.
frame = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    index=['red', 'blue', 'yellow', 'white'],
    columns=['ball', 'pen', 'pencil', 'paper'],
)

In [138]:
frame

Unnamed: 0,ball,pen,pencil,paper
red,0,1,2,3
blue,4,5,6,7
yellow,8,9,10,11
white,12,13,14,15


In [139]:
np.sqrt(frame)

Unnamed: 0,ball,pen,pencil,paper
red,0.0,1.0,1.414214,1.732051
blue,2.0,2.236068,2.44949,2.645751
yellow,2.828427,3.0,3.162278,3.316625
white,3.464102,3.605551,3.741657,3.872983


In [140]:
# Functions by Row or Column

In [141]:
# Range of x: its largest value minus its smallest. This is the lambda form;
# the following cell defines the same function with def.
f = lambda x: x.max() - x.min()

In [142]:
def f(x):
    """Return the range of x: its maximum minus its minimum.

    x is any object exposing .max() and .min(); frame.apply passes one
    column (or one row with axis=1) at a time.
    """
    highest = x.max()
    lowest = x.min()
    return highest - lowest

In [143]:
frame.apply(f)

ball      12
pen       12
pencil    12
paper     12
dtype: int32

In [144]:
frame.apply(f, axis=1)

red       3
blue      3
yellow    3
white     3
dtype: int32

In [150]:
# Function returning a series
def f(x):
    """Summarise x as a two-entry Series labelled 'min' and 'max'.

    Note: this rebinds the name f, which earlier cells used for the
    max-minus-min range function.
    """
    lowest = x.min()
    highest = x.max()
    return pd.Series([lowest, highest], index=['min', 'max'])

In [151]:
frame.apply(f)

Unnamed: 0,ball,pen,pencil,paper
min,0,1,2,3
max,12,13,14,15


In [145]:
# Here we can see the real use of the index: the column-wise sums come back as a Series labelled by the column names.
frame.sum()

ball      24
pen       28
pencil    32
paper     36
dtype: int64

In [146]:
frame.mean()

ball      6.0
pen       7.0
pencil    8.0
paper     9.0
dtype: float64

In [147]:
frame.describe()

Unnamed: 0,ball,pen,pencil,paper
count,4.0,4.0,4.0,4.0
mean,6.0,7.0,8.0,9.0
std,5.163978,5.163978,5.163978,5.163978
min,0.0,1.0,2.0,3.0
25%,3.0,4.0,5.0,6.0
50%,6.0,7.0,8.0,9.0
75%,9.0,10.0,11.0,12.0
max,12.0,13.0,14.0,15.0


## Reading and Writing Data
