In [2]:
import pandas as pd
# Show the installed pandas version so readers know which release produced the outputs.
pd.__version__

'0.20.3'

In [3]:
import numpy as np

In [4]:
# Build a Series from a plain list; pandas supplies the default integer index.
data = pd.Series(data=[11, 22, 33, 44, 55])
data

0    11
1    22
2    33
3    44
4    55
dtype: int64

In [5]:
# The Series' underlying values as a NumPy array.
data.values

array([11, 22, 33, 44, 55])

In [7]:
# The index object that holds the row labels.
data.index

RangeIndex(start=0, stop=5, step=1)

In [8]:
# Fetch the element whose index label is 1.
data.loc[1]

22

In [9]:
# Positional slice: rows at positions 1 and 2 (the stop position is excluded).
data.iloc[1:3]

1    22
2    33
dtype: int64

In [11]:
# Same values as before, now labelled with explicit string index entries.
data = pd.Series([11, 22, 33, 44, 55], index=list('abcde'))
data

a    11
b    22
c    33
d    44
e    55
dtype: int64

In [12]:
# Look up a single value by its string label.
data['a']

11

In [13]:
# State populations keyed by state name.
population_dict = {
    'California': 38332521,
    'Texas': 26448193,
    'New York': 19651127,
    'Florida': 19552860,
    'Illinois': 12882135,
}
# How a dict's keys are ordered in the resulting Series depends on the pandas
# version (older releases sort them, newer ones keep insertion order).
# Sort explicitly: the label slice 'California':'Texas' used later only spans
# every state when the index is in alphabetical order.
population = pd.Series(population_dict).sort_index()
population  # index is in alphabetical order

California    38332521
Florida       19552860
Illinois      12882135
New York      19651127
Texas         26448193
dtype: int64

In [20]:
# Label-based slice; both endpoints are included.
population.loc['California':'Texas']

California    38332521
Florida       19552860
Illinois      12882135
New York      19651127
Texas         26448193
dtype: int64

In [19]:
# Positional slice: the second and third entries (the stop position is excluded).
population.iloc[1:3]

Florida     19552860
Illinois    12882135
dtype: int64

In [23]:
# Whether dict keys arrive sorted depends on the pandas version, so sort the
# index explicitly to get the keys in ascending order everywhere.
pd.Series({1: 'a', 3: 'c', 2: 'b'}).sort_index()

1    a
2    b
3    c
dtype: object

In [26]:
# State areas keyed by state name.
area_dict = {
    'California': 423967,
    'Texas': 695662,
    'New York': 141297,
    'Florida': 170312,
    'Illinois': 149995,
}
area = pd.Series(area_dict)

In [30]:
# Combine the two Series into one frame; rows are aligned on the shared
# state-name index. The column order is pinned explicitly because the order
# derived from dict keys varies across pandas versions.
states = pd.DataFrame({'population': population, 'area': area},
                      columns=['area', 'population'])
states

Unnamed: 0,area,population
California,423967,38332521
Florida,170312,19552860
Illinois,149995,12882135
New York,141297,19651127
Texas,695662,26448193


In [31]:
# Row labels of the frame (the state names).
states.index

Index(['California', 'Florida', 'Illinois', 'New York', 'Texas'], dtype='object')

In [32]:
# Column labels of the frame, themselves stored in an Index object.
states.columns

Index(['area', 'population'], dtype='object')

In [35]:
# Dictionary-style access: a column name returns that column as a Series.
states['area']

California    423967
Florida       170312
Illinois      149995
New York      141297
Texas         695662
Name: area, dtype: int64

In [36]:
# Same dictionary-style access for the 'population' column.
states['population']

California    38332521
Florida       19552860
Illinois      12882135
New York      19651127
Texas         26448193
Name: population, dtype: int64

In [38]:
# A DataFrame is a collection of Series objects, so a single-column DataFrame
# can be built from one Series.
pd.DataFrame(data=population, columns=['population'])

Unnamed: 0,population
California,38332521
Florida,19552860
Illinois,12882135
New York,19651127
Texas,26448193


In [39]:
# Keys that are missing from a record are filled with NaN ("Not a Number").
pd.DataFrame(data=[
    {'a': 1, 'b': 2},
    {'b': 3, 'c': 4},
])

Unnamed: 0,a,b,c
0,1.0,2,
1,,3,4.0


In [45]:
# Draw the 4x3 uniform [0, 1) matrix from a dedicated, seeded generator so the
# values are identical on every run and the global NumPy random state is untouched.
randomMatrix = np.random.RandomState(0).rand(4, 3)

In [48]:
# Wrap the 2-D array in a frame with explicit row and column labels.
pd.DataFrame(
    data=randomMatrix,
    index=['1', '2', '3', '4'],
    columns=['1st', '2nd', '3rd'],
)

Unnamed: 0,1st,2nd,3rd
1,0.984403,0.406631,0.166577
2,0.572669,0.38728,0.472555
3,0.45832,0.943291,0.165592
4,0.642826,0.484246,0.51731


In [54]:
# An Index can be created directly from a list of integers.
ind = pd.Index(data=[11, 9, 7, 5, 3, 1])
ind

Int64Index([11, 9, 7, 5, 3, 1], dtype='int64')

In [51]:
# An Index exposes array-like attributes: element count, shape and dtype.
print('{} {} {}'.format(ind.size, ind.shape, ind.dtype))

6 (6,) int64


In [53]:
# ind[1] = 3  # would raise TypeError: an Index is immutable and does not support item assignment

In [55]:
# Two integer indexes that share some elements, used for the set operations that follow.
indA, indB = pd.Index([1, 3, 5, 7, 9]), pd.Index([2, 3, 5, 7, 11])

In [56]:
# Set intersection via the explicit method. The `&` operator form was
# deprecated for Index set operations and, from pandas 2.0, performs an
# element-wise logical operation instead.
indA.intersection(indB)

Int64Index([3, 5, 7], dtype='int64')

In [57]:
# Set union via the explicit method. The `|` operator form was deprecated for
# Index set operations and, from pandas 2.0, performs an element-wise logical
# operation instead.
indA.union(indB)

Int64Index([1, 2, 3, 5, 7, 9, 11], dtype='int64')

In [58]:
# Symmetric difference (elements in exactly one of the two indexes) via the
# explicit method. The `^` operator form was deprecated for Index set
# operations and, from pandas 2.0, performs an element-wise logical operation instead.
indA.symmetric_difference(indB)

Int64Index([1, 2, 9, 11], dtype='int64')

In [70]:
# 4x4 frame holding the integers 0..15, with named rows and columns.
data = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [71]:
# Drop two rows by label; the result is a new frame.
data.drop(labels=['Ohio', 'Utah'], axis='index')

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
New York,12,13,14,15


In [72]:
# Display the frame again: every row is still present, so the drop() call above did not modify it.
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [73]:
# inplace=True removes the column from `data` itself rather than producing a new frame.
data.drop(labels='one', axis='columns', inplace=True)

In [74]:
# Column 'one' is gone: the in-place drop changed `data` directly.
data

Unnamed: 0,two,three,four
Ohio,1,2,3
Colorado,5,6,7
Utah,9,10,11
New York,13,14,15


In [76]:
# Rebuild the full 4x4 frame, since column 'one' was removed in place earlier.
data = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [77]:
# Select a single column by name; the result is a Series.
data['one']

Ohio         0
Colorado     4
Utah         8
New York    12
Name: one, dtype: int64

In [80]:
# Positional row slice: only the first row, returned as a one-row frame.
data.iloc[:1]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3


In [81]:
# Label-based row selection: the 'Ohio' row comes back as a Series.
data.loc['Ohio']

one      0
two      1
three    2
four     3
Name: Ohio, dtype: int64

In [82]:
# Position-based row selection: row 0 is the same 'Ohio' row.
data.iloc[0]

one      0
two      1
three    2
four     3
Name: Ohio, dtype: int64

In [83]:
# Rows whose 'three' value exceeds 5, restricted to the first three columns,
# selected with a single .loc call instead of two chained indexers.
data.loc[data['three'] > 5, data.columns[:3]]

Unnamed: 0,one,two,three
Colorado,4,5,6
Utah,8,9,10
New York,12,13,14


In [86]:
# 3x4 frame of the floats 0.0..11.0 with default integer row and column labels.
df1 = pd.DataFrame(np.arange(12, dtype=float).reshape(3, 4))
df1

Unnamed: 0,0,1,2,3
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


In [87]:
# Adding a scalar is applied to every element.
df1 + 1

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.0,8.0
2,9.0,10.0,11.0,12.0


In [88]:
# Method form of the addition above; it gives the same result as df1 + 1.
df1.add(1)

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.0,8.0
2,9.0,10.0,11.0,12.0


In [89]:
# Reversed-operand addition (1 + df1); the result matches add(1).
df1.radd(1)

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.0,8.0
2,9.0,10.0,11.0,12.0


In [98]:
# Divide every element by 2.
df1 / 2

Unnamed: 0,0,1,2,3
0,0.0,0.5,1.0,1.5
1,2.0,2.5,3.0,3.5
2,4.0,4.5,5.0,5.5


In [100]:
# Reversed division: computes 2 / df1 element-wise, so the 0.0 entry yields inf.
df1.rdiv(2)

Unnamed: 0,0,1,2,3
0,inf,2.0,1.0,0.666667
1,0.5,0.4,0.333333,0.285714
2,0.25,0.222222,0.2,0.181818
