In [1]:
import numpy as np
import pandas as pd

In [3]:
# Build a one-dimensional Series from a plain Python list.
# No index is given, so pandas supplies the default 0..n-1 integer index.
data = pd.Series(data=[0.5, 1, 2.5, 3])
data

0    0.5
1    1.0
2    2.5
3    3.0
dtype: float64

In [4]:
# Underlying values as a NumPy array. `to_numpy()` is the accessor the pandas
# docs recommend over `.values`; for this float Series the result is the same.
data.to_numpy()

array([0.5, 1. , 2.5, 3. ])

In [5]:
# The index object attached to the Series (here the default integer range).
data.index

RangeIndex(start=0, stop=4, step=1)

In [6]:
# Positional slice (end-exclusive): rows at positions 1 and 2.
# `.iloc` states explicitly that the integers are positions, not labels,
# which plain `[]` leaves ambiguous on an integer-labelled Series.
data.iloc[1:3]

1    1.0
2    2.5
dtype: float64

In [7]:
# Same values, now with explicit string labels 'a'..'d' as the index.
# The positional (implicit) order of the elements is kept as well.
data = pd.Series(
    data=[0.5, 1, 2.5, 3],
    index=list('abcd'),
)
data

a    0.5
b    1.0
c    2.5
d    3.0
dtype: float64

In [8]:
# State name -> population count.
population_dict = {
    'California': 38332521,
    'Texas': 26448193,
    'New York': 19651127,
    'Florida': 19552860,
    'Illinois': 12882135,
}

# A dict maps directly onto a Series: keys become the index, values the data.
population = pd.Series(data=population_dict)
population

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

In [9]:
# Label-based slice: unlike a positional slice, the end label ('Florida')
# is included in the result.
population.loc['Texas':'Florida']

Texas       26448193
New York    19651127
Florida     19552860
dtype: int64

In [11]:
# State name -> area, keyed identically to the population data.
area_dict = {
    'California': 423967,
    'Texas': 695662,
    'New York': 141297,
    'Florida': 170312,
    'Illinois': 149995,
}
area = pd.Series(data=area_dict)
area

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
dtype: int64

In [12]:
# Combine the two Series into a DataFrame, one column per Series.
# Both carry the same state labels, which become the row index.
states = pd.DataFrame(data={
    'population': population,
    'area': area,
})
states

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [13]:
# Row labels of the DataFrame (the state names).
states.index

Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')

In [14]:
# Column labels of the DataFrame; like the row labels, they are an Index object.
states.columns

Index(['population', 'area'], dtype='object')

In [15]:
# Preview only the first three rows rather than the whole frame.
states.head(n=3)

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297


In [16]:
# Dimensions as a (rows, columns) tuple.
states.shape

(5, 2)

In [17]:
# A standalone Index object. Index objects are immutable: item assignment
# such as `index[0] = 9` raises TypeError.
index = pd.Index(data=[1, 3, 5])
index

Int64Index([1, 3, 5], dtype='int64')

In [19]:
# .loc selects by label (the explicit index); the end label is included.
# The trailing `:` keeps every column.
states.loc['Texas':'Florida', :]

Unnamed: 0,population,area
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312


In [21]:
# .iloc selects by integer position (the implicit index); the end position
# is excluded, so this returns rows 1, 2 and 3 with every column.
states.iloc[1:4, :]

Unnamed: 0,population,area
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312


In [27]:
# Rows at positions 1-3, first column only; a single column comes back as a Series.
states.iloc[1:4, 0]

Texas       26448193
New York    19651127
Florida     19552860
Name: population, dtype: int64

In [22]:
# Dictionary-style column access: returns the 'population' column as a Series.
states['population']

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
Name: population, dtype: int64

In [23]:
# Attribute-style column access; here it yields the same Series as states['population'].
states.population # alternate syntax

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
Name: population, dtype: int64

In [24]:
# Add a derived column: element-wise population divided by area.
states['density'] = states['population'].div(states['area'])

In [25]:
# Display the frame again, now including the derived 'density' column.
states

Unnamed: 0,population,area,density
California,38332521,423967,90.413926
Texas,26448193,695662,38.01874
New York,19651127,141297,139.076746
Florida,19552860,170312,114.806121
Illinois,12882135,149995,85.883763


In [28]:
# Boolean-mask row filter: keep states with a population above 20 million.
states.loc[states['population'].gt(20000000)]

Unnamed: 0,population,area,density
California,38332521,423967,90.413926
Texas,26448193,695662,38.01874


In [31]:
# Combine two element-wise conditions with `&` to keep states whose
# population lies strictly between 20 and 30 million.
states.loc[
    states['population'].gt(20000000)
    & states['population'].lt(30000000)
]

Unnamed: 0,population,area,density
Texas,26448193,695662,38.01874


In [36]:
# A Series mixing an int, a float NaN, a string and None.
# With mixed types pandas falls back to the generic `object` dtype.
# (numpy is imported as `np` in the notebook's first cell.)
data = pd.Series([1, np.nan, 'hello', None])
data

0        1
1      NaN
2    hello
3     None
dtype: object

In [39]:
# Boolean mask of missing entries; both NaN and None count as missing.
data.isna()

0    False
1     True
2    False
3     True
dtype: bool

In [40]:
# Keep only the entries that are present by indexing with the inverse mask.
data.loc[data.notna()]

0        1
2    hello
dtype: object

In [41]:
# Return a copy with the missing entries removed (the NaN and the None here).
data.dropna()

0        1
2    hello
dtype: object

In [42]:
# 3x3 frame with two missing cells: row 0 / column 1 and row 2 / column 0.
df = pd.DataFrame(data=[
    [1, np.nan, 2],
    [2, 3, 5],
    [np.nan, 4, 6],
])
df

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [43]:
# Default behaviour spelled out: drop every row containing at least one NaN.
df.dropna(axis='index', how='any')

Unnamed: 0,0,1,2
1,2.0,3.0,5


In [44]:
# Drop every column containing at least one NaN; only column 2 is complete.
df.dropna(axis='columns', how='any')

Unnamed: 0,2
0,2
1,5
2,6


In [45]:
# how='all' drops a column only when every value in it is NaN.
# No column of df qualifies, so the result matches df.
df.dropna(axis='columns', how='all') # drop if the entire column is NaN

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [46]:
# Replace every NaN with a constant (0 here); returns a new frame.
df.fillna(value=0)

Unnamed: 0,0,1,2
0,1.0,0.0,2
1,2.0,3.0,5
2,0.0,4.0,6


In [48]:
# Forward-fill: propagate the last valid value down each column.
# `DataFrame.ffill()` is the dedicated method for this and replaces the
# `method=` argument of `fillna`. A NaN in the first row stays NaN because
# there is no earlier value to copy.
df.ffill()

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,2.0,4.0,6


In [49]:
# A Series of "first last" name strings, used to demonstrate the `.str` accessor.
monte = pd.Series(data=[
    'Graham Chapman',
    'John Cleese',
    'Terry Gilliam',
    'Eric Idle',
    'Terry Jones',
    'Michael Palin',
])

In [52]:
# Vectorized string split applied to every element; with no separator given,
# each name is split on whitespace into a list of its words.
monte.str.split()

0    [Graham, Chapman]
1       [John, Cleese]
2     [Terry, Gilliam]
3         [Eric, Idle]
4       [Terry, Jones]
5     [Michael, Palin]
dtype: object