In [1]:
import pandas as pd
import numpy as np

## The Pandas Series Object

In [2]:
data = pd.Series([0.25, 0.5, 0.75, 1.0])
data

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

In [3]:
data.values

array([0.25, 0.5 , 0.75, 1.  ])

In [4]:
data.index

RangeIndex(start=0, stop=4, step=1)

In [5]:
data[1]

0.5

In [8]:
data[0:1]

0    0.25
dtype: float64

### Series as Generalized NumPy array

In [9]:
data = pd.Series([0.25, 0.5, 0.75, 1.0], index = ['a', 'b', 'c', 'd'])
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [10]:
data['b']

0.5

In [11]:
# positional access: use .iloc explicitly, because plain data[1] on a
# string-labelled Series relies on a deprecated integer-position fallback
data.iloc[1]

0.5

### Series as specialized dictionary

In [12]:
# State name -> population count. Passing the mapping to pd.Series turns the
# keys into the index labels, in insertion order.
population_dict = {
    'California': 38332521,
    'Texas': 26448193,
    'New York': 19651127,
    'Florida': 19552860,
    'Illinois': 12882135,
}
population = pd.Series(population_dict)
population

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

In [13]:
population['California']

38332521

In [16]:
# label-based slices include the end label: 'Illinois' is part of the result
population['New York':'Illinois']

New York    19651127
Florida     19552860
Illinois    12882135
dtype: int64

In [17]:
population['Texas':]

Texas       26448193
New York    19651127
Florida     19552860
Illinois    12882135
dtype: int64

In [19]:
population[:'Florida']

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
dtype: int64

### More Series examples 


In [20]:
# a scalar value is repeated for every label in the given index
pd.Series(5, index=[100,200,300])

100    5
200    5
300    5
dtype: int64

## The Pandas DataFrame Object

In [21]:
# State name -> area value; the dict keys become the index labels of the Series.
area_dict = {
    'California': 423967,
    'Texas': 695662,
    'New York': 141297,
    'Florida': 170312,
    'Illinois': 149995,
}
area = pd.Series(area_dict)
area

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
dtype: int64

In [22]:
states = pd.DataFrame({'population':population,
                      'area': area})

states

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [23]:
states.index

Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')

In [24]:
states.columns

Index(['population', 'area'], dtype='object')

## Data Selection in Series

In [25]:
data = pd.Series([0.25, 0.5, 0.75, 1.0], index = ['a', 'b', 'c', 'd'])
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [26]:
# slicing by explicit index (labels): the end label 'c' is included
data['a':'c']

a    0.25
b    0.50
c    0.75
dtype: float64

In [27]:
# slicing by implicit integer index (positions): the stop position 3 is excluded
data[0:3]

a    0.25
b    0.50
c    0.75
dtype: float64

In [28]:
# masking: keep only the values x with 0.3 < x < 0.8
# each comparison is parenthesised because & binds tighter than > and <
data[(data >0.3) & (data <0.8)]

b    0.50
c    0.75
dtype: float64

In [31]:
# assigning to a label that does not exist yet extends the Series with 'e'
data['e'] = 1.25
# fancy indexing: a list of labels returns the entries in the requested order
data[['a', 'e', 'b']]

a    0.25
e    1.25
b    0.50
dtype: float64

## Data Selection in DataFrame

In [32]:
# Two Series that share the same state labels as their index.
area = pd.Series({
    'California': 423967,
    'Texas': 695662,
    'New York': 141297,
    'Florida': 170312,
    'Illinois': 149995,
})
pop = pd.Series({
    'California': 38332521,
    'Texas': 26448193,
    'New York': 19651127,
    'Florida': 19552860,
    'Illinois': 12882135,
})

# Each dict key names a column; the columns appear in insertion order
# ('area' first, then 'pop') and the rows are the shared state labels.
data = pd.DataFrame({'area': area, 'pop': pop})
data

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127
Florida,170312,19552860
Illinois,149995,12882135


In [34]:
data['area']

Unnamed: 0,area
California,423967
Texas,695662
New York,141297
Florida,170312
Illinois,149995


In [35]:
data.area

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [36]:
data.area is data['area']

True

In [37]:
# False: attribute access finds the DataFrame.pop() method, not the 'pop'
# column, so data['pop'] is the reliable way to select that column
data.pop is data['pop']

False

In [38]:
data['density'] = data['pop'] / data['area']
data

Unnamed: 0,area,pop,density
California,423967,38332521,90.413926
Texas,695662,26448193,38.01874
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


In [42]:
# reset_index() moves the state labels into a regular column and gives the
# copy a default integer index
data_copy = data.reset_index()
# .loc slices by label and includes both end points: rows 0 through 2,
# columns from 'pop' through the last column
data_copy.loc[:2, 'pop':]

Unnamed: 0,pop,density
0,38332521,90.413926
1,26448193,38.01874
2,19651127,139.076746


In [40]:
data.iloc[:3, :2]

Unnamed: 0,pop,density
California,38332521,90.413926
Texas,26448193,38.01874
New York,19651127,139.076746


In [43]:
data.loc[data.density >100, ['pop','density']]

Unnamed: 0,pop,density
New York,19651127,139.076746
Florida,19552860,114.806121


In [44]:
data

Unnamed: 0,pop,density
California,38332521,90.413926
Texas,26448193,38.01874
New York,19651127,139.076746
Florida,19552860,114.806121
Illinois,12882135,85.883763


In [53]:
# Set California's density by label rather than by position: data.iloc[0, 1]
# only targets 'density' when the columns are exactly ['pop', 'density'],
# which depends on the order in which earlier cells were run.
data.loc['California', 'density'] = 100
data

Unnamed: 0,pop,density
California,38332521,100.0
Texas,26448193,38.01874
New York,19651127,139.076746
Florida,19552860,114.806121
Illinois,12882135,85.883763


## Missing Data in Pandas

In [54]:
data1 = pd.Series([1, np.nan, 'hello', None])
data1  # mixed int / NaN / str / None values are stored with dtype object

0        1
1      NaN
2    hello
3     None
dtype: object

In [55]:
data1.isnull()

0    False
1     True
2    False
3     True
dtype: bool

In [56]:
data1[data1.notnull()]

0        1
2    hello
dtype: object

### Dropping Null Values

In [57]:
data1.dropna()

0        1
2    hello
dtype: object

In [58]:
df = pd.DataFrame([[1, np.nan, 2],
                   [2, 3, 5],
                   [np.nan, 4, 6]])
df

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [59]:
df.dropna()

Unnamed: 0,0,1,2
1,2.0,3.0,5


In [60]:
# drop every column that contains at least one null value
df.dropna(axis='columns')
# equivalent spelling: df.dropna(axis=1)

Unnamed: 0,2
0,2
1,5
2,6


In [61]:
df[3] = np.nan
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [63]:
# how='all' drops only the columns whose values are all null (here column 3)
df.dropna(axis=1, how='all')

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


### Filling null values

In [64]:
data3 = pd.Series([1, np.nan, 2, None, 3], index=list('abcde'))
data3

a    1.0
b    NaN
c    2.0
d    NaN
e    3.0
dtype: float64

In [65]:
data3.fillna(0)

a    1.0
b    0.0
c    2.0
d    0.0
e    3.0
dtype: float64

In [67]:
# forward-fill: each NaN takes the last valid value that precedes it.
# Series.ffill() replaces fillna(method='ffill'), whose `method` keyword is
# deprecated in recent pandas releases.
data3.ffill()

a    1.0
b    1.0
c    2.0
d    2.0
e    3.0
dtype: float64

## Combining Datasets: Concatenation

In [68]:
def make_df(cols, ind):
    """Quickly make a DataFrame of labelled placeholder strings.

    Parameters
    ----------
    cols : iterable
        Column labels; iterating a string such as 'ABC' yields one column
        per character.
    ind : iterable
        Row labels, used as the index of the result.

    Returns
    -------
    pandas.DataFrame
        Frame whose cell at (row i, column c) holds ``str(c) + str(i)``.
    """
    # `ind` is walked once per column and once more for the index, so a
    # one-shot iterator would be exhausted after the first column.
    # Re-iterable inputs (range, list, ...) are passed through untouched.
    if iter(ind) is ind:
        ind = list(ind)
    data = {c: [str(c) + str(i) for i in ind] for c in cols}
    return pd.DataFrame(data, index=ind)

# example DataFrame
make_df('ABC', range(3))

Unnamed: 0,A,B,C
0,A0,B0,C0
1,A1,B1,C1
2,A2,B2,C2


In [69]:
df1 = make_df('AB', range(1,3))
df2 = make_df('AB', range(3,5))
print(df1); print(df2);

    A   B
1  A1  B1
2  A2  B2
    A   B
3  A3  B3
4  A4  B4


In [70]:
pd.concat([df1, df2])

Unnamed: 0,A,B
1,A1,B1
2,A2,B2
3,A3,B3
4,A4,B4


In [71]:
df3 = make_df('AB', range(0,2))
df4 = make_df('CD', range(0,2))
print(df3); print(df4);

    A   B
0  A0  B0
1  A1  B1
    C   D
0  C0  D0
1  C1  D1


In [72]:
pd.concat([df3, df4], axis=1)

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1


### Concatenation with joins

In [None]:
df5 = make_df('ABC', [1, 2])
df6 = make_df('BCD', [3, 4])
print(df5); print(df6); print(pd.concat([df5, df6]))

In [None]:
print(df5); print(df6);