In [1]:
import numpy as np
import pandas as pd

In [2]:
# Build a Series from a plain list (pandas supplies a default integer index),
# then show the Series followed by its Python type.
sdata = pd.Series(data=[0.25, 0.50, 0.75, 1.0])
print(sdata, type(sdata), sep='\n')

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64
<class 'pandas.core.series.Series'>


In [3]:
# Look up the value stored under index label 1.
sdata[1]

0.5

In [4]:
# A Series exposes its labels through .index and its data through .values.
print(sdata.index, sdata.values, sep='\n')

RangeIndex(start=0, stop=4, step=1)
[0.25 0.5  0.75 1.  ]


In [4]:
# The default index is 0, 1, 2, ...; pass `index` to attach explicit labels instead.
sdata = pd.Series([0.25, 0.50, 0.75, 1.0], index=list('abcd'))
sdata

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [6]:
# Look up a value by its explicit index label.
sdata['b']

0.5

# Series as a specialized dictionary

In [5]:
# Plain dict mapping state name -> population (insertion order is kept).
population_dict = dict([
    ('California', 38332521),
    ('Texas', 26448193),
    ('New York', 19651127),
    ('Florida', 19552860),
    ('Illinois', 12882135),
])
population_dict

{'California': 38332521,
 'Texas': 26448193,
 'New York': 19651127,
 'Florida': 19552860,
 'Illinois': 12882135}

In [7]:
# Second way to create a Series: supply the values and their index labels separately.
myIndex = ['California', 'Texas', 'New York', 'Florida']
state_counts = [38332521, 26448193, 19651127, 19552860]
mySeries = pd.Series(state_counts, index=myIndex)
mySeries

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
dtype: int64

In [10]:
# Convert the dict to a Series: the dict keys become the index labels.

population = pd.Series(population_dict)
print(population, type(population))

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64 <class 'pandas.core.series.Series'>


In [9]:
# Dictionary-style lookup by index label.
population['California']

38332521

In [10]:
# Slicing also works with index labels instead of integer positions.
# Note that a label slice includes the end label ('Florida').
population['Texas': 'Florida']

Texas       26448193
New York    19651127
Florida     19552860
dtype: int64

In [11]:
# sort_values() orders the Series by its data values, not by its index labels.
population.sort_values()

Illinois      12882135
Florida       19552860
New York      19651127
Texas         26448193
California    38332521
dtype: int64

In [12]:
# The index decides the length of the Series: the scalar is repeated for every label.

pd.Series(5, index = [100, 200, 300])

100    5
200    5
300    5
dtype: int64

In [13]:
# With a dict plus an explicit index, only the listed keys are kept,
# and the result follows the order given in `index`.
pd.Series({2:'a', 1:'b', 3:'c'}, index=[3, 2])

3    c
2    a
dtype: object

In [14]:
# Plain dict mapping state name -> land area.
# NOTE(review): 'Texas' is 695692 here but 695662 in the later index-alignment
# example of this notebook — confirm which value is intended.
area_dict = dict([
    ('California', 423967),
    ('Texas', 695692),
    ('New York', 141297),
    ('Florida', 170312),
    ('Illinois', 149995),
])
area_dict

{'California': 423967,
 'Texas': 695692,
 'New York': 141297,
 'Florida': 170312,
 'Illinois': 149995}

In [15]:
# Convert the area dict to a Series indexed by state name.
area = pd.Series(area_dict)
area

California    423967
Texas         695692
New York      141297
Florida       170312
Illinois      149995
dtype: int64

In [16]:
# A Series is 1-D; a DataFrame is 2-D.
# Converting to a DataFrame:
#   several columns -> pd.DataFrame({'column1': ..., 'column2': ...})
#   a single column -> pd.DataFrame(series, columns=['name'])

# Each keyword becomes a column name; the Series supply the row labels.
states = pd.DataFrame(dict(population=population, area=area))
states

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695692
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [23]:
# Row labels of the DataFrame.
states.index

Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')

In [24]:
# Column labels of the DataFrame.
states.columns

Index(['population', 'area'], dtype='object')

In [21]:
# Extract a single column; the result is a Series named after the column.
states['area']

# To extract two columns, pass a list of names: states[['population', 'area']]

California    423967
Texas         695692
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

# Constructing DataFrame objects
- From a single Series object
- From a list of dictionaries
- From a dictionary of Series objects
- From a two-dimensional numpy array

In [31]:
# Confirm the container types: `population` is a Series, `states` a DataFrame.
print(type(population), type(states), sep='\n')

<class 'pandas.core.series.Series'>
<class 'pandas.core.frame.DataFrame'>


In [17]:
# Plain-text view of the Series followed by the DataFrame built from it.
print(population, states, sep='\n')

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64
            population    area
California    38332521  423967
Texas         26448193  695692
New York      19651127  141297
Florida       19552860  170312
Illinois      12882135  149995


In [33]:
# Build a one-column DataFrame from a single Series; `columns` supplies the column name.
population_df = pd.DataFrame(population, columns=['population'])
population_df

Unnamed: 0,population
California,38332521
Texas,26448193
New York,19651127
Florida,19552860
Illinois,12882135


In [22]:
# A list of row dicts: each dict maps column name -> value for one row.
data = [dict(a=i, b=2*i) for i in range(3)]
print(data, type(data))

[{'a': 0, 'b': 0}, {'a': 1, 'b': 2}, {'a': 2, 'b': 4}] <class 'list'>


In [23]:
# With no explicit index, the rows are labelled 0, 1, 2, ...
records_frame = pd.DataFrame(data)
print(records_frame, type(records_frame))

   a  b
0  0  0
1  1  2
2  2  4 <class 'pandas.core.frame.DataFrame'>


In [38]:
# A key that is missing from one of the row dicts becomes NaN in that row,
# so the DataFrame can be built without cleaning the data first.

pd.DataFrame([{'a': 1,'b': 2}, {'b': 3, 'c': 4}])

Unnamed: 0,a,b,c
0,1.0,2,
1,,3,4.0


In [20]:
# In a pandas DataFrame, `columns` labels the columns and `index` labels the rows.
# NOTE: np.random.rand is not seeded here, so the values differ on every run.

pd.DataFrame(np.random.rand(3, 2), 
             columns=['foo', 'bar'], 
             index=['a', 'b', 'c'])

Unnamed: 0,foo,bar
a,0.943786,0.116786
b,0.077776,0.185253
c,0.247583,0.039736


In [41]:
# Create a standalone Index object from a list of integers.
ind = pd.Index([2, 3, 5, 7, 11])
ind

Int64Index([2, 3, 5, 7, 11], dtype='int64')

In [42]:
# An Index is immutable: item assignment raises TypeError.
# The error is caught and printed so this demonstration does not abort
# a "Restart & Run All" of the notebook.
try:
    ind[2] = 33
except TypeError as err:
    print('TypeError:', err)

TypeError: Index does not support mutable operations

# Index as an ordered set

In [21]:
# Two integer indexes that share the labels 3, 5 and 7.
a, b = pd.Index([1, 3, 5, 7, 9]), pd.Index([2, 3, 5, 7, 11])

In [22]:
# Operator form of the set intersection of two indexes.
a & b

# NOTE: pandas (not Python itself) emits a warning for this operator form;
# the method a.intersection(b) is the recommended spelling.

  a & b


Int64Index([3, 5, 7], dtype='int64')

In [23]:
# Labels present in both indexes.
a.intersection(b)

Int64Index([3, 5, 7], dtype='int64')

In [24]:
# Operator form of the set union; a.union(b) is the method spelling of the same operation.
a | b

  a | b


Int64Index([1, 2, 3, 5, 7, 9, 11], dtype='int64')

In [25]:
# Labels present in either index.
a.union(b)

Int64Index([1, 2, 3, 5, 7, 9, 11], dtype='int64')

In [26]:
# Labels that belong to exactly one of the two indexes (symmetric difference), operator form.
a ^ b

  a ^ b


Int64Index([1, 2, 9, 11], dtype='int64')

In [27]:
# Labels that belong to exactly one of the two indexes.
a.symmetric_difference(b)

Int64Index([1, 2, 9, 11], dtype='int64')

# Data indexing and selection
- Series as dictionary

In [31]:
# A Series has index labels but no column labels.
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=list('abcd'))
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [30]:
# Like a dict, a Series has keys(); it returns the index.
data.keys()

Index(['a', 'b', 'c', 'd'], dtype='object')

In [58]:
# Like a dict, a Series offers items(), yielding (index label, value) pairs.
# The pairs are produced lazily, so collect them in a list to display them.

list(data.items())

[('a', 0.25), ('b', 0.5), ('c', 0.75), ('d', 1.0)]

In [59]:
# Assigning to a new label appends an entry, as with a dict (modifies `data` in place).
data['e'] = 1.25 # add a new value directly
data

a    0.25
b    0.50
c    0.75
d    1.00
e    1.25
dtype: float64

In [60]:
data['a':'c'] # slicing by label includes the end label ('c')

a    0.25
b    0.50
c    0.75
dtype: float64

In [61]:
data[2:4] # integer slices are positional and exclude the end position

c    0.75
d    1.00
dtype: float64

In [32]:
# Masking: the combined comparison yields a boolean Series,
# and indexing with it keeps only the entries where it is True.
in_range = (data > 0.3) & (data < 0.8)
data[in_range]

b    0.50
c    0.75
dtype: float64

In [63]:
data[['a', 'd']] # select several entries with a list of index labels

a    0.25
d    1.00
dtype: float64

# indexing with loc and iloc

In [24]:
# Integer labels (1, 3, 5) that differ from the positions (0, 1, 2),
# which makes the difference between loc and iloc visible.
data = pd.Series(list('abc'), index=[1, 3, 5])
data

1    a
3    b
5    c
dtype: object

In [67]:
print(data.loc[1]) # loc: the value whose index label is 1
print(data.iloc[1]) # iloc: the value at position 1 (the second entry)

a
b


In [71]:
data.loc[3] # the value whose index label is 3

'b'

# DataFrame as a dictionary

In [37]:
# State name -> land area, kept as a plain dict this time.
area = dict([
    ('California', 423967),
    ('Texas', 695692),
    ('New York', 141297),
    ('Florida', 170312),
    ('Illinois', 149995),
])

In [38]:
# State name -> population, kept as a plain dict this time.
population = dict([
    ('California', 38332521),
    ('Texas', 26448193),
    ('New York', 19651127),
    ('Florida', 19552860),
    ('Illinois', 12882135),
])

In [39]:
# The DataFrame constructor accepts plain dicts as column data as well as Series.

# Each keyword becomes a column name; the dict keys become the row labels.
data = pd.DataFrame(dict(area=area, pop=population))
data

Unnamed: 0,area,pop
California,423967,38332521
Texas,695692,26448193
New York,141297,19651127
Florida,170312,19552860
Illinois,149995,12882135


In [77]:
# Dictionary-style access, then attribute-style access, to the same column.
print(data['area'], '\n\n')
print(data.area)

# Both calls print the same 'area' column.

California    423967
Texas         695692
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64 


California    423967
Texas         695692
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64


In [78]:
data.area is data['area'] # check that both access styles return the very same object here

True

In [82]:
data.pop is data['pop']
# This is False because attribute access resolves `data.pop` to the built-in
# DataFrame.pop method, not to the 'pop' column.
# Prefer data['pop'] whenever a column name could clash with a DataFrame attribute.

False

In [84]:
# Add a derived column: population divided by area, computed row by row.
data['density'] = data['pop'].div(data['area'])
data

Unnamed: 0,area,pop,density
California,423967,38332521,90.413926
Texas,695692,26448193,38.017101
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


In [85]:
# All values of the DataFrame as a 2-D NumPy array.
data.values

array([[4.23967000e+05, 3.83325210e+07, 9.04139261e+01],
       [6.95692000e+05, 2.64481930e+07, 3.80171010e+01],
       [1.41297000e+05, 1.96511270e+07, 1.39076746e+02],
       [1.70312000e+05, 1.95528600e+07, 1.14806121e+02],
       [1.49995000e+05, 1.28821350e+07, 8.58837628e+01]])

In [89]:
data.iloc[:4, :2]
# positional slicing: rows 0, 1, 2, 3 and columns 0, 1 (end positions excluded)

Unnamed: 0,area,pop
California,423967,38332521
Texas,695692,26448193
New York,141297,19651127
Florida,170312,19552860


In [92]:
# Slicing by position behaves as usual; only label-based slicing includes the end label.
data.loc[:'Florida', : 'density']

Unnamed: 0,area,pop,density
California,423967,38332521,90.413926
Texas,695692,26448193,38.017101
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121


In [93]:
# Rows whose density exceeds 100, restricted to the 'pop' and 'density' columns.
dense_rows = data['density'] > 100
data.loc[dense_rows, ['pop', 'density']]

Unnamed: 0,pop,density
New York,19651127,139.076746
Florida,19552860,114.806121


In [94]:
# Overwrite the cell at row position 0, column position 2 (modifies `data` in place).
data.iloc[0, 2] = 90
data

# California's density has been changed to 90.

Unnamed: 0,area,pop,density
California,423967,38332521,90.0
Texas,695692,26448193,38.017101
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


# Operating on data in pandas

In [43]:
# Seeded generator so the random integers drawn from it are reproducible.
rng = np.random.RandomState(seed=42)
# Four integers drawn from [0, 10).
ser = pd.Series(rng.randint(low=0, high=10, size=4))
ser

0    6
1    3
2    7
3    4
dtype: int32

In [96]:
# 3x4 frame of integers drawn from [0, 10) using the shared `rng`.
df = pd.DataFrame(rng.randint(0, 10, size=(3, 4)), columns=list('abcd'))
df

Unnamed: 0,a,b,c,d
0,6,9,2,6
1,7,4,3,7
2,7,2,5,4


In [97]:
# Applying a NumPy ufunc to a Series returns a Series with the same index.
np.exp(ser)

0     403.428793
1      20.085537
2    1096.633158
3      54.598150
dtype: float64

In [98]:
# Arithmetic and NumPy ufuncs work element-wise on a DataFrame and keep its row/column labels.
np.sin(df * np.pi/4)

Unnamed: 0,a,b,c,d
0,-1.0,0.7071068,1.0,-1.0
1,-0.707107,1.224647e-16,0.707107,-0.7071068
2,-0.707107,1.0,-0.707107,1.224647e-16


# Index alignment in Series

In [99]:
# Land area for three states, used for the index-alignment example.
area_by_state = {'Alaska': 1723337, 'Texas': 695662, 'California': 423967}
area = pd.Series(area_by_state)
area

Alaska        1723337
Texas          695662
California     423967
dtype: int64

In [100]:
# Population for three states, used for the index-alignment example.
population_by_state = {'California': 38332521, 'Texas': 26448193, 'New York': 19651127}
population = pd.Series(population_by_state)
population

California    38332521
Texas         26448193
New York      19651127
dtype: int64

In [101]:
# The two Series do not have the same index labels: the division is aligned by label,
# and a label present in only one of the Series gives NaN.

area / population

Alaska             NaN
California    0.011060
New York           NaN
Texas         0.026303
dtype: float64

In [102]:
# The union of both indexes, i.e. all labels that occur in either Series.
area.index.union(population.index)

Index(['Alaska', 'California', 'New York', 'Texas'], dtype='object')

In [41]:
# Build a DataFrame with one column per Series.
# NOTE(review): the output recorded below lists California..Illinois, which does not
# match the `area`/`population` Series defined just above — it looks like it was
# produced in a different kernel state; re-run the cell to refresh it.
pd.DataFrame({'area': area, 'population': population})

Unnamed: 0,area,population
California,423967,38332521
Texas,695692,26448193
New York,141297,19651127
Florida,170312,19552860
Illinois,149995,12882135


# Index alignment in DataFrame

In [44]:
# 2x2 frame of integers drawn from [0, 10) using the shared `rng`; columns 'x' and 'y'.
a = pd.DataFrame(rng.randint(0, 10, size=(2, 2)), columns=['x', 'y'])
a

Unnamed: 0,x,y
0,6,9
1,2,6


In [9]:
# 3x3 frame of integers drawn from [0, 10) using the shared `rng`; columns 'x', 'y', 'z'.
b = pd.DataFrame(rng.randint(0, 10, size=(3, 3)), columns=['x', 'y', 'z'])
b

Unnamed: 0,x,y,z
0,7,4,3
1,7,7,2
2,5,4,1


In [10]:
a + b
# `b` has column 'z' and row 2, but `a` does not,
# so every cell in column 'z' and in row 2 of the sum is NaN.

Unnamed: 0,x,y,z
0,13.0,13.0,
1,9.0,13.0,
2,,,


In [12]:
a.add(b, fill_value = 0)
# fill_value=0: a cell that is missing from `a` is treated as 0 before the addition,
# so no NaN is left in the result shown here.

Unnamed: 0,x,y,z
0,13.0,13.0,3.0
1,9.0,13.0,2.0
2,5.0,4.0,1.0


In [45]:
# stack() reshapes the frame into a single Series indexed by (row label, column label).
a.stack()

0  x    6
   y    9
1  x    2
   y    6
dtype: int32

In [13]:
# Collapse the DataFrame into a single Series so that one overall mean can be computed.
fill = a.stack().mean() # (6 + 9 + 2 + 6) / 4 = 5.75
print(fill)

# Add `b` to `a` again with DataFrame.add (a pandas method, not NumPy's add).
# Where `a` has no cell (column 'z' and row 2), `fill` is used in place of the missing value.
a.add(b, fill_value = fill)

5.75


Unnamed: 0,x,y,z
0,13.0,13.0,8.75
1,9.0,13.0,7.75
2,10.75,9.75,6.75
