In [1]:
import numpy as np
import pandas as pd

## 1. DataFrame 创建

In [7]:
## create by list
li = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
indices = ['a', 'b', 'c']
columns = ['aa', 'bb', 'cc']
df = pd.DataFrame(li, index=indices, columns=columns, dtype=float)
df

Unnamed: 0,aa,bb,cc
a,1.0,2.0,3.0
b,4.0,5.0,6.0
c,7.0,8.0,9.0


In [12]:
## create by dict
dic = {'aa':[1,2,3],'bb': [4,5,6], 'cc': [7,8,9]}
indices = ['a', 'b', 'c']
df = pd.DataFrame(dic, index=indices)
df

Unnamed: 0,aa,bb,cc
a,1,4,7
b,2,5,8
c,3,6,9


In [20]:
## create by list of dicts
li = [{'aa': 1, 'bb': 2, 'cc': 3},{'aa': 5, 'bb': 10, 'cc': 20}]
indices = ['a', 'b']
df = pd.DataFrame(li, index=indices)
df

Unnamed: 0,aa,bb,cc
a,1,2,3
b,5,10,20


In [23]:
## create by dict of series
dic = {'one': pd.Series([1, 2, 3],    index=['a', 'b', 'c']),
       'two': pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])}

df = pd.DataFrame(dic)
df

Unnamed: 0,one,two
a,1.0,1
b,2.0,2
c,3.0,3
d,,4


In [70]:
## create by arr
arr = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
indices = ['a', 'b', 'c']
columns = ['aa', 'bb', 'cc']
df = pd.DataFrame(arr, index=indices, columns=columns, dtype=float)
df

Unnamed: 0,aa,bb,cc
a,1.0,2.0,3.0
b,4.0,5.0,6.0
c,7.0,8.0,9.0


## 2. DataFrame operations
### 2.1 Column operations

In [46]:
dic = {'one': pd.Series([1, 2, 3],    index=['a', 'b', 'c']),
       'two': pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])}
df = pd.DataFrame(dic)
df

Unnamed: 0,one,two
a,1.0,1
b,2.0,2
c,3.0,3
d,,4


In [47]:
## selection
print(df['one'])
print(df.one)
print(df[['one', 'two']])

a    1.0
b    2.0
c    3.0
d    NaN
Name: one, dtype: float64
a    1.0
b    2.0
c    3.0
d    NaN
Name: one, dtype: float64
   one  two
a  1.0    1
b  2.0    2
c  3.0    3
d  NaN    4


In [48]:
## addition

df['three']=pd.Series([10, 20, 30, 40])
print(df)

df['three']=pd.Series([10, 20, 30, 40], index=['a','b','c', 'd'])
print(df)

   one  two  three
a  1.0    1    NaN
b  2.0    2    NaN
c  3.0    3    NaN
d  NaN    4    NaN
   one  two  three
a  1.0    1     10
b  2.0    2     20
c  3.0    3     30
d  NaN    4     40


In [49]:
## deletion

del df['one']
print(df)

df.pop('two')
print(df)

   two  three
a    1     10
b    2     20
c    3     30
d    4     40
   three
a     10
b     20
c     30
d     40


### 2.2 Row operations

In [50]:
dic = {'one': pd.Series([1, 2, 3],    index=['a', 'b', 'c']),
       'two': pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])}
df = pd.DataFrame(dic)
df

Unnamed: 0,one,two
a,1.0,1
b,2.0,2
c,3.0,3
d,,4


In [63]:
## selection

print(df.loc['a'])        # select by index name
print(df.loc[['a', 'c']])

print(df.iloc[3])         # select by index location
print(df.iloc[[1, 3]])

# print(df[0])             # 不支持， 此法相当于在按column选
# print(df[0, 3])          # 不支持， 此法相当于在按column选
print(df[1:3])             # index slice

one    1.0
two    1.0
Name: a, dtype: float64
   one  two
a  1.0    1
c  3.0    3
one    NaN
two    4.0
Name: d, dtype: float64
   one  two
b  2.0    2
d  NaN    4
   one  two
b  2.0    2
c  3.0    3


In [67]:
## addition 
df1 = pd.DataFrame([[1, 2], [3, 4]], columns = ['a','b'])
df2 = pd.DataFrame([[5, 6], [7, 8]], columns = ['a','b'])
print(df1)
print(df2)

df1 = df1.append(df2)
print(df1)


   a  b
0  1  2
1  3  4
   a  b
0  5  6
1  7  8
   a  b
0  1  2
1  3  4
0  5  6
1  7  8


In [69]:
## deletion
df = pd.DataFrame([[1, 2], [3, 4]], columns = ['a','b'])
print(df)

df = df.drop(0)
print(df)


   a  b
0  1  2
1  3  4
   a  b
1  3  4


## 3. Basic Functionality

In [71]:
arr = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
indices = ['a', 'b', 'c']
columns = ['aa', 'bb', 'cc']
df = pd.DataFrame(arr, index=indices, columns=columns, dtype=float)
df

Unnamed: 0,aa,bb,cc
a,1.0,2.0,3.0
b,4.0,5.0,6.0
c,7.0,8.0,9.0


In [75]:
print(df.axes)
print(df.index)
print(df.columns)

[Index(['a', 'b', 'c'], dtype='object'), Index(['aa', 'bb', 'cc'], dtype='object')]
Index(['a', 'b', 'c'], dtype='object')
Index(['aa', 'bb', 'cc'], dtype='object')


In [76]:
print(df.dtypes)
print(df.empty)
print(df.ndim)
print(df.shape)
print(df.size)

aa    float64
bb    float64
cc    float64
dtype: object
False
2
(3, 3)
9


In [77]:
print(df.values)
print(df.head(2))
print(df.tail(2))
print(df.T)

[[1. 2. 3.]
 [4. 5. 6.]
 [7. 8. 9.]]
    aa   bb   cc
a  1.0  2.0  3.0
b  4.0  5.0  6.0
    aa   bb   cc
b  4.0  5.0  6.0
c  7.0  8.0  9.0
      a    b    c
aa  1.0  4.0  7.0
bb  2.0  5.0  8.0
cc  3.0  6.0  9.0


## 4. Descriptive_Statistics

In [78]:
d = {'Name':pd.Series(['Tom','James','Ricky','Vin','Steve','Smith','Jack',
   'Lee','David','Gasper','Betina','Andres']),
   'Age':pd.Series([25,26,25,23,30,29,23,34,40,30,51,46]),
   'Rating':pd.Series([4.23,3.24,3.98,2.56,3.20,4.6,3.8,3.78,2.98,4.80,4.10,3.65])}

df = pd.DataFrame(d)
df

Unnamed: 0,Name,Age,Rating
0,Tom,25,4.23
1,James,26,3.24
2,Ricky,25,3.98
3,Vin,23,2.56
4,Steve,30,3.2
5,Smith,29,4.6
6,Jack,23,3.8
7,Lee,34,3.78
8,David,40,2.98
9,Gasper,30,4.8


In [92]:
print(df.sum())      # default is axis=0, add by row, can be written by def.sum(0)
print(df.mean())     # default is axis=0, mean by row, can be written by def.mean(0)
print(df.std())      # default is axis=0, row by row, can be written by def.std(0)

print(df.sum(1))     
print(df.mean(1))     
print(df.std(1))

Name      TomJamesRickyVinSteveSmithJackLeeDavidGasperBe...
Age                                                     382
Rating                                                44.92
dtype: object
Age       31.833333
Rating     3.743333
dtype: float64
Age       9.232682
Rating    0.661628
dtype: float64


In [99]:
print(df.count())        # count how many samples
print(df.median())
print(df.min())
print(df.max())
print(df.prod())
print(df.cumsum())

Name      12
Age       12
Rating    12
dtype: int64
Age       29.50
Rating     3.79
dtype: float64
Name      Andres
Age           23
Rating      2.56
dtype: object
Name      Vin
Age        51
Rating    4.8
dtype: object
Age       7.158408e+17
Rating    6.320128e+06
dtype: float64
                                                 Name  Age Rating
0                                                 Tom   25   4.23
1                                            TomJames   51   7.47
2                                       TomJamesRicky   76  11.45
3                                    TomJamesRickyVin   99  14.01
4                               TomJamesRickyVinSteve  129  17.21
5                          TomJamesRickyVinSteveSmith  158  21.81
6                      TomJamesRickyVinSteveSmithJack  181  25.61
7                   TomJamesRickyVinSteveSmithJackLee  215  29.39
8              TomJamesRickyVinSteveSmithJackLeeDavid  255  32.37
9        TomJamesRickyVinSteveSmithJackLeeDavidGasper  285 

In [100]:
print(df.describe())                  # number: Summarizes Numeric columns. by default, 'number'.
print(df.describe(include='object'))  # object: Summarizes String columns
print(df. describe(include='all'))    # all:Summarizes all columns together (Should not pass it as a list value)

             Age     Rating
count  12.000000  12.000000
mean   31.833333   3.743333
std     9.232682   0.661628
min    23.000000   2.560000
25%    25.000000   3.230000
50%    29.500000   3.790000
75%    35.500000   4.132500
max    51.000000   4.800000
          Name
count       12
unique      12
top     Andres
freq         1
          Name        Age     Rating
count       12  12.000000  12.000000
unique      12        NaN        NaN
top     Andres        NaN        NaN
freq         1        NaN        NaN
mean       NaN  31.833333   3.743333
std        NaN   9.232682   0.661628
min        NaN  23.000000   2.560000
25%        NaN  25.000000   3.230000
50%        NaN  29.500000   3.790000
75%        NaN  35.500000   4.132500
max        NaN  51.000000   4.800000


In [102]:
print(df.info())
print(df.head())
print(df.head(3))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 3 columns):
Name      12 non-null object
Age       12 non-null int64
Rating    12 non-null float64
dtypes: float64(1), int64(1), object(1)
memory usage: 416.0+ bytes
None
    Name  Age  Rating
0    Tom   25    4.23
1  James   26    3.24
2  Ricky   25    3.98
3    Vin   23    2.56
4  Steve   30    3.20
    Name  Age  Rating
0    Tom   25    4.23
1  James   26    3.24
2  Ricky   25    3.98


## 5. Function_Application
 * pipe():      Table wise Function Application  
 * apply():     Row or Column Wise Function Application  
 * applymap():  Element wise Function Application on DataFrame  
 * map():       Element wise Function Application on Series  

In [104]:
df = pd.DataFrame(np.random.randn(5,3),columns=['col1','col2','col3'])
df

Unnamed: 0,col1,col2,col3
0,0.627095,-0.110971,0.596149
1,0.474866,-0.098486,-0.727912
2,-0.530917,-0.048925,-1.492865
3,-0.465561,1.002441,0.736534
4,-0.555285,0.411435,-1.114208


In [106]:
def add(a, b):
    return a + b

df2 = df.pipe(add, 2)
df2

col1   -0.089961
col2    0.231099
col3   -0.400461
dtype: float64

In [119]:
df3 = df.apply(np.sum)
# df3 = df.pipe(np.sum)
# df3 = df.sum()
df3

col1   -0.449803
col2    1.155494
col3   -2.002303
dtype: float64

In [120]:
df3 = df.apply(np.sum, axis=1)
# df3 = df.sum(1)
df3

0    1.112273
1   -0.351532
2   -2.072708
3    1.273414
4   -1.258058
dtype: float64

In [121]:
df3 = df.apply(lambda x: x - x.min())
df3

Unnamed: 0,col1,col2,col3
0,1.18238,0.0,2.089014
1,1.030151,0.012485,0.764953
2,0.024368,0.062046,0.0
3,0.089725,1.113412,2.229399
4,0.0,0.522406,0.378657


In [122]:
df2 = df.copy()

df2['col1'] = df2['col1'].map(lambda x: x * 100)
print(df2)

        col1      col2      col3
0  62.709463 -0.110971  0.596149
1  47.486586 -0.098486 -0.727912
2 -53.091748 -0.048925 -1.492865
3 -46.556061  1.002441  0.736534
4 -55.528530  0.411435 -1.114208


In [123]:
df3 = df.copy()
df3 = df3.applymap(lambda x:x*100)
print(df3)

        col1        col2        col3
0  62.709463  -11.097097   59.614892
1  47.486586   -9.848557  -72.791230
2 -53.091748   -4.892518 -149.286492
3 -46.556061  100.244098   73.653365
4 -55.528530   41.143485 -111.420805
