In [67]:
import pandas as pd
import numpy as np
import re

## Create DataFrame

In [68]:
# Toy customer table used throughout the notebook. It is deliberately messy:
# some locations carry trailing spaces or inconsistent casing, and two prices
# are missing, so the cleaning steps further down have something to fix.
df = pd.DataFrame(
    data={
        'id': [1001 + k for k in range(6)],
        'date': pd.date_range('20130102', periods=6),
        'location': [
            'China-Beijing ',
            'China-SH',
            'China-guangzhou ',
            'China-Shenzhen',
            'China-shanghai',
            'China-BEIJING',
        ],
        'age': [23, 44, 54, 32, 34, 32],
        'category': ['100-A', '100-B', '110-A', '110-C', '210-A', '130-F'],
        'price': [1200, np.nan, 2133, 5433, np.nan, 4432],
    }
)

df

Unnamed: 0,id,date,location,age,category,price
0,1001,2013-01-02,China-Beijing,23,100-A,1200.0
1,1002,2013-01-03,China-SH,44,100-B,
2,1003,2013-01-04,China-guangzhou,54,110-A,2133.0
3,1004,2013-01-05,China-Shenzhen,32,110-C,5433.0
4,1005,2013-01-06,China-shanghai,34,210-A,
5,1006,2013-01-07,China-BEIJING,32,130-F,4432.0


In [69]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 6 columns):
id          6 non-null int64
date        6 non-null datetime64[ns]
location    6 non-null object
age         6 non-null int64
category    6 non-null object
price       4 non-null float64
dtypes: datetime64[ns](1), float64(1), int64(2), object(2)
memory usage: 368.0+ bytes


In [70]:
df.columns

Index(['id', 'date', 'location', 'age', 'category', 'price'], dtype='object')

In [71]:
df.columns.values

array(['id', 'date', 'location', 'age', 'category', 'price'], dtype=object)

In [72]:
df.shape

(6, 6)

In [73]:
df.dtypes

id                   int64
date        datetime64[ns]
location            object
age                  int64
category            object
price              float64
dtype: object

In [74]:
df.isnull()

Unnamed: 0,id,date,location,age,category,price
0,False,False,False,False,False,False
1,False,False,False,False,False,True
2,False,False,False,False,False,False
3,False,False,False,False,False,False
4,False,False,False,False,False,True
5,False,False,False,False,False,False


In [75]:
df.isnull().sum()

id          0
date        0
location    0
age         0
category    0
price       2
dtype: int64

In [76]:
df['location'].unique()

array(['China-Beijing ', 'China-SH', 'China-guangzhou ', 'China-Shenzhen',
       'China-shanghai', 'China-BEIJING'], dtype=object)

In [77]:
df.age

0    23
1    44
2    54
3    32
4    34
5    32
Name: age, dtype: int64

In [78]:
df.head(3)

Unnamed: 0,id,date,location,age,category,price
0,1001,2013-01-02,China-Beijing,23,100-A,1200.0
1,1002,2013-01-03,China-SH,44,100-B,
2,1003,2013-01-04,China-guangzhou,54,110-A,2133.0


In [79]:
df.age[3]

32

## Data Transformation

### Drop & Impute missing values

In [80]:
df.dropna()

Unnamed: 0,id,date,location,age,category,price
0,1001,2013-01-02,China-Beijing,23,100-A,1200.0
2,1003,2013-01-04,China-guangzhou,54,110-A,2133.0
3,1004,2013-01-05,China-Shenzhen,32,110-C,5433.0
5,1006,2013-01-07,China-BEIJING,32,130-F,4432.0


In [81]:
df.fillna(0)

Unnamed: 0,id,date,location,age,category,price
0,1001,2013-01-02,China-Beijing,23,100-A,1200.0
1,1002,2013-01-03,China-SH,44,100-B,0.0
2,1003,2013-01-04,China-guangzhou,54,110-A,2133.0
3,1004,2013-01-05,China-Shenzhen,32,110-C,5433.0
4,1005,2013-01-06,China-shanghai,34,210-A,0.0
5,1006,2013-01-07,China-BEIJING,32,130-F,4432.0


In [82]:
# Impute the missing prices with the mean of the observed prices
# (the mean is computed before any value is filled in).
mean_price = df['price'].mean()
df['price'] = df['price'].fillna(mean_price)
df

Unnamed: 0,id,date,location,age,category,price
0,1001,2013-01-02,China-Beijing,23,100-A,1200.0
1,1002,2013-01-03,China-SH,44,100-B,3299.5
2,1003,2013-01-04,China-guangzhou,54,110-A,2133.0
3,1004,2013-01-05,China-Shenzhen,32,110-C,5433.0
4,1005,2013-01-06,China-shanghai,34,210-A,3299.5
5,1006,2013-01-07,China-BEIJING,32,130-F,4432.0


### Unify Column format 

In [83]:
df['location'].values
# need to remove space

array(['China-Beijing ', 'China-SH', 'China-guangzhou ', 'China-Shenzhen',
       'China-shanghai', 'China-BEIJING'], dtype=object)

In [84]:
# Normalise the location labels: trim surrounding whitespace, then lower-case.
# The vectorised `.str` accessor propagates missing values, whereas mapping the
# unbound `str.strip` / `str.lower` raises TypeError on any non-string cell (e.g. NaN).
df['location'] = df['location'].str.strip().str.lower()

### Split Columns

In [85]:
# Split "country-city" labels such as 'china-beijing' into two columns.
# - The pattern is a raw string: the plain literal '\W+' contains an invalid
#   escape sequence and triggers a warning on recent Python versions.
# - One vectorised split feeds both columns instead of running re.split twice per row.
location_parts = df['location'].str.split(r'\W+', expand=True)
df['country'] = location_parts[0]  # text before the first separator
df['city'] = location_parts[1]     # text after the first separator
df

Unnamed: 0,id,date,location,age,category,price,country,city
0,1001,2013-01-02,china-beijing,23,100-A,1200.0,china,beijing
1,1002,2013-01-03,china-sh,44,100-B,3299.5,china,sh
2,1003,2013-01-04,china-guangzhou,54,110-A,2133.0,china,guangzhou
3,1004,2013-01-05,china-shenzhen,32,110-C,5433.0,china,shenzhen
4,1005,2013-01-06,china-shanghai,34,210-A,3299.5,china,shanghai
5,1006,2013-01-07,china-beijing,32,130-F,4432.0,china,beijing


### Change Data type

In [86]:
# Cast price from float to 32-bit integer. The cast truncates the fractional
# part, so the imputed mean 3299.5 becomes 3299 (it is not rounded).
df['price'] = df['price'].astype('int32')

### Drop columns, rows

In [87]:
# drop columns
# Rebind the name instead of mutating in place: `inplace=True` brings no
# speed-up and hides the state change from anyone reading the cell.
df = df.drop(columns='location')

In [88]:
df

Unnamed: 0,id,date,age,category,price,country,city
0,1001,2013-01-02,23,100-A,1200,china,beijing
1,1002,2013-01-03,44,100-B,3299,china,sh
2,1003,2013-01-04,54,110-A,2133,china,guangzhou
3,1004,2013-01-05,32,110-C,5433,china,shenzhen
4,1005,2013-01-06,34,210-A,3299,china,shanghai
5,1006,2013-01-07,32,130-F,4432,china,beijing


In [89]:
# drop rows
df.drop([0,3])

Unnamed: 0,id,date,age,category,price,country,city
1,1002,2013-01-03,44,100-B,3299,china,sh
2,1003,2013-01-04,54,110-A,2133,china,guangzhou
4,1005,2013-01-06,34,210-A,3299,china,shanghai
5,1006,2013-01-07,32,130-F,4432,china,beijing


In [104]:
# drop rows where price<3000
# Look up the index labels of the matching rows, then drop those labels.
# Only the returned copy is displayed; df itself is left untouched.
df.drop(index=df.loc[df['price'] < 3000].index)

Unnamed: 0,id,date,age,category,price,country,city
1,1002,2013-01-03,44,100-B,3299,china,sh
3,1004,2013-01-05,32,110-C,5433,china,shenzhen
4,1005,2013-01-06,34,210-A,3299,china,shanghai
5,1006,2013-01-07,32,130-F,4432,china,beijing


### Rename Columns

In [24]:
df.rename(columns={'category':'category-size'}, inplace=True)
df

Unnamed: 0,id,date,age,category-size,price,country,city
0,1001,2013-01-02,23,100-A,1200,china,beijing
1,1002,2013-01-03,44,100-B,3299,china,sh
2,1003,2013-01-04,54,110-A,2133,china,guangzhou
3,1004,2013-01-05,32,110-C,5433,china,shenzhen
4,1005,2013-01-06,34,210-A,3299,china,shanghai
5,1006,2013-01-07,32,130-F,4432,china,beijing


### Drop duplicate rows

In [25]:
# Keep one row per distinct `city` string, preferring the last occurrence:
# the earlier 'beijing' row (index 0) is removed and row 5 stays.
# 'sh' and 'shanghai' are different strings at this point, so both of those
# rows survive the de-duplication.
df.drop_duplicates(subset=['city'], keep='last', inplace=True)
df

Unnamed: 0,id,date,age,category-size,price,country,city
1,1002,2013-01-03,44,100-B,3299,china,sh
2,1003,2013-01-04,54,110-A,2133,china,guangzhou
3,1004,2013-01-05,32,110-C,5433,china,shenzhen
4,1005,2013-01-06,34,210-A,3299,china,shanghai
5,1006,2013-01-07,32,130-F,4432,china,beijing


### Update values

In [26]:
# Expand the abbreviation 'sh' to 'shanghai'.
# Assign the result back to the column: calling replace(..., inplace=True) on
# `df['city']` mutates an intermediate Series, which is not guaranteed to update
# `df` (it does not under pandas Copy-on-Write).
df['city'] = df['city'].replace('sh', 'shanghai')
df

Unnamed: 0,id,date,age,category-size,price,country,city
1,1002,2013-01-03,44,100-B,3299,china,shanghai
2,1003,2013-01-04,54,110-A,2133,china,guangzhou
3,1004,2013-01-05,32,110-C,5433,china,shenzhen
4,1005,2013-01-06,34,210-A,3299,china,shanghai
5,1006,2013-01-07,32,130-F,4432,china,beijing


### Merge df, df1

In [27]:
# Second table keyed by `id` (1001-1008): gender, pay flag and member points.
# It is the right-hand side of the merge on `id` that follows.
df1 = pd.DataFrame(
    data={
        'id': [1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008],
        'gender': ['male', 'female'] * 4,
        'pay': list('YNYYNYNY'),
        'm-point': [10, 12, 20, 40, 40, 40, 30, 20],
    }
)
df1

Unnamed: 0,id,gender,pay,m-point
0,1001,male,Y,10
1,1002,female,N,12
2,1003,male,Y,20
3,1004,female,Y,40
4,1005,male,N,40
5,1006,female,Y,40
6,1007,male,N,30
7,1008,female,Y,20


In [28]:
# Inner join on `id`: only ids present in both df and df1 are kept,
# with df's columns first and df1's appended after them.
df_inner = df.merge(df1, on='id', how='inner')
df_inner

Unnamed: 0,id,date,age,category-size,price,country,city,gender,pay,m-point
0,1002,2013-01-03,44,100-B,3299,china,shanghai,female,N,12
1,1003,2013-01-04,54,110-A,2133,china,guangzhou,male,Y,20
2,1004,2013-01-05,32,110-C,5433,china,shenzhen,female,Y,40
3,1005,2013-01-06,34,210-A,3299,china,shanghai,male,N,40
4,1006,2013-01-07,32,130-F,4432,china,beijing,female,Y,40


### Set Index

In [29]:
# NOTE: set_index returns a new DataFrame; the result is neither assigned nor
# requested in place, so df_inner itself keeps its default integer index (as the
# display below shows). Later cells still use `id` as a regular column.
df_inner.set_index('id')
df_inner

Unnamed: 0,id,date,age,category-size,price,country,city,gender,pay,m-point
0,1002,2013-01-03,44,100-B,3299,china,shanghai,female,N,12
1,1003,2013-01-04,54,110-A,2133,china,guangzhou,male,Y,20
2,1004,2013-01-05,32,110-C,5433,china,shenzhen,female,Y,40
3,1005,2013-01-06,34,210-A,3299,china,shanghai,male,N,40
4,1006,2013-01-07,32,130-F,4432,china,beijing,female,Y,40


### Sort  

In [30]:
df_inner.sort_values(by='age')

Unnamed: 0,id,date,age,category-size,price,country,city,gender,pay,m-point
2,1004,2013-01-05,32,110-C,5433,china,shenzhen,female,Y,40
4,1006,2013-01-07,32,130-F,4432,china,beijing,female,Y,40
3,1005,2013-01-06,34,210-A,3299,china,shanghai,male,N,40
0,1002,2013-01-03,44,100-B,3299,china,shanghai,female,N,12
1,1003,2013-01-04,54,110-A,2133,china,guangzhou,male,Y,20


In [31]:
df

Unnamed: 0,id,date,age,category-size,price,country,city
1,1002,2013-01-03,44,100-B,3299,china,shanghai
2,1003,2013-01-04,54,110-A,2133,china,guangzhou
3,1004,2013-01-05,32,110-C,5433,china,shenzhen
4,1005,2013-01-06,34,210-A,3299,china,shanghai
5,1006,2013-01-07,32,130-F,4432,china,beijing


### Add New Columns Based on Conditions

In [32]:
# way 1
df_inner['group'] = np.where(df_inner.price>3000, 'high', 'low')
df_inner

Unnamed: 0,id,date,age,category-size,price,country,city,gender,pay,m-point,group
0,1002,2013-01-03,44,100-B,3299,china,shanghai,female,N,12,high
1,1003,2013-01-04,54,110-A,2133,china,guangzhou,male,Y,20,low
2,1004,2013-01-05,32,110-C,5433,china,shenzhen,female,Y,40,high
3,1005,2013-01-06,34,210-A,3299,china,shanghai,male,N,40,high
4,1006,2013-01-07,32,130-F,4432,china,beijing,female,Y,40,high


In [33]:
# way 2
df_inner['group_1'] = df_inner.price.apply(lambda x: 'high' if x>3000 else 'low')
df_inner

Unnamed: 0,id,date,age,category-size,price,country,city,gender,pay,m-point,group,group_1
0,1002,2013-01-03,44,100-B,3299,china,shanghai,female,N,12,high,high
1,1003,2013-01-04,54,110-A,2133,china,guangzhou,male,Y,20,low,low
2,1004,2013-01-05,32,110-C,5433,china,shenzhen,female,Y,40,high,high
3,1005,2013-01-06,34,210-A,3299,china,shanghai,male,N,40,high,high
4,1006,2013-01-07,32,130-F,4432,china,beijing,female,Y,40,high,high


In [34]:
# way 3
df_inner.loc[df_inner.price>3000, 'group_2'] = 'high'
df_inner.loc[df_inner.price<=3000, 'group_2'] = 'low'
df_inner

Unnamed: 0,id,date,age,category-size,price,country,city,gender,pay,m-point,group,group_1,group_2
0,1002,2013-01-03,44,100-B,3299,china,shanghai,female,N,12,high,high,high
1,1003,2013-01-04,54,110-A,2133,china,guangzhou,male,Y,20,low,low,low
2,1004,2013-01-05,32,110-C,5433,china,shenzhen,female,Y,40,high,high,high
3,1005,2013-01-06,34,210-A,3299,china,shanghai,male,N,40,high,high,high
4,1006,2013-01-07,32,130-F,4432,china,beijing,female,Y,40,high,high,high


In [35]:
# Split "category-size" codes such as '100-B' into the category code and the size letter.
# - The pattern is a raw string: the plain literal '\W+' contains an invalid
#   escape sequence and triggers a warning on recent Python versions.
# - One vectorised split feeds both columns instead of running re.split twice per row.
category_parts = df_inner['category-size'].str.split(r'\W+', expand=True)
df_inner['category'] = category_parts[0]  # part before the separator, kept as a string
df_inner['size'] = category_parts[1]      # part after the separator
df_inner

Unnamed: 0,id,date,age,category-size,price,country,city,gender,pay,m-point,group,group_1,group_2,category,size
0,1002,2013-01-03,44,100-B,3299,china,shanghai,female,N,12,high,high,high,100,B
1,1003,2013-01-04,54,110-A,2133,china,guangzhou,male,Y,20,low,low,low,110,A
2,1004,2013-01-05,32,110-C,5433,china,shenzhen,female,Y,40,high,high,high,110,C
3,1005,2013-01-06,34,210-A,3299,china,shanghai,male,N,40,high,high,high,210,A
4,1006,2013-01-07,32,130-F,4432,china,beijing,female,Y,40,high,high,high,130,F


In [36]:
df_inner.sort_values(by='age', inplace=True)
df_inner

Unnamed: 0,id,date,age,category-size,price,country,city,gender,pay,m-point,group,group_1,group_2,category,size
2,1004,2013-01-05,32,110-C,5433,china,shenzhen,female,Y,40,high,high,high,110,C
4,1006,2013-01-07,32,130-F,4432,china,beijing,female,Y,40,high,high,high,130,F
3,1005,2013-01-06,34,210-A,3299,china,shanghai,male,N,40,high,high,high,210,A
0,1002,2013-01-03,44,100-B,3299,china,shanghai,female,N,12,high,high,high,100,B
1,1003,2013-01-04,54,110-A,2133,china,guangzhou,male,Y,20,low,low,low,110,A


In [37]:
# .loc slices by index *label*, endpoints included. After the sort by age the
# labels run 2, 4, 3, 0, 1, so 2:3 returns every row from label 2 through label 3
# in their current order: three rows here, including label 4.
df_inner.loc[2:3]

Unnamed: 0,id,date,age,category-size,price,country,city,gender,pay,m-point,group,group_1,group_2,category,size
2,1004,2013-01-05,32,110-C,5433,china,shenzhen,female,Y,40,high,high,high,110,C
4,1006,2013-01-07,32,130-F,4432,china,beijing,female,Y,40,high,high,high,130,F
3,1005,2013-01-06,34,210-A,3299,china,shanghai,male,N,40,high,high,high,210,A


In [38]:
df_inner.reset_index()
# reset_index: generates a new sequential index; the previous index labels become a column named 'index'.
# The result is only displayed here, not assigned, so df_inner keeps its current index.

Unnamed: 0,index,id,date,age,category-size,price,country,city,gender,pay,m-point,group,group_1,group_2,category,size
0,2,1004,2013-01-05,32,110-C,5433,china,shenzhen,female,Y,40,high,high,high,110,C
1,4,1006,2013-01-07,32,130-F,4432,china,beijing,female,Y,40,high,high,high,130,F
2,3,1005,2013-01-06,34,210-A,3299,china,shanghai,male,N,40,high,high,high,210,A
3,0,1002,2013-01-03,44,100-B,3299,china,shanghai,female,N,12,high,high,high,100,B
4,1,1003,2013-01-04,54,110-A,2133,china,guangzhou,male,Y,20,low,low,low,110,A


In [39]:
df_inner.set_index('date', inplace=True)

In [40]:
df_inner

Unnamed: 0_level_0,id,age,category-size,price,country,city,gender,pay,m-point,group,group_1,group_2,category,size
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2013-01-05,1004,32,110-C,5433,china,shenzhen,female,Y,40,high,high,high,110,C
2013-01-07,1006,32,130-F,4432,china,beijing,female,Y,40,high,high,high,130,F
2013-01-06,1005,34,210-A,3299,china,shanghai,male,N,40,high,high,high,210,A
2013-01-03,1002,44,100-B,3299,china,shanghai,female,N,12,high,high,high,100,B
2013-01-04,1003,54,110-A,2133,china,guangzhou,male,Y,20,low,low,low,110,A


In [106]:
# CAUTION: .loc with a single key addresses a *row* label. This appends a row
# labelled 'first_hand' holding 'yes' in every column (see the display below);
# it does not create a column.
# NOTE(review): mixing the string 'yes' into the numeric columns presumably
# changes their dtype to object; confirm before relying on dtypes downstream.
df_inner.loc['first_hand']='yes'
df_inner

Unnamed: 0_level_0,id,age,category-size,price,country,city,gender,pay,m-point,group,group_1,group_2,category,size
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2013-01-05 00:00:00,1004,32,110-C,5433,china,shenzhen,female,Y,40,high,high,high,110,C
2013-01-07 00:00:00,1006,32,130-F,4432,china,beijing,female,Y,40,high,high,high,130,F
2013-01-06 00:00:00,1005,34,210-A,3299,china,shanghai,male,N,40,high,high,high,210,A
2013-01-03 00:00:00,1002,44,100-B,3299,china,shanghai,female,N,12,high,high,high,100,B
2013-01-04 00:00:00,1003,54,110-A,2133,china,guangzhou,male,Y,20,low,low,low,110,A
first_hand,yes,yes,yes,yes,yes,yes,yes,yes,yes,yes,yes,yes,yes,yes


In [107]:
df_inner.drop('first_hand', inplace=True)

In [108]:
df_inner

Unnamed: 0_level_0,id,age,category-size,price,country,city,gender,pay,m-point,group,group_1,group_2,category,size
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2013-01-05 00:00:00,1004,32,110-C,5433,china,shenzhen,female,Y,40,high,high,high,110,C
2013-01-07 00:00:00,1006,32,130-F,4432,china,beijing,female,Y,40,high,high,high,130,F
2013-01-06 00:00:00,1005,34,210-A,3299,china,shanghai,male,N,40,high,high,high,210,A
2013-01-03 00:00:00,1002,44,100-B,3299,china,shanghai,female,N,12,high,high,high,100,B
2013-01-04 00:00:00,1003,54,110-A,2133,china,guangzhou,male,Y,20,low,low,low,110,A


In [109]:
# With the row slice `:` in first position, 'first_hand' is read as a *column*
# label: a new column filled with 'yes' is added to every row.
df_inner.loc[:, 'first_hand']='yes'
df_inner

Unnamed: 0_level_0,id,age,category-size,price,country,city,gender,pay,m-point,group,group_1,group_2,category,size,first_hand
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2013-01-05 00:00:00,1004,32,110-C,5433,china,shenzhen,female,Y,40,high,high,high,110,C,yes
2013-01-07 00:00:00,1006,32,130-F,4432,china,beijing,female,Y,40,high,high,high,130,F,yes
2013-01-06 00:00:00,1005,34,210-A,3299,china,shanghai,male,N,40,high,high,high,210,A,yes
2013-01-03 00:00:00,1002,44,100-B,3299,china,shanghai,female,N,12,high,high,high,100,B,yes
2013-01-04 00:00:00,1003,54,110-A,2133,china,guangzhou,male,Y,20,low,low,low,110,A,yes


### Access Data

In [41]:
df_inner.iloc[:3,:2]

Unnamed: 0_level_0,id,age
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2013-01-05,1004,32
2013-01-07,1006,32
2013-01-06,1005,34


In [42]:
df_inner.iloc[[0,2,4], [3,6]]

Unnamed: 0_level_0,price,gender
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2013-01-05,5433,female
2013-01-06,3299,male
2013-01-04,2133,male


In [43]:
df_inner['city'].isin(['beijing'])

date
2013-01-05    False
2013-01-07     True
2013-01-06    False
2013-01-03    False
2013-01-04    False
Name: city, dtype: bool

### Filtering

In [44]:
df_inner.loc[(df_inner['city']=='beijing')|(df_inner['city']=='shanghai')]

Unnamed: 0_level_0,id,age,category-size,price,country,city,gender,pay,m-point,group,group_1,group_2,category,size
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2013-01-07,1006,32,130-F,4432,china,beijing,female,Y,40,high,high,high,130,F
2013-01-06,1005,34,210-A,3299,china,shanghai,male,N,40,high,high,high,210,A
2013-01-03,1002,44,100-B,3299,china,shanghai,female,N,12,high,high,high,100,B


In [45]:
df_inner.query("city=='beijing' or city=='shanghai'")

Unnamed: 0_level_0,id,age,category-size,price,country,city,gender,pay,m-point,group,group_1,group_2,category,size
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2013-01-07,1006,32,130-F,4432,china,beijing,female,Y,40,high,high,high,130,F
2013-01-06,1005,34,210-A,3299,china,shanghai,male,N,40,high,high,high,210,A
2013-01-03,1002,44,100-B,3299,china,shanghai,female,N,12,high,high,high,100,B


In [46]:
df_inner[(df_inner['city']=='beijing')|(df_inner['city']=='shanghai')]

Unnamed: 0_level_0,id,age,category-size,price,country,city,gender,pay,m-point,group,group_1,group_2,category,size
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2013-01-07,1006,32,130-F,4432,china,beijing,female,Y,40,high,high,high,130,F
2013-01-06,1005,34,210-A,3299,china,shanghai,male,N,40,high,high,high,210,A
2013-01-03,1002,44,100-B,3299,china,shanghai,female,N,12,high,high,high,100,B


In [47]:
df_inner.loc[df_inner.city.isin(['beijing','shanghai'])]

Unnamed: 0_level_0,id,age,category-size,price,country,city,gender,pay,m-point,group,group_1,group_2,category,size
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2013-01-07,1006,32,130-F,4432,china,beijing,female,Y,40,high,high,high,130,F
2013-01-06,1005,34,210-A,3299,china,shanghai,male,N,40,high,high,high,210,A
2013-01-03,1002,44,100-B,3299,china,shanghai,female,N,12,high,high,high,100,B


In [48]:
df_inner.query("city==['beijing','shanghai']")

Unnamed: 0_level_0,id,age,category-size,price,country,city,gender,pay,m-point,group,group_1,group_2,category,size
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2013-01-07,1006,32,130-F,4432,china,beijing,female,Y,40,high,high,high,130,F
2013-01-06,1005,34,210-A,3299,china,shanghai,male,N,40,high,high,high,210,A
2013-01-03,1002,44,100-B,3299,china,shanghai,female,N,12,high,high,high,100,B


In [49]:
df_inner[(df_inner.city=='beijing')|(df_inner.city=='shanghai')].price.sum()

11030

In [50]:
df_inner[(df_inner.city=='beijing')|(df_inner.city=='shanghai')].price.mean()

3676.6666666666665

In [51]:
df_inner[(df_inner.city=='beijing')|(df_inner.city=='shanghai')].price.median()

3299.0

In [52]:
df_inner.loc[(df_inner.age>25)|(df_inner.city=='beijing'),['id','city','age','category','gender']]

Unnamed: 0_level_0,id,city,age,category,gender
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2013-01-05,1004,shenzhen,32,110,female
2013-01-07,1006,beijing,32,130,female
2013-01-06,1005,shanghai,34,210,male
2013-01-03,1002,shanghai,44,100,female
2013-01-04,1003,guangzhou,54,110,male


In [53]:
df_inner.loc[(df_inner.city!='beijing'),['id','city','age','category','gender']]

Unnamed: 0_level_0,id,city,age,category,gender
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2013-01-05,1004,shenzhen,32,110,female
2013-01-06,1005,shanghai,34,210,male
2013-01-03,1002,shanghai,44,100,female
2013-01-04,1003,guangzhou,54,110,male


In [54]:
df_inner.query("city!='beijing'")[['id','city','age','category','gender']]

Unnamed: 0_level_0,id,city,age,category,gender
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2013-01-05,1004,shenzhen,32,110,female
2013-01-06,1005,shanghai,34,210,male
2013-01-03,1002,shanghai,44,100,female
2013-01-04,1003,guangzhou,54,110,male


In [55]:
df_inner[df_inner.city!='beijing'][['id','city','age','category','gender']]

Unnamed: 0_level_0,id,city,age,category,gender
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2013-01-05,1004,shenzhen,32,110,female
2013-01-06,1005,shanghai,34,210,male
2013-01-03,1002,shanghai,44,100,female
2013-01-04,1003,guangzhou,54,110,male


In [56]:
df_inner.loc[(df_inner.city!='beijing'),['id','city','age','category','gender']].sort_values(by='id')

Unnamed: 0_level_0,id,city,age,category,gender
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2013-01-03,1002,shanghai,44,100,female
2013-01-04,1003,guangzhou,54,110,male
2013-01-05,1004,shenzhen,32,110,female
2013-01-06,1005,shanghai,34,210,male


In [57]:
df_inner[['id','city']]

Unnamed: 0_level_0,id,city
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2013-01-05,1004,shenzhen
2013-01-07,1006,beijing
2013-01-06,1005,shanghai
2013-01-03,1002,shanghai
2013-01-04,1003,guangzhou


### Groupby & Aggregation

In [58]:
df_inner.groupby('city').size()

city
beijing      1
guangzhou    1
shanghai     2
shenzhen     1
dtype: int64

In [59]:
df_inner.groupby('city').count()

Unnamed: 0_level_0,id,age,category-size,price,country,gender,pay,m-point,group,group_1,group_2,category,size
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
beijing,1,1,1,1,1,1,1,1,1,1,1,1,1
guangzhou,1,1,1,1,1,1,1,1,1,1,1,1,1
shanghai,2,2,2,2,2,2,2,2,2,2,2,2,2
shenzhen,1,1,1,1,1,1,1,1,1,1,1,1,1


In [60]:
df_inner.groupby('city').count()['price']

city
beijing      1
guangzhou    1
shanghai     2
shenzhen     1
Name: price, dtype: int64

In [61]:
df_inner.groupby(['city','size']).size()

city       size
beijing    F       1
guangzhou  A       1
shanghai   A       1
           B       1
shenzhen   C       1
dtype: int64

In [62]:
df_inner.groupby(['city','size']).size().to_frame('count')

Unnamed: 0_level_0,Unnamed: 1_level_0,count
city,size,Unnamed: 2_level_1
beijing,F,1
guangzhou,A,1
shanghai,A,1
shanghai,B,1
shenzhen,C,1


In [63]:
df_inner.groupby(['city','size']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,id,age,category-size,price,country,gender,pay,m-point,group,group_1,group_2,category
city,size,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
beijing,F,1,1,1,1,1,1,1,1,1,1,1,1
guangzhou,A,1,1,1,1,1,1,1,1,1,1,1,1
shanghai,A,1,1,1,1,1,1,1,1,1,1,1,1
shanghai,B,1,1,1,1,1,1,1,1,1,1,1,1
shenzhen,C,1,1,1,1,1,1,1,1,1,1,1,1


In [64]:
df_inner.groupby(['city','size']).count()['price']

city       size
beijing    F       1
guangzhou  A       1
shanghai   A       1
           B       1
shenzhen   C       1
Name: price, dtype: int64

In [65]:
df_inner.groupby(['city','size'])['price'].sum().to_frame('sum')

Unnamed: 0_level_0,Unnamed: 1_level_0,sum
city,size,Unnamed: 2_level_1
beijing,F,4432
guangzhou,A,2133
shanghai,A,3299
shanghai,B,3299
shenzhen,C,5433


In [66]:
df_inner.groupby(['city','size'])['price'].mean().to_frame('mean')

Unnamed: 0_level_0,Unnamed: 1_level_0,mean
city,size,Unnamed: 2_level_1
beijing,F,4432
guangzhou,A,2133
shanghai,A,3299
shanghai,B,3299
shenzhen,C,5433


In [180]:
# Row count, mean and total price per (city, size) group.
# String aliases replace np.mean / np.sum: passing NumPy callables to
# aggregate() is deprecated in recent pandas, and the aliases yield the same
# column names ('mean', 'sum'). `len` is kept so the first column stays 'len'.
df_inner.groupby(['city','size'])['price'].aggregate([len, 'mean', 'sum'])

Unnamed: 0_level_0,Unnamed: 1_level_0,len,mean,sum
city,size,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
beijing,F,1,4432,4432
guangzhou,A,1,2133,2133
shanghai,A,1,3299,3299
shanghai,B,1,3299,3299
shenzhen,C,1,5433,5433


In [181]:
# Mean price per (city, size) group, returned as a one-column frame named 'mean'.
# The string alias replaces np.mean, whose use inside aggregate() is deprecated
# in recent pandas; the resulting column name is unchanged.
df_inner.groupby(['city','size'])['price'].aggregate(['mean'])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean
city,size,Unnamed: 2_level_1
beijing,F,4432
guangzhou,A,2133
shanghai,A,3299
shanghai,B,3299
shenzhen,C,5433


### Pivot Table

In [183]:
df_inner.pivot_table(index='city', values='price', aggfunc='mean')

Unnamed: 0_level_0,price
city,Unnamed: 1_level_1
beijing,4432
guangzhou,2133
shanghai,3299
shenzhen,5433


In [185]:
# groupby equivalent of the pivot table on city: mean price per city, in a column named 'mean'.
# The string alias replaces np.mean, whose use inside aggregate() is deprecated
# in recent pandas; the resulting column name is unchanged.
df_inner.groupby('city')['price'].aggregate(['mean'])

Unnamed: 0_level_0,mean
city,Unnamed: 1_level_1
beijing,4432
guangzhou,2133
shanghai,3299
shenzhen,5433


In [188]:
# Small long-format table for the pivot and sampling examples:
# A, B and C are string keys; D and E are integer measures.
df_pivot = pd.DataFrame(
    [
        ("foo", "one", "small", 1, 2),
        ("foo", "one", "large", 2, 4),
        ("foo", "one", "large", 2, 5),
        ("foo", "two", "small", 3, 5),
        ("foo", "two", "small", 3, 6),
        ("bar", "one", "large", 4, 6),
        ("bar", "one", "small", 5, 8),
        ("bar", "two", "small", 6, 9),
        ("bar", "two", "large", 7, 9),
    ],
    columns=["A", "B", "C", "D", "E"],
)
df_pivot

Unnamed: 0,A,B,C,D,E
0,foo,one,small,1,2
1,foo,one,large,2,4
2,foo,one,large,2,5
3,foo,two,small,3,5
4,foo,two,small,3,6
5,bar,one,large,4,6
6,bar,one,small,5,8
7,bar,two,small,6,9
8,bar,two,large,7,9


In [189]:
df_pivot.pivot_table(index=['A','B'], columns=['C'], aggfunc='sum')

Unnamed: 0_level_0,Unnamed: 1_level_0,D,D,E,E
Unnamed: 0_level_1,C,large,small,large,small
A,B,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
bar,one,4.0,5.0,6.0,8.0
bar,two,7.0,6.0,9.0,9.0
foo,one,4.0,1.0,9.0,2.0
foo,two,,6.0,,11.0


In [193]:
# Same pivot as the string form aggfunc='sum', passing the NumPy callable instead.
# NOTE(review): recent pandas releases emit a FutureWarning when np.sum is passed
# as aggfunc and recommend the string alias 'sum'; confirm against the installed version.
df_pivot.pivot_table(index=['A','B'], columns=['C'], aggfunc=np.sum)

Unnamed: 0_level_0,Unnamed: 1_level_0,D,D,E,E
Unnamed: 0_level_1,C,large,small,large,small
A,B,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
bar,one,4.0,5.0,6.0,8.0
bar,two,7.0,6.0,9.0,9.0
foo,one,4.0,1.0,9.0,2.0
foo,two,,6.0,,11.0


In [190]:
df_pivot.pivot_table(values=['D'], index=['A','B'], columns=['C'], aggfunc='sum')

Unnamed: 0_level_0,Unnamed: 1_level_0,D,D
Unnamed: 0_level_1,C,large,small
A,B,Unnamed: 2_level_2,Unnamed: 3_level_2
bar,one,4.0,5.0
bar,two,7.0,6.0
foo,one,4.0,1.0
foo,two,,6.0


In [205]:
df_pivot.pivot_table(values=['D'], index=['A','B'], columns=['C'], aggfunc='sum').T

Unnamed: 0_level_0,A,bar,bar,foo,foo
Unnamed: 0_level_1,B,one,two,one,two
Unnamed: 0_level_2,C,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
D,large,4.0,7.0,4.0,
D,small,5.0,6.0,1.0,6.0


In [192]:
# groupby equivalent of the pivot on D: total of D per (A, B, C) combination, in a column named 'sum'.
# The string alias replaces np.sum, whose use inside aggregate() is deprecated
# in recent pandas; the resulting column name is unchanged.
df_pivot.groupby(['A','B','C'])['D'].aggregate(['sum'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sum
A,B,C,Unnamed: 3_level_1
bar,one,large,4
bar,one,small,5
bar,two,large,7
bar,two,small,6
foo,one,large,4
foo,one,small,1
foo,two,small,6


### Sampling

In [195]:
# Draw 4 rows at random without replacement. No random_state is given, so the
# selected rows differ from run to run.
df_pivot.sample(4)

Unnamed: 0,A,B,C,D,E
5,bar,one,large,4,6
6,bar,one,small,5,8
3,foo,two,small,3,5
7,bar,two,small,6,9


In [197]:
df_pivot

Unnamed: 0,A,B,C,D,E
0,foo,one,small,1,2
1,foo,one,large,2,4
2,foo,one,large,2,5
3,foo,two,small,3,5
4,foo,two,small,3,6
5,bar,one,large,4,6
6,bar,one,small,5,8
7,bar,two,small,6,9
8,bar,two,large,7,9


In [200]:
# Per-row sampling weights, one per row of df_pivot. The first five rows (the
# 'foo' rows) have weight 0 and can never be drawn; the weights sum to 1.
weight = [0,0,0,0,0,0.01,0.49,0.2,0.3]
# No random_state is given, so the drawn rows differ from run to run.
df_pivot.sample(n=3, weights=weight)

Unnamed: 0,A,B,C,D,E
6,bar,one,small,5,8
8,bar,two,large,7,9
5,bar,one,large,4,6


In [202]:
# Same weights as in the previous sampling cell: only the last four rows can be drawn.
weight = [0,0,0,0,0,0.01,0.49,0.2,0.3]
df_pivot.sample(n=6, weights=weight, replace=True)
# replace=True: bootstrap sampling, i.e. sampling with replacement, so the same row may be drawn more than once.
# Here n=6 exceeds the four rows with non-zero weight, so repeats are unavoidable.

Unnamed: 0,A,B,C,D,E
8,bar,two,large,7,9
7,bar,two,small,6,9
7,bar,two,small,6,9
6,bar,one,small,5,8
6,bar,one,small,5,8
8,bar,two,large,7,9


In [203]:
df_pivot

Unnamed: 0,A,B,C,D,E
0,foo,one,small,1,2
1,foo,one,large,2,4
2,foo,one,large,2,5
3,foo,two,small,3,5
4,foo,two,small,3,6
5,bar,one,large,4,6
6,bar,one,small,5,8
7,bar,two,small,6,9
8,bar,two,large,7,9


In [204]:
df_pivot.T

Unnamed: 0,0,1,2,3,4,5,6,7,8
A,foo,foo,foo,foo,foo,bar,bar,bar,bar
B,one,one,one,two,two,one,one,two,two
C,small,large,large,small,small,large,small,small,large
D,1,2,2,3,3,4,5,6,7
E,2,4,5,5,6,6,8,9,9


In [206]:
df_pivot.describe()

Unnamed: 0,D,E
count,9.0,9.0
mean,3.666667,6.0
std,2.0,2.345208
min,1.0,2.0
25%,2.0,5.0
50%,3.0,6.0
75%,5.0,8.0
max,7.0,9.0


In [210]:
df_pivot.describe().T.round(2)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
D,9.0,3.67,2.0,1.0,2.0,3.0,5.0,7.0
E,9.0,6.0,2.35,2.0,5.0,6.0,8.0,9.0


In [211]:
# Pairwise correlation of the numeric columns (D and E).
# Numeric columns are selected explicitly: A, B and C hold strings, and
# pandas >= 2.0 no longer drops non-numeric columns silently in DataFrame.corr().
df_pivot.select_dtypes(include='number').corr()

Unnamed: 0,D,E
D,1.0,0.959403
E,0.959403,1.0


In [212]:
# Pairwise correlation of the numeric columns of the merged table.
# Numeric columns are selected explicitly: df_inner also holds string columns
# (city, gender, ...), and pandas >= 2.0 no longer drops non-numeric columns
# silently in DataFrame.corr().
df_inner.select_dtypes(include='number').corr()

Unnamed: 0,id,age,price,m-point
id,1.0,-0.720634,0.431899,0.893685
age,-0.720634,1.0,-0.840497,-0.828906
price,0.431899,-0.840497,1.0,0.643564
m-point,0.893685,-0.828906,0.643564,1.0


## Load Data

In [213]:
df_inner

Unnamed: 0_level_0,id,age,category-size,price,country,city,gender,pay,m-point,group,group_1,group_2,category,size
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2013-01-05,1004,32,110-C,5433,china,shenzhen,female,Y,40,high,high,high,110,C
2013-01-07,1006,32,130-F,4432,china,beijing,female,Y,40,high,high,high,130,F
2013-01-06,1005,34,210-A,3299,china,shanghai,male,N,40,high,high,high,210,A
2013-01-03,1002,44,100-B,3299,china,shanghai,female,N,12,high,high,high,100,B
2013-01-04,1003,54,110-A,2133,china,guangzhou,male,Y,20,low,low,low,110,A


### Write to a csv file

In [218]:
# Write the table to 'df_inner.csv', resolved against the current working directory.
# No index=False is passed, so the index is written to the file as well.
df_inner.to_csv('df_inner.csv')

### Write to an Excel file

In [219]:
df_inner.to_excel('df_inner.xlsx', sheet_name='Sheet_1')