In [7]:
import pandas as pd
import numpy as np

In [11]:
df = pd.DataFrame({
     'id': list(range(1001, 1007)),
     'date': pd.date_range(start='20130102', periods=6),
     'location':['China-Beijing ', 'China-SH', 'China-guangzhou ', 'China-Shenzhen', 'China-shanghai', 'China-BEIJING'],
     'age': [23,44,54,32,34,32],
     'category':['100-A', '100-B', '110-A', '110-C', '210-A', '130-F'],
     'price':[1200, np.nan, 2133, 5433, np.nan, 4432],
     
})

df

Unnamed: 0,id,date,location,age,category,price
0,1001,2013-01-02,China-Beijing,23,100-A,1200.0
1,1002,2013-01-03,China-SH,44,100-B,
2,1003,2013-01-04,China-guangzhou,54,110-A,2133.0
3,1004,2013-01-05,China-Shenzhen,32,110-C,5433.0
4,1005,2013-01-06,China-shanghai,34,210-A,
5,1006,2013-01-07,China-BEIJING,32,130-F,4432.0


### Check datafram 

In [10]:
df.columns

Index(['id', 'date', 'location', 'age', 'category', 'price'], dtype='object')

In [32]:
df.columns.values

array(['id', 'date', 'location', 'age', 'category', 'price'], dtype=object)

In [12]:
df.shape

(6, 6)

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 6 columns):
id          6 non-null int64
date        6 non-null datetime64[ns]
location    6 non-null object
age         6 non-null int64
category    6 non-null object
price       4 non-null float64
dtypes: datetime64[ns](1), float64(1), int64(2), object(2)
memory usage: 368.0+ bytes


In [20]:
df.dtypes

id                   int64
date        datetime64[ns]
location            object
age                  int64
category            object
price              float64
dtype: object

In [21]:
df.isnull()

Unnamed: 0,id,date,location,age,category,price
0,False,False,False,False,False,False
1,False,False,False,False,False,True
2,False,False,False,False,False,False
3,False,False,False,False,False,False
4,False,False,False,False,False,True
5,False,False,False,False,False,False


In [22]:
df.isnull().sum()

id          0
date        0
location    0
age         0
category    0
price       2
dtype: int64

In [24]:
df['location'].unique()

array(['China-Beijing ', 'China-SH', 'China-guangzhou ', 'China-Shenzhen',
       'China-shanghai', 'China-BEIJING'], dtype=object)

In [26]:
df['age'].values

array([23, 44, 54, 32, 34, 32])

In [27]:
df['age']

0    23
1    44
2    54
3    32
4    34
5    32
Name: age, dtype: int64

In [28]:
df['age'][3]

32

In [29]:
df['age'].values[3]

32

In [34]:
df.head(3)

Unnamed: 0,id,date,location,age,category,price
0,1001,2013-01-02,China-Beijing,23,100-A,1200.0
1,1002,2013-01-03,China-SH,44,100-B,
2,1003,2013-01-04,China-guangzhou,54,110-A,2133.0


### Dataframe cleaning 

In [36]:
df.dropna()

Unnamed: 0,id,date,location,age,category,price
0,1001,2013-01-02,China-Beijing,23,100-A,1200.0
2,1003,2013-01-04,China-guangzhou,54,110-A,2133.0
3,1004,2013-01-05,China-Shenzhen,32,110-C,5433.0
5,1006,2013-01-07,China-BEIJING,32,130-F,4432.0


In [37]:
df.fillna(value=0)

Unnamed: 0,id,date,location,age,category,price
0,1001,2013-01-02,China-Beijing,23,100-A,1200.0
1,1002,2013-01-03,China-SH,44,100-B,0.0
2,1003,2013-01-04,China-guangzhou,54,110-A,2133.0
3,1004,2013-01-05,China-Shenzhen,32,110-C,5433.0
4,1005,2013-01-06,China-shanghai,34,210-A,0.0
5,1006,2013-01-07,China-BEIJING,32,130-F,4432.0


In [49]:
df['price'] = df['price'].fillna(df.price.mean())

In [50]:
df

Unnamed: 0,id,date,location,age,category,price
0,1001,2013-01-02,china-beijing,23,100-A,1200.0
1,1002,2013-01-03,china-sh,44,100-B,3299.5
2,1003,2013-01-04,china-guangzhou,54,110-A,2133.0
3,1004,2013-01-05,china-shenzhen,32,110-C,5433.0
4,1005,2013-01-06,china-shanghai,34,210-A,3299.5
5,1006,2013-01-07,china-beijing,32,130-F,4432.0


In [43]:
df['location'] = df['location'].map(str.strip)

In [44]:
df['location'] = df['location'].map(str.lower)

In [45]:
df

Unnamed: 0,id,date,location,age,category,price
0,1001,2013-01-02,china-beijing,23,100-A,1200.0
1,1002,2013-01-03,china-sh,44,100-B,
2,1003,2013-01-04,china-guangzhou,54,110-A,2133.0
3,1004,2013-01-05,china-shenzhen,32,110-C,5433.0
4,1005,2013-01-06,china-shanghai,34,210-A,
5,1006,2013-01-07,china-beijing,32,130-F,4432.0


In [57]:
df['price'] = df['price'].astype('int')

In [58]:
df.dtypes

id                   int64
date        datetime64[ns]
location            object
age                  int64
category            object
price                int64
dtype: object

In [59]:
df

Unnamed: 0,id,date,location,age,category,price
0,1001,2013-01-02,china-beijing,23,100-A,1200
1,1002,2013-01-03,china-sh,44,100-B,3299
2,1003,2013-01-04,china-guangzhou,54,110-A,2133
3,1004,2013-01-05,china-shenzhen,32,110-C,5433
4,1005,2013-01-06,china-shanghai,34,210-A,3299
5,1006,2013-01-07,china-beijing,32,130-F,4432


In [64]:
import re
df['county'] = df['location'].apply(lambda x: re.split('\W+', x)[0])
df['city'] = df['location'].apply(lambda x: re.split('\W+', x)[1])

In [65]:
df

Unnamed: 0,id,date,location,age,category,price,county,city
0,1001,2013-01-02,china-beijing,23,100-A,1200,china,beijing
1,1002,2013-01-03,china-sh,44,100-B,3299,china,sh
2,1003,2013-01-04,china-guangzhou,54,110-A,2133,china,guangzhou
3,1004,2013-01-05,china-shenzhen,32,110-C,5433,china,shenzhen
4,1005,2013-01-06,china-shanghai,34,210-A,3299,china,shanghai
5,1006,2013-01-07,china-beijing,32,130-F,4432,china,beijing


In [69]:
df.drop(columns='location', inplace=True)

In [70]:
df

Unnamed: 0,id,date,age,category,price,county,city
0,1001,2013-01-02,23,100-A,1200,china,beijing
1,1002,2013-01-03,44,100-B,3299,china,sh
2,1003,2013-01-04,54,110-A,2133,china,guangzhou
3,1004,2013-01-05,32,110-C,5433,china,shenzhen
4,1005,2013-01-06,34,210-A,3299,china,shanghai
5,1006,2013-01-07,32,130-F,4432,china,beijing


In [72]:
df.rename(columns={'category':'category-size'}, inplace=True)
df

Unnamed: 0,id,date,age,category-size,price,county,city
0,1001,2013-01-02,23,100-A,1200,china,beijing
1,1002,2013-01-03,44,100-B,3299,china,sh
2,1003,2013-01-04,54,110-A,2133,china,guangzhou
3,1004,2013-01-05,32,110-C,5433,china,shenzhen
4,1005,2013-01-06,34,210-A,3299,china,shanghai
5,1006,2013-01-07,32,130-F,4432,china,beijing


In [77]:
df['city'].drop_duplicates(keep='last')

1           sh
2    guangzhou
3     shenzhen
4     shanghai
5      beijing
Name: city, dtype: object

In [78]:
df['city'].replace('sh', 'shanghai', inplace=True)

In [79]:
df

Unnamed: 0,id,date,age,category-size,price,county,city
0,1001,2013-01-02,23,100-A,1200,china,beijing
1,1002,2013-01-03,44,100-B,3299,china,shanghai
2,1003,2013-01-04,54,110-A,2133,china,guangzhou
3,1004,2013-01-05,32,110-C,5433,china,shenzhen
4,1005,2013-01-06,34,210-A,3299,china,shanghai
5,1006,2013-01-07,32,130-F,4432,china,beijing


In [81]:
df1 = pd.DataFrame({
    'id':list(range(1001,1009)),
    'gender':['male','female','male','female','male','female','male','female'],
    'pay':['Y','N','Y','Y','N','Y','N','Y'],
    'm-point':[10,12,20,40,40,40,30,20]
    })
df1

Unnamed: 0,id,gender,pay,m-point
0,1001,male,Y,10
1,1002,female,N,12
2,1003,male,Y,20
3,1004,female,Y,40
4,1005,male,N,40
5,1006,female,Y,40
6,1007,male,N,30
7,1008,female,Y,20


In [84]:
df_inner = pd.merge(df, df1, how='inner', on='id')

In [85]:
df_inner

Unnamed: 0,id,date,age,category-size,price,county,city,gender,pay,m-point
0,1001,2013-01-02,23,100-A,1200,china,beijing,male,Y,10
1,1002,2013-01-03,44,100-B,3299,china,shanghai,female,N,12
2,1003,2013-01-04,54,110-A,2133,china,guangzhou,male,Y,20
3,1004,2013-01-05,32,110-C,5433,china,shenzhen,female,Y,40
4,1005,2013-01-06,34,210-A,3299,china,shanghai,male,N,40
5,1006,2013-01-07,32,130-F,4432,china,beijing,female,Y,40


In [86]:
df_inner.set_index('id')

Unnamed: 0_level_0,date,age,category-size,price,county,city,gender,pay,m-point
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1001,2013-01-02,23,100-A,1200,china,beijing,male,Y,10
1002,2013-01-03,44,100-B,3299,china,shanghai,female,N,12
1003,2013-01-04,54,110-A,2133,china,guangzhou,male,Y,20
1004,2013-01-05,32,110-C,5433,china,shenzhen,female,Y,40
1005,2013-01-06,34,210-A,3299,china,shanghai,male,N,40
1006,2013-01-07,32,130-F,4432,china,beijing,female,Y,40


In [87]:
df_inner.sort_values(by=['age'])# replace=True

Unnamed: 0,id,date,age,category-size,price,county,city,gender,pay,m-point
0,1001,2013-01-02,23,100-A,1200,china,beijing,male,Y,10
3,1004,2013-01-05,32,110-C,5433,china,shenzhen,female,Y,40
5,1006,2013-01-07,32,130-F,4432,china,beijing,female,Y,40
4,1005,2013-01-06,34,210-A,3299,china,shanghai,male,N,40
1,1002,2013-01-03,44,100-B,3299,china,shanghai,female,N,12
2,1003,2013-01-04,54,110-A,2133,china,guangzhou,male,Y,20


In [89]:
df_inner.sort_index()

Unnamed: 0,id,date,age,category-size,price,county,city,gender,pay,m-point
0,1001,2013-01-02,23,100-A,1200,china,beijing,male,Y,10
1,1002,2013-01-03,44,100-B,3299,china,shanghai,female,N,12
2,1003,2013-01-04,54,110-A,2133,china,guangzhou,male,Y,20
3,1004,2013-01-05,32,110-C,5433,china,shenzhen,female,Y,40
4,1005,2013-01-06,34,210-A,3299,china,shanghai,male,N,40
5,1006,2013-01-07,32,130-F,4432,china,beijing,female,Y,40


In [90]:
df_inner['group'] = np.where(df_inner['price']>3000, 'high', 'low')
df_inner

Unnamed: 0,id,date,age,category-size,price,county,city,gender,pay,m-point,group
0,1001,2013-01-02,23,100-A,1200,china,beijing,male,Y,10,low
1,1002,2013-01-03,44,100-B,3299,china,shanghai,female,N,12,high
2,1003,2013-01-04,54,110-A,2133,china,guangzhou,male,Y,20,low
3,1004,2013-01-05,32,110-C,5433,china,shenzhen,female,Y,40,high
4,1005,2013-01-06,34,210-A,3299,china,shanghai,male,N,40,high
5,1006,2013-01-07,32,130-F,4432,china,beijing,female,Y,40,high


In [91]:
df_inner['group_1'] = df_inner.price.apply(lambda x: 'high' if x>3000 else 'low')
df_inner

Unnamed: 0,id,date,age,category-size,price,county,city,gender,pay,m-point,group,group_1
0,1001,2013-01-02,23,100-A,1200,china,beijing,male,Y,10,low,low
1,1002,2013-01-03,44,100-B,3299,china,shanghai,female,N,12,high,high
2,1003,2013-01-04,54,110-A,2133,china,guangzhou,male,Y,20,low,low
3,1004,2013-01-05,32,110-C,5433,china,shenzhen,female,Y,40,high,high
4,1005,2013-01-06,34,210-A,3299,china,shanghai,male,N,40,high,high
5,1006,2013-01-07,32,130-F,4432,china,beijing,female,Y,40,high,high


In [93]:
df_inner.loc[(df_inner.city=='beijing') & (df_inner.price>4000), 'sign'] = 1

In [94]:
df_inner

Unnamed: 0,id,date,age,category-size,price,county,city,gender,pay,m-point,group,group_1,sign
0,1001,2013-01-02,23,100-A,1200,china,beijing,male,Y,10,low,low,
1,1002,2013-01-03,44,100-B,3299,china,shanghai,female,N,12,high,high,
2,1003,2013-01-04,54,110-A,2133,china,guangzhou,male,Y,20,low,low,
3,1004,2013-01-05,32,110-C,5433,china,shenzhen,female,Y,40,high,high,
4,1005,2013-01-06,34,210-A,3299,china,shanghai,male,N,40,high,high,
5,1006,2013-01-07,32,130-F,4432,china,beijing,female,Y,40,high,high,1.0


In [95]:
df_inner.loc[df_inner.price>3000, 'group_2'] = 'high'
df_inner.loc[df_inner.price<=3000, 'group_2'] = 'low'

In [96]:
df_inner

Unnamed: 0,id,date,age,category-size,price,county,city,gender,pay,m-point,group,group_1,sign,group_2
0,1001,2013-01-02,23,100-A,1200,china,beijing,male,Y,10,low,low,,low
1,1002,2013-01-03,44,100-B,3299,china,shanghai,female,N,12,high,high,,high
2,1003,2013-01-04,54,110-A,2133,china,guangzhou,male,Y,20,low,low,,low
3,1004,2013-01-05,32,110-C,5433,china,shenzhen,female,Y,40,high,high,,high
4,1005,2013-01-06,34,210-A,3299,china,shanghai,male,N,40,high,high,,high
5,1006,2013-01-07,32,130-F,4432,china,beijing,female,Y,40,high,high,1.0,high


In [97]:
df_inner['category_1'] = df_inner['category-size'].apply(lambda x:re.split('\W+', x)[0])
df_inner['size'] = df_inner['category-size'].apply(lambda x:re.split('\W+', x)[1])
df_inner

Unnamed: 0,id,date,age,category-size,price,county,city,gender,pay,m-point,group,group_1,sign,group_2,category_1,size
0,1001,2013-01-02,23,100-A,1200,china,beijing,male,Y,10,low,low,,low,100,A
1,1002,2013-01-03,44,100-B,3299,china,shanghai,female,N,12,high,high,,high,100,B
2,1003,2013-01-04,54,110-A,2133,china,guangzhou,male,Y,20,low,low,,low,110,A
3,1004,2013-01-05,32,110-C,5433,china,shenzhen,female,Y,40,high,high,,high,110,C
4,1005,2013-01-06,34,210-A,3299,china,shanghai,male,N,40,high,high,,high,210,A
5,1006,2013-01-07,32,130-F,4432,china,beijing,female,Y,40,high,high,1.0,high,130,F


In [100]:
df_inner.sort_values(by='age', inplace=True)
df_inner

Unnamed: 0,id,date,age,category-size,price,county,city,gender,pay,m-point,group,group_1,sign,group_2,category_1,size
0,1001,2013-01-02,23,100-A,1200,china,beijing,male,Y,10,low,low,,low,100,A
3,1004,2013-01-05,32,110-C,5433,china,shenzhen,female,Y,40,high,high,,high,110,C
5,1006,2013-01-07,32,130-F,4432,china,beijing,female,Y,40,high,high,1.0,high,130,F
4,1005,2013-01-06,34,210-A,3299,china,shanghai,male,N,40,high,high,,high,210,A
1,1002,2013-01-03,44,100-B,3299,china,shanghai,female,N,12,high,high,,high,100,B
2,1003,2013-01-04,54,110-A,2133,china,guangzhou,male,Y,20,low,low,,low,110,A


In [102]:
df_inner.loc[0:5]

Unnamed: 0,id,date,age,category-size,price,county,city,gender,pay,m-point,group,group_1,sign,group_2,category_1,size
0,1001,2013-01-02,23,100-A,1200,china,beijing,male,Y,10,low,low,,low,100,A
3,1004,2013-01-05,32,110-C,5433,china,shenzhen,female,Y,40,high,high,,high,110,C
5,1006,2013-01-07,32,130-F,4432,china,beijing,female,Y,40,high,high,1.0,high,130,F


In [103]:
#reset_index: generate new sequencial index, and the previous index label become a column 'index'
df_inner.reset_index()

Unnamed: 0,index,id,date,age,category-size,price,county,city,gender,pay,m-point,group,group_1,sign,group_2,category_1,size
0,0,1001,2013-01-02,23,100-A,1200,china,beijing,male,Y,10,low,low,,low,100,A
1,3,1004,2013-01-05,32,110-C,5433,china,shenzhen,female,Y,40,high,high,,high,110,C
2,5,1006,2013-01-07,32,130-F,4432,china,beijing,female,Y,40,high,high,1.0,high,130,F
3,4,1005,2013-01-06,34,210-A,3299,china,shanghai,male,N,40,high,high,,high,210,A
4,1,1002,2013-01-03,44,100-B,3299,china,shanghai,female,N,12,high,high,,high,100,B
5,2,1003,2013-01-04,54,110-A,2133,china,guangzhou,male,Y,20,low,low,,low,110,A


In [111]:
df_inner.set_index('date', inplace=True)

In [112]:
df_inner

Unnamed: 0_level_0,id,age,category-size,price,county,city,gender,pay,m-point,group,group_1,sign,group_2,category_1,size
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2013-01-02,1001,23,100-A,1200,china,beijing,male,Y,10,low,low,,low,100,A
2013-01-05,1004,32,110-C,5433,china,shenzhen,female,Y,40,high,high,,high,110,C
2013-01-07,1006,32,130-F,4432,china,beijing,female,Y,40,high,high,1.0,high,130,F
2013-01-06,1005,34,210-A,3299,china,shanghai,male,N,40,high,high,,high,210,A
2013-01-03,1002,44,100-B,3299,china,shanghai,female,N,12,high,high,,high,100,B
2013-01-04,1003,54,110-A,2133,china,guangzhou,male,Y,20,low,low,,low,110,A


In [113]:
df_inner.iloc[:3, :2]

Unnamed: 0_level_0,id,age
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2013-01-02,1001,23
2013-01-05,1004,32
2013-01-07,1006,32


In [117]:
df_inner.iloc[[0,2,5], [3,6]]

Unnamed: 0_level_0,price,gender
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2013-01-02,1200,male
2013-01-07,4432,female
2013-01-04,2133,male


In [118]:
#iloc: extract by position, only take integer
#loc: extract by label or index
#iX: combination of iloc and loc, ix is deprecated in python 3

In [122]:
df_inner['city'].isin(['beijing'])

date
2013-01-02     True
2013-01-05    False
2013-01-07     True
2013-01-06    False
2013-01-03    False
2013-01-04    False
Name: city, dtype: bool

In [125]:
df_inner.query("city=='beijing' or city=='shanghai'")

Unnamed: 0_level_0,id,age,category-size,price,county,city,gender,pay,m-point,group,group_1,sign,group_2,category_1,size
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2013-01-02,1001,23,100-A,1200,china,beijing,male,Y,10,low,low,,low,100,A
2013-01-07,1006,32,130-F,4432,china,beijing,female,Y,40,high,high,1.0,high,130,F
2013-01-06,1005,34,210-A,3299,china,shanghai,male,N,40,high,high,,high,210,A
2013-01-03,1002,44,100-B,3299,china,shanghai,female,N,12,high,high,,high,100,B


In [132]:
df_inner.query("city==['beijing', 'shanghai']")

Unnamed: 0_level_0,id,age,category-size,price,county,city,gender,pay,m-point,group,group_1,sign,group_2,category_1,size
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2013-01-02,1001,23,100-A,1200,china,beijing,male,Y,10,low,low,,low,100,A
2013-01-07,1006,32,130-F,4432,china,beijing,female,Y,40,high,high,1.0,high,130,F
2013-01-06,1005,34,210-A,3299,china,shanghai,male,N,40,high,high,,high,210,A
2013-01-03,1002,44,100-B,3299,china,shanghai,female,N,12,high,high,,high,100,B


In [133]:
df_inner.query("city==['beijing', 'shanghai']").price.sum

<bound method Series.sum of date
2013-01-02    1200
2013-01-07    4432
2013-01-06    3299
2013-01-03    3299
Name: price, dtype: int64>

In [134]:
df_inner.query("city==['beijing', 'shanghai']").price.sum()

12230

In [126]:
df_inner[(df_inner.city=='beijing')|(df_inner.city=='shanghai')]

Unnamed: 0_level_0,id,age,category-size,price,county,city,gender,pay,m-point,group,group_1,sign,group_2,category_1,size
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2013-01-02,1001,23,100-A,1200,china,beijing,male,Y,10,low,low,,low,100,A
2013-01-07,1006,32,130-F,4432,china,beijing,female,Y,40,high,high,1.0,high,130,F
2013-01-06,1005,34,210-A,3299,china,shanghai,male,N,40,high,high,,high,210,A
2013-01-03,1002,44,100-B,3299,china,shanghai,female,N,12,high,high,,high,100,B


In [128]:
df_inner[df_inner.city.isin(['beijing','shanghai'])]

Unnamed: 0_level_0,id,age,category-size,price,county,city,gender,pay,m-point,group,group_1,sign,group_2,category_1,size
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2013-01-02,1001,23,100-A,1200,china,beijing,male,Y,10,low,low,,low,100,A
2013-01-07,1006,32,130-F,4432,china,beijing,female,Y,40,high,high,1.0,high,130,F
2013-01-06,1005,34,210-A,3299,china,shanghai,male,N,40,high,high,,high,210,A
2013-01-03,1002,44,100-B,3299,china,shanghai,female,N,12,high,high,,high,100,B


In [129]:
df_inner.loc[(df_inner.city=='beijing')|(df_inner.city=='shanghai')]

Unnamed: 0_level_0,id,age,category-size,price,county,city,gender,pay,m-point,group,group_1,sign,group_2,category_1,size
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2013-01-02,1001,23,100-A,1200,china,beijing,male,Y,10,low,low,,low,100,A
2013-01-07,1006,32,130-F,4432,china,beijing,female,Y,40,high,high,1.0,high,130,F
2013-01-06,1005,34,210-A,3299,china,shanghai,male,N,40,high,high,,high,210,A
2013-01-03,1002,44,100-B,3299,china,shanghai,female,N,12,high,high,,high,100,B


In [131]:
# or:|, and:&
df_inner.loc[(df_inner.age>25)|(df_inner.city=='beijing'), ['id','city','age','category','gender']]

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


Unnamed: 0_level_0,id,city,age,category,gender
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2013-01-02,1001,beijing,23,,male
2013-01-05,1004,shenzhen,32,,female
2013-01-07,1006,beijing,32,,female
2013-01-06,1005,shanghai,34,,male
2013-01-03,1002,shanghai,44,,female
2013-01-04,1003,guangzhou,54,,male


In [144]:
df_inner.query("city!='beijing'")[['id','city','age','category-size','gender']]

Unnamed: 0_level_0,id,city,age,category-size,gender
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2013-01-05,1004,shenzhen,32,110-C,female
2013-01-06,1005,shanghai,34,210-A,male
2013-01-03,1002,shanghai,44,100-B,female
2013-01-04,1003,guangzhou,54,110-A,male


In [145]:
df_inner.loc[(df_inner['city']!='beijing'), ['id','city','age','category-size','gender']]

Unnamed: 0_level_0,id,city,age,category-size,gender
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2013-01-05,1004,shenzhen,32,110-C,female
2013-01-06,1005,shanghai,34,210-A,male
2013-01-03,1002,shanghai,44,100-B,female
2013-01-04,1003,guangzhou,54,110-A,male


In [147]:
df_inner.loc[(df_inner['city']!='beijing'), ['id','city','age','category-size','gender']].sort_values(by='id')

Unnamed: 0_level_0,id,city,age,category-size,gender
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2013-01-03,1002,shanghai,44,100-B,female
2013-01-04,1003,guangzhou,54,110-A,male
2013-01-05,1004,shenzhen,32,110-C,female
2013-01-06,1005,shanghai,34,210-A,male


In [149]:
df_inner[['id','city']]

Unnamed: 0_level_0,id,city
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2013-01-02,1001,beijing
2013-01-05,1004,shenzhen
2013-01-07,1006,beijing
2013-01-06,1005,shanghai
2013-01-03,1002,shanghai
2013-01-04,1003,guangzhou


In [153]:
df_inner.groupby('city').size()

city
beijing      2
guangzhou    1
shanghai     2
shenzhen     1
dtype: int64

In [155]:
df_inner.groupby(['city','gender']).size()

city       gender
beijing    female    1
           male      1
guangzhou  male      1
shanghai   female    1
           male      1
shenzhen   female    1
dtype: int64

In [156]:
df_inner.groupby('city').count()

Unnamed: 0_level_0,id,age,category-size,price,county,gender,pay,m-point,group,group_1,sign,group_2,category_1,size
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
beijing,2,2,2,2,2,2,2,2,2,2,1,2,2,2
guangzhou,1,1,1,1,1,1,1,1,1,1,0,1,1,1
shanghai,2,2,2,2,2,2,2,2,2,2,0,2,2,2
shenzhen,1,1,1,1,1,1,1,1,1,1,0,1,1,1


In [158]:
df_inner.groupby(['city','size'])['id'].count()

city       size
beijing    A       1
           F       1
guangzhou  A       1
shanghai   A       1
           B       1
shenzhen   C       1
Name: id, dtype: int64

In [159]:
df_inner.groupby(['city','size'])['id'].size()

city       size
beijing    A       1
           F       1
guangzhou  A       1
shanghai   A       1
           B       1
shenzhen   C       1
Name: id, dtype: int64

In [160]:
df_inner.groupby('city')['price'].aggregate([len, np.sum, np.mean])

Unnamed: 0_level_0,len,sum,mean
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
beijing,2,5632,2816
guangzhou,1,2133,2133
shanghai,2,6598,3299
shenzhen,1,5433,5433


In [162]:
df_inner.groupby('city')['price'].sum()

city
beijing      5632
guangzhou    2133
shanghai     6598
shenzhen     5433
Name: price, dtype: int64

In [163]:
df_inner.sample(3)

Unnamed: 0_level_0,id,age,category-size,price,county,city,gender,pay,m-point,group,group_1,sign,group_2,category_1,size
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2013-01-07,1006,32,130-F,4432,china,beijing,female,Y,40,high,high,1.0,high,130,F
2013-01-05,1004,32,110-C,5433,china,shenzhen,female,Y,40,high,high,,high,110,C
2013-01-04,1003,54,110-A,2133,china,guangzhou,male,Y,20,low,low,,low,110,A


In [164]:
df_inner.sample?

In [170]:
weights = [0,0,0,0.01,0.49,0.5]
df_inner.sample(n=3, weights=weights)

Unnamed: 0_level_0,id,age,category-size,price,county,city,gender,pay,m-point,group,group_1,sign,group_2,category_1,size
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2013-01-04,1003,54,110-A,2133,china,guangzhou,male,Y,20,low,low,,low,110,A
2013-01-03,1002,44,100-B,3299,china,shanghai,female,N,12,high,high,,high,100,B
2013-01-06,1005,34,210-A,3299,china,shanghai,male,N,40,high,high,,high,210,A


In [171]:
weights = [0,0,0,0.01,0.49,0.5]
df_inner.sample(n=3, weights=weights, replace=True)

Unnamed: 0_level_0,id,age,category-size,price,county,city,gender,pay,m-point,group,group_1,sign,group_2,category_1,size
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2013-01-03,1002,44,100-B,3299,china,shanghai,female,N,12,high,high,,high,100,B
2013-01-04,1003,54,110-A,2133,china,guangzhou,male,Y,20,low,low,,low,110,A
2013-01-03,1002,44,100-B,3299,china,shanghai,female,N,12,high,high,,high,100,B


In [172]:
#replace=True: 又放回的取样，会有重复的sample
weights = [0,0,0,0.01,0.49,0.5]
df_inner.sample(n=3, weights=weights, replace=False)

Unnamed: 0_level_0,id,age,category-size,price,county,city,gender,pay,m-point,group,group_1,sign,group_2,category_1,size
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2013-01-04,1003,54,110-A,2133,china,guangzhou,male,Y,20,low,low,,low,110,A
2013-01-03,1002,44,100-B,3299,china,shanghai,female,N,12,high,high,,high,100,B
2013-01-06,1005,34,210-A,3299,china,shanghai,male,N,40,high,high,,high,210,A


In [175]:
df_inner.describe().T.round(2)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,6.0,1003.5,1.87,1001.0,1002.25,1003.5,1004.75,1006.0
age,6.0,36.5,10.88,23.0,32.0,33.0,41.5,54.0
price,6.0,3299.33,1523.35,1200.0,2424.5,3299.0,4148.75,5433.0
m-point,6.0,27.0,14.63,10.0,14.0,30.0,40.0,40.0
sign,1.0,1.0,,1.0,1.0,1.0,1.0,1.0


In [176]:
df_inner.corr()

Unnamed: 0,id,age,price,m-point,sign
id,1.0,-0.034401,0.682824,0.928096,
age,-0.034401,1.0,-0.08172,-0.194833,
price,0.682824,-0.08172,1.0,0.774666,
m-point,0.928096,-0.194833,0.774666,1.0,
sign,,,,,


In [178]:
df_inner.to_excel('df_inner.xlsx', sheet_name='Sheet_1')

In [179]:
df_inner.to_csv('df_inner.csv')