In [1]:
import numpy as np
import pandas as pd

In [4]:
#two index
data = pd.Series(np.random.rand(6), index=[['a','a','b','b','c','c'], [1,2,3,4,5,2]])
data

a  1    0.535373
   2    0.115353
b  3    0.800252
   4    0.684149
c  5    0.657277
   2    0.038429
dtype: float64

In [5]:
#let's catch the values of the external index
data['a']

1    0.535373
2    0.115353
dtype: float64

In [None]:
data[:,2] #we pick values from the internal index

a    0.115353
c    0.038429
dtype: float64

In [14]:
#we can turn a dataframe into series where the row index will be the external index while the column index will be the internal one
#so with the stack function df-> series
#unstack function series -> dataframe

#example with unstack
df_unstack = data.unstack()
df_unstack


Unnamed: 0,1,2,3,4,5
a,0.535373,0.115353,,,
b,,,0.800252,0.684149,
c,,0.038429,,,0.657277


In [21]:
df_unstack.stack()


a  1    0.535373
   2    0.115353
b  3    0.800252
   4    0.684149
c  2    0.038429
   5    0.657277
dtype: float64

In [27]:
#also dataframe can have two index 

df_with_twoindex = pd.DataFrame(np.arange(12).reshape(4,3),
                                index=[['a','a','b','b'], [1,2,1,2]],
                                columns=[['ohio','ohio','colorado'], ['green','red','green']]
                                )
df_with_twoindex

Unnamed: 0_level_0,Unnamed: 1_level_0,ohio,ohio,colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,green,red,green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [25]:
#access to the index

df_with_twoindex.index

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [34]:
df_with_twoindex['ohio':'a']

Unnamed: 0,Unnamed: 1
a,1
a,2
b,1
b,2


In [None]:
df_with_twoindex.loc["a",'colorado'] #access to the single cells of df

Unnamed: 0,green
1,2
2,5


In [67]:
data2 = [0,4,2,5,7,8]
df_one = pd.DataFrame(np.arange(6).reshape(3, 2), columns=["a", "b"], index=['colorado', 'newyork', 'nj'])
df_two = pd.DataFrame(np.array(data2).reshape(3, 2), columns=["a", "c"], index=['bolo', 'milan', 'rome'])

In [68]:
df_one

Unnamed: 0,a,b
colorado,0,1
newyork,2,3
nj,4,5


In [69]:
df_two


Unnamed: 0,a,c
bolo,0,4
milan,2,5
rome,7,8


In [70]:
pd.merge(df_one, df_two)

Unnamed: 0,a,b,c
0,0,1,4
1,2,3,5


In [74]:
pd.merge(df_one, df_two, on='a')

Unnamed: 0,a,b,c
0,0,1,4
1,2,3,5


In [79]:
pd.merge(df_two, df_one, left_on='a', right_on='a', how='left')

Unnamed: 0,a,c,b
0,0,4,1.0
1,2,5,3.0
2,7,8,


CONCATENATING ALONG AN AXIS

In [82]:
arr_one = np.arange(12).reshape(3,4)
arr_two = np.random.rand(3,4)
arr_two

array([[0.73782439, 0.44243679, 0.46330075, 0.96274325],
       [0.44342732, 0.31096043, 0.16112883, 0.9459954 ],
       [0.29274395, 0.48388636, 0.06770265, 0.78033837]])

In [83]:
np.concatenate([arr_one,arr_two])

array([[ 0.        ,  1.        ,  2.        ,  3.        ],
       [ 4.        ,  5.        ,  6.        ,  7.        ],
       [ 8.        ,  9.        , 10.        , 11.        ],
       [ 0.73782439,  0.44243679,  0.46330075,  0.96274325],
       [ 0.44342732,  0.31096043,  0.16112883,  0.9459954 ],
       [ 0.29274395,  0.48388636,  0.06770265,  0.78033837]])

In [85]:
np.concatenate([arr_one, arr_two], axis=1)

array([[ 0.        ,  1.        ,  2.        ,  3.        ,  0.73782439,
         0.44243679,  0.46330075,  0.96274325],
       [ 4.        ,  5.        ,  6.        ,  7.        ,  0.44342732,
         0.31096043,  0.16112883,  0.9459954 ],
       [ 8.        ,  9.        , 10.        , 11.        ,  0.29274395,
         0.48388636,  0.06770265,  0.78033837]])

In [89]:
pd.concat([df_one, df_two])

Unnamed: 0,a,b,c
colorado,0,1.0,
newyork,2,3.0,
nj,4,5.0,
bolo,0,,4.0
milan,2,,5.0
rome,7,,8.0


In [99]:
df = pd.DataFrame({'key1' : ['a', 'a', 'b', 'b', 'a'],
                   'key2' : ['one', 'two', 'one', 'two', 'one'],
                   'data1' : np.random.randn(5),
                   'data2' : np.random.randn(5)})
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,0.561376,0.230785
1,a,two,-0.011489,0.823625
2,b,one,-0.460976,0.312512
3,b,two,-0.10125,0.199012
4,a,one,0.93684,-2.866166


In [100]:
grouped = df['data1'].groupby(df['key1'])
grouped #groups of values of column data1 based on the group of key1

<pandas.core.groupby.generic.SeriesGroupBy object at 0x000001FAC1C0E8D0>

In [101]:
grouped.max()

key1
a    0.93684
b   -0.10125
Name: data1, dtype: float64

In [102]:
grouped.mean()

key1
a    0.495576
b   -0.281113
Name: data1, dtype: float64

In [103]:
df['data1'].groupby([df['key1'], df['key2']]).mean()

key1  key2
a     one     0.749108
      two    -0.011489
b     one    -0.460976
      two    -0.101250
Name: data1, dtype: float64

In [104]:
df['data1'].groupby([df['key1'], df['key2']]).mean().unstack()

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.749108,-0.011489
b,-0.460976,-0.10125


In [None]:
df.groupby(df['key1']).mean(numeric_only=True) #because there is some columns that aren't numerical 

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.495576,-0.603919
b,-0.281113,0.255762
