# Pandas

In [8]:
import pandas as pd # import pandas library

### DataFrame

In [9]:
# Three employees, one row each, given as (name, salary) pairs.
data = pd.DataFrame(
    [('Houari', 300), ('Anoir', 400), ('Khaled', 450)],
    columns=['names', 'salary'],
)
data

Unnamed: 0,names,salary
0,Houari,300
1,Anoir,400
2,Khaled,450


In [10]:
# Summary statistics; only the numeric `salary` column is reported.
data.describe()

Unnamed: 0,salary
count,3.0
mean,383.333333
std,76.376262
min,300.0
25%,350.0
50%,400.0
75%,425.0
max,450.0


In [11]:
# Per-column non-null counts and dtypes, plus the frame's memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
names     3 non-null object
salary    3 non-null int64
dtypes: int64(1), object(1)
memory usage: 128.0+ bytes


In [12]:
data.sort_values('names', ascending=False)  # Z to A; pass ascending=True for A to Z

Unnamed: 0,names,salary
2,Khaled,450
0,Houari,300
1,Anoir,400


In [13]:
# List repetition and concatenation: four 3s followed by two 1s.
[3] * 4 + [1] * 2

[3, 3, 3, 3, 1, 1]

In [14]:
from pandas import DataFrame

# `x` holds four 2s followed by two 1s, so rows 0 and 1 are exact duplicates.
df = DataFrame({
    'x': [2, 2, 2, 2, 1, 1],
    'y': [2, 2, 3, 9, 0, 1],
})
df

Unnamed: 0,x,y
0,2,2
1,2,2
2,2,3
3,2,9
4,1,0
5,1,1


In [15]:
df.sort_values('y')  # ascending by y, which is the default order

Unnamed: 0,x,y
4,1,0
5,1,1
0,2,2
1,2,2
2,2,3
3,2,9


In [16]:
# Drop rows that repeat an earlier row in every column (row 1 duplicates row 0).
df.drop_duplicates()

Unnamed: 0,x,y
0,2,2
2,2,3
3,2,9
4,1,0
5,1,1


In [17]:
df.drop_duplicates(subset='x') # keep only the first row for each distinct value of x

Unnamed: 0,x,y
0,2,2
4,1,0


In [18]:
def labeling(df):
    """Label one row as 'similar' or 'different'.

    Despite the parameter name, ``df`` is a single row (a pandas Series):
    that is what ``DataFrame.apply(labeling, axis='columns')`` passes in.

    Returns 'similar' when the first two values of the row are equal,
    otherwise 'different'.
    """
    # Compare by position with .iloc. Plain df[0] / df[1] on a row whose labels
    # are strings ('x', 'y') relies on a deprecated positional fallback.
    if df.iloc[0] == df.iloc[1]:
        return 'similar'
    return 'different'

# axis='columns' passes each row to labeling; the labels are stored in a new 'stat' column.
df['stat'] = df.apply(labeling, axis='columns')
df

Unnamed: 0,x,y,stat
0,2,2,similar
1,2,2,similar
2,2,3,different
3,2,9,different
4,1,0,different
5,1,1,similar


In [19]:
# Build x and y, then derive result = x * y in one chained expression.
data = (
    DataFrame({'x': [1, 20, 30, 42, 25], 'y': [1, 2, 3, 6, 4]})
    .assign(result=lambda frame: frame['x'] * frame['y'])
)
data

Unnamed: 0,x,y,result
0,1,1,1
1,20,2,40
2,30,3,90
3,42,6,252
4,25,4,100


In [20]:
data.drop(columns='x')  # show the frame without column x

Unnamed: 0,y,result
0,1,1
1,2,40
2,3,90
3,6,252
4,4,100


### Series

In [21]:
# A Series built from integers: default 0..4 index, dtype int64.
pd.Series([1, 2, 3, 6, 5])

0    1
1    2
2    3
3    6
4    5
dtype: int64

In [22]:
pd.Series([2., 3, 4, 2, 9]) # a single float literal (2.) is enough to make the whole Series float64

0    2.0
1    3.0
2    4.0
3    2.0
4    9.0
dtype: float64

In [23]:
import numpy as np
data.replace(2, np.nan) # replace every 2 in the table with NaN; y is then shown as floats

Unnamed: 0,x,y,result
0,1,1.0,1
1,20,,40
2,30,3.0,90
3,42,6.0,252
4,25,4.0,100


In [24]:
# 4x3 grid of the integers 0..11, with one labelled row per person.
d = DataFrame(
    np.arange(12).reshape(4, 3),
    index=['mohamed', 'omar', 'khaled', 'ali'],
)
d

Unnamed: 0,0,1,2
mohamed,0,1,2
omar,3,4,5
khaled,6,7,8
ali,9,10,11


In [25]:
# Preview the upper-cased index labels; the result is not assigned back to d here.
d.index.map(str.upper)

Index(['MOHAMED', 'OMAR', 'KHALED', 'ALI'], dtype='object')

In [26]:
# Assign the upper-cased labels back so that d itself is updated.
d.index = d.index.map(str.upper)
d

Unnamed: 0,0,1,2
MOHAMED,0,1,2
OMAR,3,4,5
KHALED,6,7,8
ALI,9,10,11


In [27]:
# Relabel the 'OMAR' row as 'Houari' in the displayed result.
d.rename(index={'OMAR':'Houari'})

Unnamed: 0,0,1,2
MOHAMED,0,1,2
Houari,3,4,5
KHALED,6,7,8
ALI,9,10,11


### Groups by Category

In [28]:
# Bin edges: each age is placed in a right-closed interval (low, high].
bins = [18, 25, 60, 90]
ages = [20, 22, 25, 30, 20, 33, 37, 61, 45, 41]

# Assign every age to its interval.
cats = pd.cut(x=ages, bins=bins)
cats

[(18, 25], (18, 25], (18, 25], (25, 60], (18, 25], (25, 60], (25, 60], (60, 90], (25, 60], (25, 60]]
Categories (3, interval[int64]): [(18, 25] < (25, 60] < (60, 90]]

In [29]:
# The interval categories produced by the cut, in order.
cats.categories

IntervalIndex([(18, 25], (25, 60], (60, 90]]
              closed='right',
              dtype='interval[int64]')

In [30]:
# Count how many ages fall in each interval, most frequent first.
# The top-level pd.value_counts is deprecated, so wrap the Categorical in a
# Series and use the Series method instead.
pd.Series(cats).value_counts()

(25, 60]    5
(18, 25]    4
(60, 90]    1
dtype: int64

In [31]:
# One label per interval, listed in the same order as the bins.
groups = ['youth', 'middleAged', 'Senior']

pd.cut(ages, bins, labels=groups)

[youth, youth, youth, middleAged, youth, middleAged, middleAged, Senior, middleAged, middleAged]
Categories (3, object): [youth < middleAged < Senior]

In [32]:
 # Count ages per named group, most frequent first. The top-level
 # pd.value_counts is deprecated, so use the Series method instead.
 pd.Series(pd.cut(ages, bins, labels=groups)).value_counts()

middleAged    5
youth         4
Senior        1
dtype: int64

In [42]:
# Seeded local generator so W and Z are identical on every fresh run
# (the unseeded np.random.randn calls made the results unrepeatable).
rng = np.random.RandomState(42)

# X and Y are grouping keys; W and Z are standard-normal random values.
df = DataFrame({'X' : ['a', 'a', 'b', 'a', 'b'],
              'Y' : ['one', 'two', 'one', 'two', 'one'],
              'W' : rng.randn(5),
              'Z' : rng.randn(5)})
df

Unnamed: 0,X,Y,W,Z
0,a,one,-0.129211,-0.715535
1,a,two,-0.544908,1.367317
2,b,one,-0.155293,1.580612
3,a,two,0.127598,0.556871
4,b,one,1.058185,0.611479


In [53]:
df.groupby('X')['W'].mean()  # average of W within each X group

X
a   -0.182174
b    0.451446
Name: W, dtype: float64

### Selecting data from a DataFrame

In [59]:
df[0:3] # first three rows (the end of the slice is excluded)

Unnamed: 0,X,Y,W,Z
0,a,one,-0.129211,-0.715535
1,a,two,-0.544908,1.367317
2,b,one,-0.155293,1.580612


In [64]:
df.loc[:, ['X', 'Z']] # all rows, only columns X and Z (selected by label)

Unnamed: 0,X,Z
0,a,-0.715535
1,a,1.367317
2,b,1.580612
3,a,0.556871
4,b,0.611479


In [73]:
df.iloc[2] # third row (integer position 2, counting from 0), returned as a Series

X           b
Y         one
W   -0.155293
Z     1.58061
Name: 2, dtype: object

In [69]:
df.iloc[0:2, 0:3] # first 2 rows and first 3 columns, by integer position

Unnamed: 0,X,Y,W
0,a,one,-0.129211
1,a,two,-0.544908


In [71]:
df.iloc[:, 0:2] # all rows, first 2 columns

Unnamed: 0,X,Y
0,a,one
1,a,two
2,b,one
3,a,two
4,b,one


In [75]:
df.iloc[2, 3] # single value at row position 2, column position 3 (both 0-based)

1.5806124230948482

### Dates as the DataFrame index

In [86]:
# Seeded local generator so the random values are identical on every fresh run
# (the unseeded np.random.randn call made the results unrepeatable).
rng = np.random.RandomState(42)

# Seven consecutive days starting 2019-02-05 index the rows; columns are A, B, C.
df = DataFrame(rng.randn(7, 3),
               index = pd.date_range('20190205', periods=7),
               columns=list('ABC')
              )

df

Unnamed: 0,A,B,C
2019-02-05,-0.115373,-1.23816,0.797144
2019-02-06,-0.854672,1.081663,-1.480701
2019-02-07,1.004604,1.102799,1.417067
2019-02-08,-2.021457,1.462289,-0.687419
2019-02-09,-0.724971,0.638925,-0.649032
2019-02-10,-0.670703,1.869971,-0.540427
2019-02-11,1.156592,0.453676,-0.354533


### Filter data

In [87]:
# Show the full frame again as the starting point for the filters below.
df

Unnamed: 0,A,B,C
2019-02-05,-0.115373,-1.23816,0.797144
2019-02-06,-0.854672,1.081663,-1.480701
2019-02-07,1.004604,1.102799,1.417067
2019-02-08,-2.021457,1.462289,-0.687419
2019-02-09,-0.724971,0.638925,-0.649032
2019-02-10,-0.670703,1.869971,-0.540427
2019-02-11,1.156592,0.453676,-0.354533


In [89]:
# Keep only the rows where A is strictly between -1 and 1.
df.query('A > -1 & A < 1')

Unnamed: 0,A,B,C
2019-02-05,-0.115373,-1.23816,0.797144
2019-02-06,-0.854672,1.081663,-1.480701
2019-02-09,-0.724971,0.638925,-0.649032
2019-02-10,-0.670703,1.869971,-0.540427


In [91]:
# Keep the rows where B is positive or C is smaller than A.
df.query('B > 0 | C < A')

Unnamed: 0,A,B,C
2019-02-06,-0.854672,1.081663,-1.480701
2019-02-07,1.004604,1.102799,1.417067
2019-02-08,-2.021457,1.462289,-0.687419
2019-02-09,-0.724971,0.638925,-0.649032
2019-02-10,-0.670703,1.869971,-0.540427
2019-02-11,1.156592,0.453676,-0.354533


In [101]:
# Keep the columns whose label contains 'B'.
df.filter(like = 'B')

Unnamed: 0,B
2019-02-05,-1.23816
2019-02-06,1.081663
2019-02-07,1.102799
2019-02-08,1.462289
2019-02-09,0.638925
2019-02-10,1.869971
2019-02-11,0.453676


In [105]:
df.B.value_counts() # number of occurrences of each distinct value in column B

 0.638925    1
 1.869971    1
 1.462289    1
-1.238160    1
 1.102799    1
 1.081663    1
 0.453676    1
Name: B, dtype: int64

In [107]:
df.B.value_counts().cumsum() # running total of the counts above

 0.638925    1
 1.869971    2
 1.462289    3
-1.238160    4
 1.102799    5
 1.081663    6
 0.453676    7
Name: B, dtype: int64

In [113]:
df.apply(lambda x : x / x.sum() * 10, axis='columns') # rescale each row so it sums to 10, using an anonymous (lambda) function

Unnamed: 0,A,B,C
2019-02-05,2.073607,22.253478,-14.327085
2019-02-06,6.817143,-8.627697,11.810553
2019-02-07,2.850368,3.128978,4.020654
2019-02-08,16.215927,-11.730338,5.514411
2019-02-09,9.862503,-8.69194,8.829437
2019-02-10,-10.180041,28.382739,-8.202699
2019-02-11,9.210473,3.612833,-2.823306


In [114]:
df.apply(lambda x : x / x.sum() * 10, axis='index') # rescale each column so it sums to 10 (the lambda receives one column at a time)

Unnamed: 0,A,B,C
2019-02-05,0.518303,-2.305199,-5.321737
2019-02-06,3.839531,2.013834,9.885167
2019-02-07,-4.513084,2.053184,-9.460348
2019-02-08,9.081199,2.722481,4.589217
2019-02-09,3.25686,1.189547,4.332944
2019-02-10,3.013067,3.481501,3.607896
2019-02-11,-5.195876,0.844652,2.366861


In [117]:
df.transpose() # swap rows and columns: the dates become the column labels

Unnamed: 0,2019-02-05 00:00:00,2019-02-06 00:00:00,2019-02-07 00:00:00,2019-02-08 00:00:00,2019-02-09 00:00:00,2019-02-10 00:00:00,2019-02-11 00:00:00
A,-0.115373,-0.854672,1.004604,-2.021457,-0.724971,-0.670703,1.156592
B,-1.23816,1.081663,1.102799,1.462289,0.638925,1.869971,0.453676
C,0.797144,-1.480701,1.417067,-0.687419,-0.649032,-0.540427,-0.354533


In [119]:
df.head(3).T  # first 3 rows, transposed (.T is shorthand for .transpose())

Unnamed: 0,2019-02-05 00:00:00,2019-02-06 00:00:00,2019-02-07 00:00:00
A,-0.115373,-0.854672,1.004604
B,-1.23816,1.081663,1.102799
C,0.797144,-1.480701,1.417067


In [120]:
# Data type of each column.
df.dtypes

A    float64
B    float64
C    float64
dtype: object