# PANDAS TUTORIAL

In [1]:
import numpy as np

In [2]:
import pandas as pd

## 10 minutes to pandas

### Object creation

##### Series 

In [3]:
# A Series built from a plain list of integers; no index is supplied.
s = pd.Series(data=[1, 3, 5, 8, 6, 0])

In [4]:
s  # an integer index (0..5) is created automatically

0    1
1    3
2    5
3    8
4    6
5    0
dtype: int64

##### DataFrame 

In [10]:
# `periods` is the number of consecutive days, starting at the given date,
# for which a timestamp is generated.
dates = pd.date_range(start="20130101", periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [47]:
# 6x4 frame of standard-normal draws: one row per entry of `dates`,
# columns labelled A-D.
# NOTE(review): no random seed is set in the cells shown, so the values
# change on every run and will not match previously saved outputs.
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list("ABCD"))
df

Unnamed: 0,A,B,C,D
2013-01-01,1.840819,-0.431351,-2.499161,0.906483
2013-01-02,-0.301082,0.810154,-2.052269,1.497925
2013-01-03,-0.199374,1.639223,1.993562,0.394749
2013-01-04,-2.093442,0.220406,-0.438023,-1.532158
2013-01-05,-0.73586,-0.921368,0.968376,-0.387609
2013-01-06,0.607657,0.589649,-1.271901,1.831074


In [36]:
# `index` fixes the row labels and `columns` the column labels;
# the first argument is the matrix of values that fills the table body.
df1 = pd.DataFrame(
    [[0] * 4 for _ in range(6)],
    index=dates,
    columns=list("ABCD"),
)
df1

Unnamed: 0,A,B,C,D
2013-01-01,0,0,0,0
2013-01-02,0,0,0,0
2013-01-03,0,0,0,0
2013-01-04,0,0,0,0
2013-01-05,0,0,0,0
2013-01-06,0,0,0,0


In [16]:
# Build a frame from a dictionary: each key becomes a column. Scalar
# entries (A, B, F) are repeated for every row of the 4-element columns.
df2 = pd.DataFrame(
    dict(
        A=1.0,
        B=pd.Timestamp("20130102"),
        C=pd.Series(1, index=list(range(4)), dtype="float32"),
        D=np.full(4, 3, dtype="int32"),
        E=pd.Categorical(["test", "train"] * 2),
        F="foo",
    )
)

df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


### Viewing Data 

In [17]:
df2.dtypes
# dtypes reports the element type of each column

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [18]:
# Column labels first, then the row index, one per line.
print(df2.columns, df2.index, sep="\n")

Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')
Int64Index([0, 1, 2, 3], dtype='int64')


In [20]:
# First two rows, then the last four rows, printed one after the other.
print(df2.head(2), df2.tail(4), sep="\n")

     A          B    C  D      E    F
0  1.0 2013-01-02  1.0  3   test  foo
1  1.0 2013-01-02  1.0  3  train  foo
     A          B    C  D      E    F
0  1.0 2013-01-02  1.0  3   test  foo
1  1.0 2013-01-02  1.0  3  train  foo
2  1.0 2013-01-02  1.0  3   test  foo
3  1.0 2013-01-02  1.0  3  train  foo


**Warning:** a NumPy array has a single dtype for the whole array, whereas a DataFrame has one dtype per column.

In [37]:
%%time

# df1 holds only integers, so the conversion yields a plain integer array.
df1.to_numpy()

CPU times: user 42 µs, sys: 0 ns, total: 42 µs
Wall time: 47 µs


array([[0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0]])

In [23]:
%%time

df2.to_numpy()  # converting to NumPy drops the row and column labels;
                # it works best when every column shares the same dtype
                # (here the mixed columns produce an object-dtype array)

CPU times: user 1.19 ms, sys: 773 µs, total: 1.96 ms
Wall time: 7.95 ms


array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [25]:
# Summary statistics (the output covers columns A, C and D), followed by
# the transposed frame.
print(df2.describe(), df2.T, sep="\n")

         A    C    D
count  4.0  4.0  4.0
mean   1.0  1.0  3.0
std    0.0  0.0  0.0
min    1.0  1.0  3.0
25%    1.0  1.0  3.0
50%    1.0  1.0  3.0
75%    1.0  1.0  3.0
max    1.0  1.0  3.0
                     0                    1                    2  \
A                    1                    1                    1   
B  2013-01-02 00:00:00  2013-01-02 00:00:00  2013-01-02 00:00:00   
C                    1                    1                    1   
D                    3                    3                    3   
E                 test                train                 test   
F                  foo                  foo                  foo   

                     3  
A                    1  
B  2013-01-02 00:00:00  
C                    1  
D                    3  
E                train  
F                  foo  


In [28]:
# Order the rows by column B (every B value is identical in this frame).
df2.sort_values("B")

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


### Selection 

##### Getting

In [30]:
# Selecting a single column by name returns that column with its index.
df2['E']

0     test
1    train
2     test
3    train
Name: E, dtype: category
Categories (2, object): ['test', 'train']

In [32]:
df2[0:3]  # slicing with [] selects rows (here the first three)

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo


In [38]:
# Row slice by date strings on the date index; both end dates appear in the result.
df1["20130102":"20130104"]

Unnamed: 0,A,B,C,D
2013-01-02,0,0,0,0
2013-01-03,0,0,0,0
2013-01-04,0,0,0,0


##### Selection by labels - LOC

In [52]:
df2.loc[0:2,'A':'C']  # label-based slice: rows 0 to 2, columns 'A' to 'C'
# the end label is included in the result

Unnamed: 0,A,B,C
0,1.0,2013-01-02,1.0
1,1.0,2013-01-02,1.0
2,1.0,2013-01-02,1.0


In [41]:
df2.loc[:,['A','C']]  # all rows, but only columns 'A' and 'C'

Unnamed: 0,A,C
0,1.0,1.0
1,1.0,1.0
2,1.0,1.0
3,1.0,1.0


In [48]:
# Both accessors fetch the single value at row label 0, column 'B'.
print(df2.loc[0, "B"], df2.at[0, "B"], sep="\n")

2013-01-02 00:00:00
2013-01-02 00:00:00


##### Selection by position - ILOC

In [54]:
# Redisplay df as a reference for the positional selections below.
df

Unnamed: 0,A,B,C,D
2013-01-01,1.840819,-0.431351,-2.499161,0.906483
2013-01-02,-0.301082,0.810154,-2.052269,1.497925
2013-01-03,-0.199374,1.639223,1.993562,0.394749
2013-01-04,-2.093442,0.220406,-0.438023,-1.532158
2013-01-05,-0.73586,-0.921368,0.968376,-0.387609
2013-01-06,0.607657,0.589649,-1.271901,1.831074


In [50]:
df.iloc[3]  # row at position 3, i.e. the fourth row

A   -2.093442
B    0.220406
C   -0.438023
D   -1.532158
Name: 2013-01-04 00:00:00, dtype: float64

In [51]:
df.iloc[3:5,0:2]  # positional slice: the end bound is excluded

Unnamed: 0,A,B
2013-01-04,-2.093442,0.220406
2013-01-05,-0.73586,-0.921368


In [53]:
# Single value at row position 1, column position 1, shown at full precision.
df.iloc[1,1]

0.8101538996970606

In [55]:
df[df['A'] > 0]  # boolean masks work the usual way: keep rows where A > 0

Unnamed: 0,A,B,C,D
2013-01-01,1.840819,-0.431351,-2.499161,0.906483
2013-01-06,0.607657,0.589649,-1.271901,1.831074


In [56]:
# Complementary filter: keep the rows where A is negative.
df[df['A'] < 0]

Unnamed: 0,A,B,C,D
2013-01-02,-0.301082,0.810154,-2.052269,1.497925
2013-01-03,-0.199374,1.639223,1.993562,0.394749
2013-01-04,-2.093442,0.220406,-0.438023,-1.532158
2013-01-05,-0.73586,-0.921368,0.968376,-0.387609


In [57]:
# Element-wise mask: the shape is kept and entries that are not > 0 come back as missing.
df[df>0]

Unnamed: 0,A,B,C,D
2013-01-01,1.840819,,,0.906483
2013-01-02,,0.810154,,1.497925
2013-01-03,,1.639223,1.993562,0.394749
2013-01-04,,0.220406,,
2013-01-05,,,0.968376,
2013-01-06,0.607657,0.589649,,1.831074


In [58]:
# Derive a new frame with an extra string column E; `assign` returns a
# new object, so `df` itself is left unchanged.
df3 = df.assign(E=["one", "two", "two", "three", "four", "five"])
df3

Unnamed: 0,A,B,C,D,E
2013-01-01,1.840819,-0.431351,-2.499161,0.906483,one
2013-01-02,-0.301082,0.810154,-2.052269,1.497925,two
2013-01-03,-0.199374,1.639223,1.993562,0.394749,two
2013-01-04,-2.093442,0.220406,-0.438023,-1.532158,three
2013-01-05,-0.73586,-0.921368,0.968376,-0.387609,four
2013-01-06,0.607657,0.589649,-1.271901,1.831074,five


In [60]:
# Filtering: keep only the rows whose E value is 'two' or 'five'.
df3.loc[df3["E"].isin(["two", "five"])]

Unnamed: 0,A,B,C,D,E
2013-01-02,-0.301082,0.810154,-2.052269,1.497925,two
2013-01-03,-0.199374,1.639223,1.993562,0.394749,two
2013-01-06,0.607657,0.589649,-1.271901,1.831074,five


In [61]:
df3['E'].isin(['two','five'])  # the boolean mask itself: one True/False per row

2013-01-01    False
2013-01-02     True
2013-01-03     True
2013-01-04    False
2013-01-05    False
2013-01-06     True
Freq: D, Name: E, dtype: bool

In [64]:
df3[df3['B'].isin([0.810154,1.639223])] 
# Returns no rows: the literals are the rounded values shown in the table,
# not the stored floats (e.g. df.iloc[1,1] displays 0.8101538996970606).

Unnamed: 0,A,B,C,D,E


In [65]:
# A comparison (rather than an equality test) filters on the float column as expected.
df3[df3['B'] > 0]

Unnamed: 0,A,B,C,D,E
2013-01-02,-0.301082,0.810154,-2.052269,1.497925,two
2013-01-03,-0.199374,1.639223,1.993562,0.394749,two
2013-01-04,-2.093442,0.220406,-0.438023,-1.532158,three
2013-01-06,0.607657,0.589649,-1.271901,1.831074,five
