In [1]:
import pandas as pd
import scipy.stats

In [2]:
maDataFrame = pd.read_csv('data/ozone.csv')
maDataFrame

Unnamed: 0,JOUR,O3obs,MOCAGE,TEMPE,RMH2O,NO2,NO,STATION,VentMOD,VentANG
0,1,91,93.2,21.5,0.00847,1.602,0.424,Aix,9.5000,-0.64350
1,1,100,104.6,20.2,0.00881,2.121,0.531,Aix,8.0100,-0.04996
2,0,82,103.6,17.4,0.00951,1.657,0.467,Aix,9.3771,-0.12832
3,0,94,94.8,18.8,0.00855,2.350,0.701,Aix,9.4578,-0.34516
4,0,107,99.0,23.7,0.00731,1.653,0.452,Aix,7.8791,-0.41822
...,...,...,...,...,...,...,...,...,...,...
1036,0,116,233.6,22.1,0.01048,4.843,0.431,Pla,4.8052,-0.20964
1037,0,60,162.5,15.8,0.00789,3.166,0.411,Pla,2.7203,-0.62880
1038,0,74,198.4,16.2,0.00811,8.349,0.942,Pla,4.0311,0.12435
1039,0,121,191.7,16.4,0.00722,2.101,0.199,Pla,3.8000,0.00000


## Quantitive variables

In [3]:
print( maDataFrame.O3obs.min() )
print( maDataFrame.O3obs.max() )
print( maDataFrame.O3obs.mean() )

19
319
115.40057636887607


In [4]:
# variance (sigma²)
print( maDataFrame.O3obs.var() ) # default ddof value is 1 (unbiaised)
print( maDataFrame.O3obs.var(ddof=0) )# biaised variance = variance on sample
print( maDataFrame.O3obs.var(ddof=1) )# unbiased variance (on global pop = n/(n-1) * variance 

1680.9307304367082
1679.316003510256
1680.9307304367082


In [5]:
# ecart-type (sigma)
maDataFrame.O3obs.var(ddof=1)

1680.9307304367082

In [6]:
# quantile
print( maDataFrame.O3obs.quantile(0.5) ) # half of sample values < quantile(0.5) < half of sample values == median
# inverse quantile : get percentage of sample values inferior to target
print(  scipy.stats.percentileofscore(maDataFrame.O3obs, 300) )

109.0
99.90393852065321


In [7]:
# DESCRIBE DATAFRAME : give all significative distribution values
maDataFrame.describe()
# on selected column list
maDataFrame.describe()[['O3obs','MOCAGE','TEMPE']]

Unnamed: 0,O3obs,MOCAGE,TEMPE
count,1041.0,1041.0,1041.0
mean,115.400576,127.219693,23.883381
std,40.999155,39.820186,5.217462
min,19.0,46.4,10.4
25%,87.0,97.5,20.2
50%,109.0,125.6,23.8
75%,135.0,153.6,27.6
max,319.0,284.7,38.0


## Qualitative variables

In [8]:
maDataFrame = pd.read_csv('data/whickham.csv')
maDataFrame.head()

Unnamed: 0,outcome,smoker,age
0,Alive,Yes,23
1,Alive,Yes,18
2,Dead,Yes,71
3,Alive,No,67
4,Alive,No,64


In [9]:
# counting categorial values of data
print( pd.DataFrame(maDataFrame.smoker.value_counts()) )
pd.DataFrame(maDataFrame.outcome.value_counts(normalize=True)) # print with percentage [0;1]

     smoker
No      732
Yes     582


Unnamed: 0,outcome
Alive,0.719178
Dead,0.280822


In [10]:
# GENERATE A NEW DATAFRAME BY GROUPING DATA PER VALUE OF TARDGETED COLUMN and counting value from another
byFumeur = maDataFrame.groupby('smoker').outcome.value_counts(normalize=True)
byFumeur.unstack() # print grouping data frame as a normal dataframe (readable friendly)

outcome,Alive,Dead
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1
No,0.685792,0.314208
Yes,0.761168,0.238832


In [11]:
# CREATE NEW CATEGORIAL COLUMN IN DATAFRAME FROM QUANTITATIVE VARIABLE

maDataFrame['CatAge']= pd.cut(maDataFrame.age,[0,20,30,40,50,60,65,70,100],
                              labels=['0-20','20-30','30-40','40-50','50-60','60-65','65-70','+70'])
maDataFrame

Unnamed: 0,outcome,smoker,age,CatAge
0,Alive,Yes,23,20-30
1,Alive,Yes,18,0-20
2,Dead,Yes,71,+70
3,Alive,No,67,65-70
4,Alive,No,64,60-65
...,...,...,...,...
1309,Alive,Yes,35,30-40
1310,Alive,No,33,30-40
1311,Alive,Yes,21,20-30
1312,Alive,No,46,40-50


In [12]:
# RECOUNTS DATA BY GROUPING ON 2 COLUMNS 
parAge= maDataFrame.groupby(['CatAge','smoker']).outcome.value_counts(normalize=True)
parAge

CatAge  smoker  outcome
0-20    No      Alive      0.966667
                Dead       0.033333
        Yes     Alive      0.958333
                Dead       0.041667
20-30   No      Alive      0.985185
                Dead       0.014815
        Yes     Alive      0.979798
                Dead       0.020202
30-40   No      Alive      0.955224
                Dead       0.044776
        Yes     Alive      0.940678
                Dead       0.059322
40-50   No      Alive      0.867470
                Dead       0.132530
        Yes     Alive      0.828125
                Dead       0.171875
50-60   No      Alive      0.722222
                Dead       0.277778
        Yes     Alive      0.616071
                Dead       0.383929
60-65   No      Alive      0.596774
                Dead       0.403226
        Yes     Alive      0.596154
                Dead       0.403846
65-70   No      Dead       0.758065
                Alive      0.241935
        Yes     Dead       0.812500
    