In [3]:
# Descriptive statistics examples with Python

import numpy as np # NumPy: arrays and basic statistics
from scipy import stats # scipy.stats: used below to compute the mode
import pandas as pd # pandas: labelled DataFrame summaries

np.random.seed(2131982) # seed NumPy's global RNG so the random data can be replicated

In [7]:
# Draw a 5x4 matrix of standard-normal samples from NumPy's global RNG
# (seeded in the setup cell, so the values can be replicated).
datos = np.random.standard_normal(size=(5, 4))
datos

array([[ 0.04072584, -0.20752267, -1.25471621,  1.02200627],
       [ 0.11412021, -0.69563936, -0.46166141,  0.74263255],
       [-0.65640569,  0.15759185, -1.11944404,  1.27042332],
       [ 0.88126147, -0.56305738, -1.00973234,  1.20856547],
       [ 0.13592767,  2.88003542, -0.96333006,  0.21124599]])

In [9]:
# Arithmetic mean
datos.mean() # arithmetic mean over all elements of the array (no axis given)

0.08665134490453297

In [10]:
np.mean(datos) # same result as datos.mean(), via the NumPy module-level function

0.08665134490453297

In [11]:
datos.mean(axis=1) # arithmetic mean of each row

array([-0.09987669, -0.075137  , -0.08695864,  0.12925931,  0.56596975])

In [12]:
datos.mean(axis=0) # arithmetic mean of each column

array([ 0.1031259 ,  0.31428157, -0.96177681,  0.89097472])

In [13]:
# Median
np.median(datos) # median over all elements of the array

0.07742302456156587

In [14]:
np.median(datos, axis=0) # median of each column

array([ 0.11412021, -0.20752267, -1.00973234,  1.02200627])

In [15]:
 # Standard deviation
np.std(datos) # standard deviation over all elements (NumPy's default, ddof=0)

0.9989325794169909

In [16]:
np.std(datos, axis=0) # standard deviation of each column

array([0.48733434, 1.31678305, 0.26947728, 0.38627735])

In [17]:
# Variance
np.var(datos) # variance over all elements of the array

0.9978662982206827

In [18]:
np.var(datos, axis=0) # variance of each column

array([0.23749476, 1.73391761, 0.072618  , 0.14921019])

In [19]:
# Mode
stats.mode(datos) # computes the mode of each column
# The second array in the result holds the frequency (count) of each mode.
# NOTE(review): the recorded counts are all 1, i.e. no value repeats in these
# continuous data, and each reported "mode" coincides with the column minimum.
# NOTE(review): newer SciPy releases changed the default output shape of
# stats.mode (keepdims) -- confirm against the installed version.

ModeResult(mode=array([[-0.65640569, -0.69563936, -1.25471621,  0.21124599]]), count=array([[1, 1, 1, 1]]))

In [26]:
# Correlation
np.corrcoef(datos) # builds the correlation matrix; the recorded 5x5 result shows each row is treated as a variable

array([[ 1.        ,  0.80823899,  0.88760285,  0.89224148,  0.2309958 ],
       [ 0.80823899,  1.        ,  0.61867875,  0.90898293, -0.38620801],
       [ 0.88760285,  0.61867875,  1.        ,  0.60136124,  0.37171287],
       [ 0.89224148,  0.90898293,  0.60136124,  1.        , -0.10691323],
       [ 0.2309958 , -0.38620801,  0.37171287, -0.10691323,  1.        ]])

In [27]:
# Correlation between two vectors (here the first two rows of datos).
np.corrcoef(datos[0], datos[1])

array([[1.        , 0.80823899],
       [0.80823899, 1.        ]])

In [28]:
# Covariance
np.cov(datos) # computes the covariance matrix; the recorded 5x5 result shows each row is treated as a variable

array([[ 0.87454414,  0.48574   ,  0.86959602,  0.90215543,  0.35285749],
       [ 0.48574   ,  0.41299654,  0.41653011,  0.63159259, -0.40541412],
       [ 0.86959602,  0.41653011,  1.09752893,  0.68116372,  0.63609207],
       [ 0.90215543,  0.63159259,  0.68116372,  1.16900447, -0.18881822],
       [ 0.35285749, -0.40541412,  0.63609207, -0.18881822,  2.66814099]])

In [29]:
# Covariance of two vectors (here the first two rows of datos).
np.cov(datos[0], datos[1])

array([[0.87454414, 0.48574   ],
       [0.48574   , 0.41299654]])

In [30]:
# Using pandas: wrap the same array in a DataFrame with labelled rows and columns.
dataframe = pd.DataFrame(
    datos,
    index=['a', 'b', 'c', 'd', 'e'],
    columns=['col1', 'col2', 'col3', 'col4'],
)
dataframe

Unnamed: 0,col1,col2,col3,col4
a,0.040726,-0.207523,-1.254716,1.022006
b,0.11412,-0.695639,-0.461661,0.742633
c,-0.656406,0.157592,-1.119444,1.270423
d,0.881261,-0.563057,-1.009732,1.208565
e,0.135928,2.880035,-0.96333,0.211246


In [31]:
# Statistical summary with pandas: per-column count, mean, std, min, quartiles and max.
# NOTE(review): the std recorded here is larger than the np.std(datos, 0) output
# for the same columns -- pandas reports the sample (ddof=1) estimate; confirm
# which one is wanted before comparing the two.
dataframe.describe()

Unnamed: 0,col1,col2,col3,col4
count,5.0,5.0,5.0,5.0
mean,0.103126,0.314282,-0.961777,0.890975
std,0.544856,1.472208,0.301285,0.431871
min,-0.656406,-0.695639,-1.254716,0.211246
25%,0.040726,-0.563057,-1.119444,0.742633
50%,0.11412,-0.207523,-1.009732,1.022006
75%,0.135928,0.157592,-0.96333,1.208565
max,0.881261,2.880035,-0.461661,1.270423


In [32]:
# Sum of each column
dataframe.sum()

col1    0.515629
col2    1.571408
col3   -4.808884
col4    4.454874
dtype: float64

In [33]:
# Sum of each row
dataframe.sum(axis=1)

a   -0.399507
b   -0.300548
c   -0.347835
d    0.517037
e    2.263879
dtype: float64

In [34]:
dataframe.cumsum() # cumulative (running) sum down each column

Unnamed: 0,col1,col2,col3,col4
a,0.040726,-0.207523,-1.254716,1.022006
b,0.154846,-0.903162,-1.716378,1.764639
c,-0.50156,-0.74557,-2.835822,3.035062
d,0.379702,-1.308628,-3.845554,4.243628
e,0.515629,1.571408,-4.808884,4.454874


In [35]:
# Arithmetic mean of each column with pandas
dataframe.mean()

col1    0.103126
col2    0.314282
col3   -0.961777
col4    0.890975
dtype: float64

In [36]:
# Arithmetic mean of each row with pandas
dataframe.mean(axis=1)

a   -0.099877
b   -0.075137
c   -0.086959
d    0.129259
e    0.565970
dtype: float64

## Prueba de datos discretos

In [20]:
# Small integer sample with repeated values, used for the discrete-data mode example.
datos2 = np.array([1, 2, 3, 6, 6, 1, 2, 4, 2, 2, 6, 6, 8, 10, 6])

In [24]:
datos2.sort() # sorts the array in place: datos2 itself is modified
datos2 # display the sorted values

array([ 1,  1,  2,  2,  2,  2,  3,  4,  6,  6,  6,  6,  6,  8, 10])

In [21]:
# Mode
stats.mode(datos2) # here the mode is 6 because it appears 5 times in the vector

ModeResult(mode=array([6]), count=array([5]))