# Pandas descriptive statistics

In [1]:
import pandas as pd
import numpy as np

### Create a 100x4 dataframe of random integers from 0 to 399 (400 is excluded), drawn from the discrete uniform distribution

In [2]:
# Build a 100-row, 4-column frame of uniformly distributed random integers in [0, 400)
df = pd.DataFrame(
    data=np.random.randint(low=0, high=400, size=(100, 4)),
    columns=['A', 'B', 'C', 'D'],
)
# head() displays the first 5 rows by default; pass a different 'n' to change how many are shown
df.head(n=5)

Unnamed: 0,A,B,C,D
0,280,92,30,279
1,305,340,311,331
2,160,186,41,49
3,381,118,367,13
4,33,15,248,227


### Compute a set of summary statistics for each dataframe column

In [3]:
# describe() summarises every column: count, mean, std, min, the 25%/50%/75% quartiles and max
df.describe()

Unnamed: 0,A,B,C,D
count,100.0,100.0,100.0,100.0
mean,203.99,205.97,195.63,200.07
std,112.443253,109.736084,114.594925,115.899414
min,1.0,15.0,5.0,5.0
25%,123.75,111.0,98.75,96.75
50%,208.0,194.5,201.5,215.0
75%,295.75,288.5,282.25,287.0
max,394.0,394.0,399.0,388.0


### Return a series of the highest number from each column

In [4]:
# Reduce down the rows (axis 'index' is the same as axis=0) to get one maximum per column
df.max(axis='index')

A    394
B    394
C    399
D    388
dtype: int32

### Return the index label of the maximum value in each column

In [5]:
# idxmax(axis=0) returns, for each column, the index label of the row holding that column's maximum.
# Use axis=1 to search across each row instead; idxmin is the counterpart for the lowest value.
highest = df.idxmax(axis=0)
# 'highest' is indexed by the column labels, so select column A's entry by label.
# Using an integer key such as highest[0] as a position on a label-indexed Series is deprecated in pandas.
highestA = highest['A']
highest

A    24
B    85
C     7
D    70
dtype: int64

In [6]:
# Fetch the maximum of column A using the row label found by idxmax.
# .loc does label-based selection; the old .ix indexer was removed in pandas 1.0.
df.loc[highestA, 'A']

394

### Sum Columns

In [7]:
# Add up the values in each column (axis=0 is the default for DataFrame.sum)
df.sum(axis=0)

A    20399
B    20597
C    19563
D    20007
dtype: int64

In [8]:
# Sum across the columns to get one total per row, then preview the first five totals
df.sum(axis='columns').head()

0     681
1    1287
2     436
3     879
4     523
dtype: int64

### Other useful methods:

<ul>
<li><b>count</b> - Number of non-NA values</li>
<li><b>mean</b> - Mean of values</li>
<li><b>mad</b> - Mean absolute deviation from mean value (deprecated in pandas 1.5 and removed in 2.0)</li>
<li><b>cumsum</b> - Cumulative sum of values</li>
<li><b>pct_change</b> - Compute percent changes</li>
</ul>

### On non-numerical data, the describe method produces alternate summary statistics

In [9]:
# A single-column frame of strings: the four words repeated four times (16 rows)
string_data = pd.DataFrame(
    ["This", "Is", "A", "Test"] * 4
)
# For object (string) columns, describe() reports count, unique, top and freq
string_data.describe()

Unnamed: 0,0
count,16
unique,4
top,Test
freq,4
