# Summarizing & Computing Descriptive Statistics

In [1]:
import pandas as pd
import numpy as np 

In [2]:
# Small demo frame with deliberately missing values: column "one" lacks a
# value in row "c", column "two" lacks values in rows "a" and "c".
df = pd.DataFrame(
    data={
        "one": [2.4, 6.3, np.nan, 0.75],
        "two": [np.nan, -5.4, np.nan, -1.3],
    },
    index=list("abcd"),
)
df

Unnamed: 0,one,two
a,2.4,
b,6.3,-5.4
c,,
d,0.75,-1.3


In [3]:
# Sum down each column (one total per column); NaN entries are skipped.
df.sum(axis="index")

one    9.45
two   -6.70
dtype: float64

In [4]:
# Sum across the columns (one total per row); NaN entries are skipped, so a
# row made up entirely of NaN sums to 0.
df.sum(axis="columns")

a    2.40
b    0.90
c    0.00
d   -0.55
dtype: float64

In [5]:
# Row-wise mean over the values that are present; a row with no valid
# values at all yields NaN.
df.mean(axis="columns")

a    2.400
b    0.450
c      NaN
d   -0.275
dtype: float64

In [6]:
# Row-wise mean without NaN skipping: a single missing value in a row makes
# that row's mean NaN.
df.mean(axis="columns", skipna=False)

a      NaN
b    0.450
c      NaN
d   -0.275
dtype: float64

In [7]:
# Index label at which each column reaches its maximum (NaN entries ignored).
df.idxmax(axis="index")

one    b
two    d
dtype: object

In [8]:
# Index label at which each column reaches its minimum (NaN entries ignored).
df.idxmin(axis="index")

one    d
two    b
dtype: object

In [9]:
# Running total down each column. Positions holding NaN stay NaN in the
# result, while the accumulation continues past them.
df.cumsum(axis="index")

Unnamed: 0,one,two
a,2.4,
b,8.7,-5.4
c,,
d,9.45,-6.7


In [10]:
# Per-column summary statistics: count, mean, std, min, quartiles and max.
# Missing values are excluded, so `count` differs between the two columns.
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.15,-3.35
std,2.85,2.899138
min,0.75,-5.4
25%,1.575,-4.375
50%,2.4,-3.35
75%,4.35,-2.325
max,6.3,-1.3


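To compute correlation coefficients, let's first load the famous Iris data set. You can download the Iris data set from https://archive.ics.uci.edu/ml/datasets/iris; save the `iris.data` file in a `Data/` folder next to this notebook so that the next cell can read it.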

In [11]:
# Read the comma-separated iris file. header=None tells pandas the file has
# no header row, so the columns receive integer labels for now.
iris = pd.read_csv("Data/iris.data", header=None, sep=",")

In [12]:
# Peek at the first five rows of the freshly loaded frame.
iris.head(n=5)

Unnamed: 0,0,1,2,3,4
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [13]:
# Replace the integer column labels with descriptive names, in file order.
iris.columns = [
    "sepal_length",
    "sepal_width",
    "petal_length",
    "petal_width",
    "class",
]

In [14]:
# Show the first five rows again to confirm the column labels.
iris.head(n=5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [15]:
# Pearson correlation coefficient between sepal length and sepal width.
iris["sepal_length"].corr(iris["sepal_width"], method="pearson")

-0.10936924995064938

In [16]:
# Pairwise correlation matrix of the numeric measurement columns (pandas'
# default method, Pearson).
# The string-valued `class` column is filtered out explicitly: since
# pandas 2.0, DataFrame.corr() no longer drops non-numeric columns silently
# and raises on them instead. select_dtypes works on old and new versions.
iris.select_dtypes(include="number").corr()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
sepal_length,1.0,-0.109369,0.871754,0.817954
sepal_width,-0.109369,1.0,-0.420516,-0.356544
petal_length,0.871754,-0.420516,1.0,0.962757
petal_width,0.817954,-0.356544,0.962757,1.0


In [17]:
# Pairwise covariance matrix of the numeric measurement columns.
# The string-valued `class` column is filtered out explicitly: since
# pandas 2.0, DataFrame.cov() no longer drops non-numeric columns silently
# and raises on them instead. select_dtypes works on old and new versions.
iris.select_dtypes(include="number").cov()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
sepal_length,0.685694,-0.039268,1.273682,0.516904
sepal_width,-0.039268,0.188004,-0.321713,-0.117981
petal_length,1.273682,-0.321713,3.113179,1.296387
petal_width,0.516904,-0.117981,1.296387,0.582414


In [18]:
# Correlation of every numeric column with petal_length.
# Only numeric columns are passed in: since pandas 2.0, corrwith() no longer
# skips non-numeric columns such as `class` by default. Bracket access is
# used for the column so it cannot be confused with a DataFrame attribute.
iris.select_dtypes(include="number").corrwith(iris["petal_length"])

sepal_length    0.871754
sepal_width    -0.420516
petal_length    1.000000
petal_width     0.962757
dtype: float64

In [19]:
# Categorical-style sample data: four "b", then two "c", then three "a".
s = pd.Series(["b"] * 4 + ["c"] * 2 + ["a"] * 3)
s

0    b
1    b
2    b
3    b
4    c
5    c
6    a
7    a
8    a
dtype: object

In [20]:
# Distinct values of the Series, in order of first appearance.
s.unique()

array(['b', 'c', 'a'], dtype=object)

In [21]:
# Number of occurrences of each distinct value, most frequent first.
s.value_counts()

b    4
a    3
c    2
dtype: int64

In [22]:
# Boolean mask: True where the element of `s` is "b" or "c", False otherwise.
x=s.isin(["b","c"])
x

0     True
1     True
2     True
3     True
4     True
5     True
6    False
7    False
8    False
dtype: bool

In [23]:
# Keep only the elements of `s` where the boolean mask `x` is True.
s.loc[x]

0    b
1    b
2    b
3    b
4    c
5    c
dtype: object