# Pandas Cont...

# Descriptive Statistics

In [7]:
import pandas as pd
import numpy as np

# Build a dictionary mapping each column label to a pandas Series of its values.
# The (label, values) pairs are listed in the column order the DataFrame should have.
d = {
    label: pd.Series(values)
    for label, values in (
        ('Name', ['Tom', 'James', 'Ricky', 'Vin', 'Steve', 'Smith', 'Jack',
                  'Lee', 'David', 'Gasper', 'Betina', 'Andres']),
        ('Age', [25, 26, 25, 23, 30, 29, 23, 34, 40, 30, 51, 46]),
        ('Rating', [4.23, 3.24, 3.98, 2.56, 3.20, 4.6, 3.8, 3.78, 2.98, 4.80, 4.10, 3.65]),
    )
}

# Assemble the DataFrame from the dictionary of Series; the trailing bare
# expression makes the notebook display it.
df = pd.DataFrame(d)
df

Unnamed: 0,Name,Age,Rating
0,Tom,25,4.23
1,James,26,3.24
2,Ricky,25,3.98
3,Vin,23,2.56
4,Steve,30,3.2
5,Smith,29,4.6
6,Jack,23,3.8
7,Lee,34,3.78
8,David,40,2.98
9,Gasper,30,4.8


# sum()
Returns the sum of the values for the requested axis. By default, axis is index (axis=0).
Syntax 
--------- 
df.sum(axis=None, skipna=None, level=None, numeric_only=None, min_count=0, **kwargs)

In [5]:
# Column-wise totals: axis=0 (the index axis) is what sum() uses by default.
df.sum(axis=0)

Name      TomJamesRickyVinSteveSmithJackLeeDavidGasperBe...
Age                                                     382
Rating                                                44.92
dtype: object

In [8]:
# Sum of rows (axis=1).
# numeric_only=True restricts the sum to the numeric columns (Age, Rating).
# Without it, pandas >= 2.0 no longer drops the string 'Name' column silently
# and raises TypeError when it tries to add strings to numbers.
df.sum(axis=1, numeric_only=True)

0     29.23
1     29.24
2     28.98
3     25.56
4     33.20
5     33.60
6     26.80
7     37.78
8     42.98
9     34.80
10    55.10
11    49.65
dtype: float64

# mean()
Returns the average value
Syntax
======

 df.mean(axis=None, skipna=None, level=None, numeric_only=None, **kwargs)

In [9]:
# Mean of each column.
# numeric_only=True keeps only Age and Rating; pandas >= 2.0 raises TypeError
# for the string 'Name' column instead of silently skipping it.
df.mean(numeric_only=True)

Age       31.833333
Rating     3.743333
dtype: float64

In [11]:
# Mean of each row (axis=1), computed over the numeric columns only.
# The axis is passed by keyword for readability, and numeric_only=True avoids
# the TypeError pandas >= 2.0 raises when a string column is included.
df.mean(axis=1, numeric_only=True)

0     14.615
1     14.620
2     14.490
3     12.780
4     16.600
5     16.800
6     13.400
7     18.890
8     21.490
9     17.400
10    27.550
11    24.825
dtype: float64

# std()
Returns the sample standard deviation of the numerical columns. By default it applies Bessel's correction, i.e. it normalizes by N-1 (ddof=1).

Syntax
======

df.std(axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs)


In [13]:
# Sample standard deviation (ddof=1) of each column.
# numeric_only=True keeps only Age and Rating; pandas >= 2.0 raises TypeError
# for the string 'Name' column instead of silently skipping it.
df.std(numeric_only=True)

Age       9.232682
Rating    0.661628
dtype: float64

In [15]:
# Sample standard deviation (ddof=1) of each row (axis=1), over numeric columns only.
# The axis is passed by keyword for readability, and numeric_only=True avoids
# the TypeError pandas >= 2.0 raises when a string column is included.
df.std(axis=1, numeric_only=True)

0     14.686608
1     16.093750
2     14.863385
3     14.453263
4     18.950462
5     17.253405
6     13.576450
7     21.368767
8     26.177093
9     17.819091
10    33.163308
11    29.945972
dtype: float64

In [None]:
# Sr.No.	Function	Description
#1	count()	Number of non-null observations
#2	sum()	Sum of values
#3	mean()	Mean of Values
#4	median()	Median of Values
#5	mode()	Mode of values
#6	std()	Standard Deviation of the Values
#7	min()	Minimum Value
#8	max()	Maximum Value
#9	abs()	Absolute Value
#10	prod()	Product of Values
#11	cumsum()	Cumulative Sum
#12	cumprod()	Cumulative Product

# Summarizing Data
The describe() function computes a summary of statistics pertaining to the DataFrame columns.

Syntax
------
df.describe(percentiles=None, include=None, exclude=None)

percentiles : 
----------
list-like of numbers, optional
    The percentiles to include in the output. All should
    fall between 0 and 1. The default is
    ``[.25, .5, .75]``, which returns the 25th, 50th, and
    75th percentiles.
    
include :
--------
'all', list-like of dtypes or None (default), optional
    A white list of data types to include in the result. Ignored
    for ``Series``. Here are the options:

    - 'all' : All columns of the input will be included in the output.
    - A list-like of dtypes : Limits the results to the
      provided data types.
      To limit the result to numeric types submit
      ``numpy.number``. To limit it instead to object columns submit
      the ``numpy.object_`` data type (or the builtin ``object``; the ``numpy.object`` alias no longer exists in current NumPy). Strings
      can also be used in the style of
      ``select_dtypes`` (e.g. ``df.describe(include=['O'])``). To
      select pandas categorical columns, use ``'category'``
    - None (default) : The result will include all numeric columns.
exclude :
--------
    list-like of dtypes or None (default), optional,
    A black list of data types to omit from the result. Ignored
    for ``Series``. Here are the options:

    - A list-like of dtypes : Excludes the provided data types
      from the result. To exclude numeric types submit
      ``numpy.number``. To exclude object columns submit the data
      type ``numpy.object_`` (or the builtin ``object``). Strings can also be used in the style of
      ``select_dtypes`` (e.g. ``df.describe(exclude=['O'])``). To
      exclude pandas categorical columns, use ``'category'``
    - None (default) : The result will exclude nothing.

Returns
-------
summary:  Series/DataFrame of summary statistics



In [18]:
# Summary statistics for the numeric columns, with the default quartiles spelled out.
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,Age,Rating
count,12.0,12.0
mean,31.833333,3.743333
std,9.232682,0.661628
min,23.0,2.56
25%,25.0,3.23
50%,29.5,3.79
75%,35.5,4.1325
max,51.0,4.8


In [20]:
# Summary of the object (string) columns only: count, unique, top, freq.
# 'O' is the dtype string for object columns, as accepted by select_dtypes.
df.describe(include=['O'])

Unnamed: 0,Name
count,12
unique,12
top,Jack
freq,1


In [24]:
# Summary of every column regardless of dtype; statistics that do not apply
# to a column show up as NaN in the result.
df.describe(include='all', exclude=None)

Unnamed: 0,Name,Age,Rating
count,12,12.0,12.0
unique,12,,
top,Jack,,
freq,1,,
mean,,31.833333,3.743333
std,,9.232682,0.661628
min,,23.0,2.56
25%,,25.0,3.23
50%,,29.5,3.79
75%,,35.5,4.1325


# Iterating a DataFrame
Iterating directly over a DataFrame (for example `for col in df:`) yields its column names, not its rows. The functions below give access to the column or row data while iterating.

To iterate over the columns or the rows of the DataFrame, we can use the following functions −

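    iteritems() − to iterate over the columns as (key,value) pairs, i.e. (column label, Series); this method is named items() in current pandas (iteritems() was removed in pandas 2.0)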

    iterrows() − iterate over the rows as (index,series) pairs

    itertuples() − iterate over the rows as namedtuples



In [26]:
# items()
# Iterates over each column as a (key, value) pair, with the column label as
# key and the column values as a Series object.
# DataFrame.iteritems() was deprecated in pandas 1.5 and removed in 2.0;
# items() is the equivalent method and yields the same pairs.
for key, value in df.items():
    print(key, value)

Name 0        Tom
1      James
2      Ricky
3        Vin
4      Steve
5      Smith
6       Jack
7        Lee
8      David
9     Gasper
10    Betina
11    Andres
Name: Name, dtype: object
Age 0     25
1     26
2     25
3     23
4     30
5     29
6     23
7     34
8     40
9     30
10    51
11    46
Name: Age, dtype: int64
Rating 0     4.23
1     3.24
2     3.98
3     2.56
4     3.20
5     4.60
6     3.80
7     3.78
8     2.98
9     4.80
10    4.10
11    3.65
Name: Rating, dtype: float64


In [27]:
# iterrows()
# Yields one (index label, Series holding that row's values) pair per row.
for index_and_row in df.iterrows():
    # Unpack the pair so the label and the row are printed space-separated
    print(*index_and_row)

0 Name       Tom
Age         25
Rating    4.23
Name: 0, dtype: object
1 Name      James
Age          26
Rating     3.24
Name: 1, dtype: object
2 Name      Ricky
Age          25
Rating     3.98
Name: 2, dtype: object
3 Name       Vin
Age         23
Rating    2.56
Name: 3, dtype: object
4 Name      Steve
Age          30
Rating      3.2
Name: 4, dtype: object
5 Name      Smith
Age          29
Rating      4.6
Name: 5, dtype: object
6 Name      Jack
Age         23
Rating     3.8
Name: 6, dtype: object
7 Name       Lee
Age         34
Rating    3.78
Name: 7, dtype: object
8 Name      David
Age          40
Rating     2.98
Name: 8, dtype: object
9 Name      Gasper
Age           30
Rating       4.8
Name: 9, dtype: object
10 Name      Betina
Age           51
Rating       4.1
Name: 10, dtype: object
11 Name      Andres
Age           46
Rating      3.65
Name: 11, dtype: object


In [28]:
# itertuples()
# Yields one namedtuple per row. The first field is the row's index value
# (index=True) and the remaining fields are the row's column values; the
# tuple type is called 'Pandas', which is the default name.
for record in df.itertuples(index=True, name='Pandas'):
    print(record)


Pandas(Index=0, Name='Tom', Age=25, Rating=4.23)
Pandas(Index=1, Name='James', Age=26, Rating=3.24)
Pandas(Index=2, Name='Ricky', Age=25, Rating=3.98)
Pandas(Index=3, Name='Vin', Age=23, Rating=2.56)
Pandas(Index=4, Name='Steve', Age=30, Rating=3.2)
Pandas(Index=5, Name='Smith', Age=29, Rating=4.6)
Pandas(Index=6, Name='Jack', Age=23, Rating=3.8)
Pandas(Index=7, Name='Lee', Age=34, Rating=3.78)
Pandas(Index=8, Name='David', Age=40, Rating=2.98)
Pandas(Index=9, Name='Gasper', Age=30, Rating=4.8)
Pandas(Index=10, Name='Betina', Age=51, Rating=4.1)
Pandas(Index=11, Name='Andres', Age=46, Rating=3.65)


# Percent_change

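Series and DataFrames both have the function pct_change() (Panel, which also had it, has been removed from recent pandas versions). This function compares every element with its prior element and computes the relative change as a fraction (for example, 0.5 means an increase of 50%). The first element has no prior element, so its result is NaN.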

In [30]:
# Series pct_change: change of each element relative to the one before it
# (periods=1 is the default); the first element has no predecessor, so it is NaN.
s = pd.Series(data=[1, 2, 3, 4, 5, 4])
s.pct_change(periods=1)

0         NaN
1    1.000000
2    0.500000
3    0.333333
4    0.250000
5   -0.200000
dtype: float64

In [32]:
# DataFrame pct_change on a 5x2 frame of standard-normal samples.
# NOTE: this rebinds `df` (previously the Name/Age/Rating frame) and the
# samples are unseeded, so the displayed numbers change on every run.
df = pd.DataFrame(data=np.random.randn(5, 2))
df.pct_change(periods=1)

Unnamed: 0,0,1
0,,
1,-1.242789,-1.520785
2,-4.816129,17.267569
3,-2.32608,-0.87898
4,-1.233136,-6.987424


# Covariance
Covariance is applied on series data. The Series object has a method cov to compute covariance between series objects. NA will be excluded automatically.

In [33]:
# Covariance between two Series, each holding 10 standard-normal samples.
# The generator is consumed left to right, so s1 is drawn before s2.
s1, s2 = (pd.Series(np.random.randn(10)) for _ in range(2))
s1.cov(s2)

0.3429010241704238

In [36]:
# Covariance on a DataFrame: 10 rows x 5 columns ('a'..'e') of standard-normal samples
frame = pd.DataFrame(data=np.random.randn(10, 5), columns=list('abcde'))
# Covariance between the two columns 'a' and 'b'
col_a, col_b = frame['a'], frame['b']
col_a.cov(col_b)

0.25026234634672384

In [38]:
# Pairwise covariance matrix of all columns in the frame
# (min_periods=None is the default: no minimum number of observations is required).
frame.cov(min_periods=None)

Unnamed: 0,a,b,c,d,e
a,0.822972,0.250262,-0.111752,-0.118694,-0.421002
b,0.250262,1.095099,-0.082299,-0.142705,0.06194
c,-0.111752,-0.082299,1.661717,-0.582318,-0.257662
d,-0.118694,-0.142705,-0.582318,0.602854,0.437287
e,-0.421002,0.06194,-0.257662,0.437287,1.431943


# Correlation
Correlation shows the linear relationship between any two arrays of values (series). There are multiple methods to compute the correlation: pearson (the default), spearman and kendall.

In [39]:
# Fresh 10x5 frame of standard-normal samples with columns 'a'..'e'
frame = pd.DataFrame(data=np.random.randn(10, 5), columns=list('abcde'))
# Pearson correlation (the default method) between columns 'a' and 'b'
frame['a'].corr(frame['b'], method='pearson')

-0.3911914153807798

In [41]:
# Pairwise Pearson correlation matrix of all columns in the frame
frame.corr(method='pearson')

Unnamed: 0,a,b,c,d,e
a,1.0,-0.391191,0.106634,-0.130439,-0.143895
b,-0.391191,1.0,0.318194,-0.328414,-0.268678
c,0.106634,0.318194,1.0,-0.310535,-0.153756
d,-0.130439,-0.328414,-0.310535,1.0,-0.250453
e,-0.143895,-0.268678,-0.153756,-0.250453,1.0
