In [None]:
# # Summarizing and Computing Descriptive Statistics

# pandas objects are equipped with a set of common mathematical and statistical meth
# ods. Most of these fall into the category of reductions or summary statistics, methods
# that extract a single value (like the sum or mean) from a Series or a Series of values from
# the rows or columns of a DataFrame. 

In [34]:
import numpy as np
import pandas as pd
from pandas import Series,DataFrame

df = DataFrame([[1.4, np.nan], [7.1, -4.5],[np.nan, np.nan], [0.75, -1.3]], index=['a', 'b', 'c', 'd'], columns=['one', 'two'])
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [35]:
print(df.sum())
df.sum(axis=1)

one    9.25
two   -5.80
dtype: float64


a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [36]:
df.mean(axis=1,skipna=False)

a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

In [37]:
#  Method     Description
#  axisAxis   to reduce over. 0 for DataFrame’s rows and 1 for columns.
#  skipna     Exclude missing values, True by default.
#  level      Reduce grouped by level if the axis is hierarchically-indexed (MultiIndex)

In [38]:
df.idxmin()  #returns the index containg min value
df.idxmax()  #returns the index containg the ax value

one    b
two    d
dtype: object

In [39]:
df.cumsum()  # cumulative summation

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [40]:
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [41]:
obj=Series(['a','a','b','c']*4)
obj.describe()

count     16
unique     3
top        a
freq       8
dtype: object

In [42]:
#  Method     Description
#  count      Number of non-NA values
#  describe   Compute set of summary statistics for Series or each DataFrame column
#  min,max    Compute minimum and maximum values
 
#  argmin,argmax   Compute index locations (integers) at which minimum or maximum value obtained, respectively
#  idxmin,idxmax   Compute index values at which minimum or maximum value obtained, respectively
#  quantile       Compute sample quantile ranging from 0 to 1
#  sum          Sum of values
#  mean         Mean of values
#  median        Arithmetic median (50% quantile) of values
#  mad            Mean absolute deviation from mean value
#  var           Sample variance of values
#  std             Sample standard deviation of values
#  skew         Sample skewness (3rd moment) of values
#  kurt            Sample kurtosis (4th moment) of values
#  cumsum        Cumulative sum of values
  
#  cummin,cummax   Cumulative minimum or maximum of values, respectively
#  cumprod         Cumulative product of values
#  diff            Compute 1st arithmetic difference (useful for time series)
#  pct_change       Compute percent changes

In [43]:
#  Correlation and Covariance  CHECK NOT WORKING 

# Some summary statistics, like correlation and covariance, are computed from pairs of
# arguments.

In [44]:
import pandas.io.data as web
all_data = {}
for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']:
    all_data[ticker] = web.get_data_yahoo(ticker, '1/1/2000', '1/1/2010')
    
price = DataFrame({tic: data['Adj Close']
                   for tic, data in all_data.iteritems()})
volume = DataFrame({tic: data['Volume']
                    for tic, data in all_data.iteritems()})

returns = price.pct_change()
returns.tail()

#Not Working

ModuleNotFoundError: No module named 'pandas.io.data'

In [45]:
# Unique Values, Value Counts, and Membership

In [46]:
# Another class of related methods extracts information about the values contained in a
# one-dimensional Series

In [47]:
Obj=Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
uniques=obj.unique()
uniques

array(['a', 'b', 'c'], dtype=object)

In [48]:
uniques.sort()
uniques

array(['a', 'b', 'c'], dtype=object)

In [50]:
# Relatedly, value_counts computes a Series containing value frequencies
obj.value_counts()

a    8
b    4
c    4
dtype: int64

In [51]:
# The Series is sorted by value in descending order as a convenience. value_counts is also
#  available as a top-level pandas method that can be used with any array or sequence
pd.value_counts(obj.values,sort=False)

a    8
b    4
c    4
dtype: int64

In [52]:
mask=obj.isin(['b','c'])
mask

0     False
1     False
2      True
3      True
4     False
5     False
6      True
7      True
8     False
9     False
10     True
11     True
12    False
13    False
14     True
15     True
dtype: bool

In [53]:
# Method         Description
# isin           Compute boolean array indicating whether each Series value is contained in the passed sequence of values.
# unique         Compute array of unique values in a Series, returned in the order observed.
# value_counts   Return a Series containing unique values as its index and frequencies as its values, ordered count indescending order.

In [54]:
# In some cases, you may want to compute a histogram on multiple related columns in
#  a DataFrame
data=DataFrame({
    'que1':[1,3,4,3,4],
    'que2':[2,3,1,2,3],
    'que3':[1,5,2,4,4]
})
data

Unnamed: 0,que1,que2,que3
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


In [None]:
# Passing pandas.value_counts to this DataFrame’s apply function gives

result=data.apply(pd.value_counts).fillna(0)
result

In [None]:
# Handling Missing Data

# Missing data is common in most data analysis applications. One of the goals in de
# signing pandas was to make working with missing data as painless as possible. For
#  example, all of the descriptive statistics on pandas objects exclude missing data as
#  you’ve seen earlier in the chapter.pandas uses the floating point value NaN (Not a Number) to represent missing data in
#  both floating as well as in non-floating point arrays. It is just used as a sentinel that can
#  be easily detected

In [57]:
string_data=Series(['aardvark','artichoke',np.nan,'avocado'])
print(string_data)
string_data.isnull()

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object


0    False
1    False
2     True
3    False
dtype: bool

In [58]:
# The built-in Python None value is also treated as NA in object arrays:
string_data[0]=None
string_data

0         None
1    artichoke
2          NaN
3      avocado
dtype: object

In [None]:
#  Argument    Description
#  dropna      Filter axis labels based on whether values for each label have missing data, with varying thresholds for how much missing data to tolerate.
#  fillna      Fill in missing data with some value or using an interpolation method such as 'ffill' or 'bfill'.
#  isnull      Return like-type object containing boolean values indicating which values are missing / NA.
#  notnull     Negation of isnull.


In [None]:
# Filtering Out Missing Data

# You have a number of options for filtering out missing data. While doing it by hand is
#  always an option, dropna can be very helpful. On a Series, it returns the Series with only
#  the non-null data and index values

In [63]:
from numpy import nan as NA

data=Series([1,NA,3.5,NA,7])
print(data)
data.dropna()

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64


0    1.0
2    3.5
4    7.0
dtype: float64

In [65]:
# Naturally, you could have computed this yourself by boolean indexing
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [69]:
# With DataFrame objects, these are a bit more complex. You may want to drop rows
#  or columns which are all NA or just those containing any NAs. dropna by default drops
#  any row containing a missing value

data=DataFrame([[1., 6.5, 3.], [1., NA, NA],[NA, NA, NA], [NA, 6.5, 3.]])
print(data)
cleaned=data.dropna()
cleaned

     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
2  NaN  NaN  NaN
3  NaN  6.5  3.0


Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [71]:
#  Passing how='all' will only drop rows that are all NA
print(data.dropna(how='all'))
data.dropna(how='all',axis=1)

     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
3  NaN  6.5  3.0


Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [75]:
# A related way to filter out DataFrame rows tends to concern time series data. Suppose
#  you want to keep only rows containing a certain number of observations. You can
#  indicate this with the thresh argument

df=DataFrame(np.random.randn(7,3))
df.ix[:4, 1] = NA; df.ix[:2, 2] = NA
df
df.dropna(thresh=3)

#Not Working

AttributeError: 'DataFrame' object has no attribute 'ix'

In [76]:
# Filling in Missing Data

# Rather than filtering out missing data (and potentially discarding other data along with
#  it), you may want to fill in the “holes” in any number of ways. For most purposes, the 
# fillna method is the workhorse function to use. Calling fillna with a constant replaces
#  missing values with that value

In [77]:
df.fillna(0)

Unnamed: 0,0,1,2
0,0.028319,0.483797,-0.191863
1,-2.03783,0.462302,1.234357
2,-0.827556,0.112301,0.109802
3,-0.337294,0.055946,1.085474
4,-0.987479,0.302529,-0.101032
5,-0.310166,-1.834797,0.855269
6,-0.780663,-0.397962,1.163425


In [79]:
# Calling fillna with a dict you can use a different fill value for each column
df.fillna({1:0.5,3:-1})

Unnamed: 0,0,1,2
0,0.028319,0.483797,-0.191863
1,-2.03783,0.462302,1.234357
2,-0.827556,0.112301,0.109802
3,-0.337294,0.055946,1.085474
4,-0.987479,0.302529,-0.101032
5,-0.310166,-1.834797,0.855269
6,-0.780663,-0.397962,1.163425


In [82]:
# fillna returns a new object, but you can modify the existing object in place
_ = df.fillna(0, inplace=True)
df

Unnamed: 0,0,1,2
0,0.028319,0.483797,-0.191863
1,-2.03783,0.462302,1.234357
2,-0.827556,0.112301,0.109802
3,-0.337294,0.055946,1.085474
4,-0.987479,0.302529,-0.101032
5,-0.310166,-1.834797,0.855269
6,-0.780663,-0.397962,1.163425


In [84]:
# The same interpolation methods available for reindexing can be used with fillna
df = DataFrame(np.random.randn(6, 3))
df.ix[2:, 1] = NA; df.ix[4:, 2] = NA
df  #Not Working

# o/p:  
#         0         1         2
#  0  0.286350  0.377984 -0.753887
#  1  0.331286  1.349742  0.069877
#  2  0.246674       NaN  1.004812
#  3  1.327195       NaN -1.549106
#  4  0.022185       NaN       NaN
#  5  0.862580       NaN       NaN

AttributeError: 'DataFrame' object has no attribute 'ix'

In [85]:
# In [257]: df.fillna(method='ffill')      In [258]: df.fillna(method='ffill', limit=2)
#  Out[257]:                                Out[258]:                                   
#           0         1         2                    0         1         2             
# 0  0.286350  0.377984 -0.753887          0  0.286350  0.377984 -0.753887             
# 1  0.331286  1.349742  0.069877          1  0.331286  1.349742  0.069877             
# 2  0.246674  1.349742  1.004812          2  0.246674  1.349742  1.004812             
# 3  1.327195  1.349742 -1.549106          3  1.327195  1.349742 -1.549106             
# 4  0.022185  1.349742 -1.549106          4  0.022185       NaN -1.549106             
# 5  0.862580  1.349742 -1.549106          5  0.862580       NaN -1.549106

In [89]:
# With fillna you can do lots of other things with a little creativity. For example, you
#  might pass the mean or median value of a Series
data=Series([1,NA,3.5,NA,7])
print(data.fillna(method='ffill'))
data.fillna(data.mean())

0    1.0
1    1.0
2    3.5
3    3.5
4    7.0
dtype: float64


0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

In [90]:
#  Argument    Description
#  value       Scalar value or dict-like object to use to fill missing values
#  method      Interpolation, by default 'ffill' if function called with no other arguments
#  axis        Axis to fill on, default axis=0
#  inplace     Modify the calling object without producing a copy
#  limit       For forward and backward filling, maximum number of consecutive periods to fill