# Pandas - Python Data Analysis

## Upgrade PIP

Windows

    python -m pip install -U pip

Linux
   
    pip install -U pip

### Install Packages if required

In [1]:
!pip install wheel



## Importing required libraries

In [None]:
%matplotlib inline
from pandas import *
from pylab import *
import pandas
import numpy as np
import matplotlib.pyplot as plt

def side_by_side(*objs, **kwds):
    """Print the reprs of several objects next to each other, one per column.

    Parameters
    ----------
    *objs
        Objects to display; the ``repr`` of each one is split into lines
        and becomes one column of the printed output.
    **kwds
        ``space`` (int, default 4): number of blanks between adjacent columns.

    Returns
    -------
    None. The joined text is written to standard output.
    """
    # ``adjoin`` lives in different modules depending on the pandas release;
    # try the current location first and fall back to the historical one.
    try:
        from pandas.io.formats.printing import adjoin
    except ImportError:
        from pandas.core.common import adjoin
    space = kwds.get('space', 4)
    reprs = [repr(obj).split('\n') for obj in objs]
    # Call print as a function so the helper runs on Python 3 as well as 2.
    print(adjoin(space, *reprs))

# Default size applied to every figure drawn later in the notebook.
plt.rcParams['figure.figsize'] = (10, 6)
# pandas.set_printoptions(notebook_repr_html=False)

In [None]:
# Plot five sample values as a single line.
plt.plot([1,2,4,8,16])

Series
======

In [None]:
# Five random samples from NumPy; no seed is set, so the values differ on every run.
np.random.randn(5)

In [None]:
# Build a Series of five random values labelled 'a'..'e'.
# The notebook imports the library as `pandas` (no `pd` alias is defined),
# so the constructor is reached through that name.
labels = ['a', 'b', 'c', 'd', 'e']
s = pandas.Series(np.random.randn(5), index=labels)

In [None]:
# Membership test: is 'b' one of the labels of s?
'b' in s

In [None]:
# Look up a single value by its label.
s['b']

In [None]:
# The labels attached to the values.
s.index

In [None]:
s

In [None]:
# Convert the Series to a plain dict via to_dict() and display it.
mapping = s.to_dict()
mapping

In [None]:
# Rebuild the Series from the dict with an explicit index. 'c' is left out of
# the index, and 'f' has no entry in `mapping`, so 'f' shows up as NaN.
# `pandas` is the name the notebook imports (no `pd` alias is defined).
s = pandas.Series(mapping, index=['b', 'e', 'a', 'd', 'f'])
s

In [None]:
notnull(s)

In [None]:
s[notnull(s)]

In [None]:
s.dropna()

In [None]:
s * 2

In [None]:
s[3:]

In [None]:
s[:3]

In [None]:
s.index

DataFrame: 2D collection of Series
==================================

In [None]:
# Six-row frame: two random float columns around a repeating text column.
df = DataFrame(dict(
    a=np.random.randn(6),
    b=3 * ['foo', 'bar'],
    c=np.random.randn(6),
))
# A fourth column is attached afterwards by plain assignment.
df['d'] = range(6)
df

In [None]:
df['b']    # select a single column by name

In [None]:
# Slicing with [] works on rows: the first 3 rows.
df[:3]

In [None]:
# Every row except the last 2.
df[:-2]

In [None]:
df[-2:]   # the last 2 rows

In [None]:
# A list of names selects several columns at once.
df[['a','b']]

In [None]:
np.tile(['foo', 'bar'], 3)   # the NumPy way to build the repeating 'foo'/'bar' pattern

In [None]:
df.xs(0)   # cross-section for index label 0, i.e. the first row

In [None]:
df.ix[2]    # does equal thing as above. In this case, 3rd row. 

In [None]:
df.ix[2, 'b']

In [None]:
timeit df.ix[1]

In [None]:
df.get_value(2, 'b')

In [None]:
timeit df.ix[2, 'b']

In [None]:
timeit df.get_value(2, 'b')

In [None]:
df.ix[2:4, 'b']

In [None]:
df.ix[2:4, ['b', 'c']]

In [None]:
df.ix[2:4, 'b':'c']   # slices by column - essentially the same as above.

In [None]:
df.ix[2:4, 0:2]

In [None]:
df.ix[[0, 2, 4], ['b', 'c', 'd']]   # pass a list of rows and columns I want to select out

In [None]:
df['c'] > 0

In [None]:
df.ix[df['c'] > 0]   # boolean arrays

In [None]:
# Row labels of the frame.
df.index

In [None]:
# Column labels of the frame.
df.columns

In [None]:
# Six dates starting at 1/1/2000.
pandas.date_range('1/1/2000',periods=6)

In [None]:
df = DataFrame({'a': np.random.randn(6),
                'b': ['foo', 'bar'] * 3,
                'c': np.random.randn(6)},
               index=pandas.date_range('1/1/2000', periods=6))
df

In [None]:
df = DataFrame({'a': np.random.randn(6),
                'b': ['foo', 'bar'] * 3,
                'c': np.random.randn(6)},
               columns=['a', 'b', 'c', 'd'])
df

In [None]:
isnull(df)

Creation from nested dicts
--------------------------

These arise naturally in Python code

In [None]:
# One inner dict per column name, keyed by row label, each holding a random draw.
data = {}
for col in ('foo', 'bar', 'baz'):
    data[col] = {}
    for row in 'abcd':
        data[col][row] = np.random.randn()   # sorted cols: bar, baz, foo
data

In [None]:
del data ['foo']['c']   # remove the 'c' entry from the 'foo' inner dict, leaving the nesting ragged

In [None]:
DataFrame(data)   # build a frame straight from the nested dict

Data alignment
==============

In [None]:
# Load the price table; the first CSV column becomes the index and is parsed as dates.
close_px = read_csv('stock_data.csv', index_col=0, parse_dates=True)

In [None]:
# Peek at the first 10 lines of the raw file (shell command; needs a `head` executable).
!head -n 10 stock_data.csv

In [None]:
# Same shell command without an explicit line count.
!head  stock_data.csv

In [None]:
close_px

In [None]:
s1 = close_px['AAPL'][-20:]      # the last 20 rows
s2 = close_px['AAPL'][-25:-10]   # the last 25 - last 10 = 15 rows.
side_by_side(s1, s2)

In [None]:
s1 + s2

In [None]:
s1.add(s2, fill_value=0)

In [None]:
(s1 + s2).dropna()

In [None]:
df = close_px.ix[-10:, :3]
df

In [None]:
side_by_side(s1.reindex(s2.index), s2)   # align indexes of s1 and s2. Print 2 objects alongside each other.

In [None]:
side_by_side(s1.ix[s2.index], s2)   # same as above, supported in newer version of 0.12 ipython.

In [None]:
b, c  = s1.align(s2, join='inner')
side_by_side(b, c)

In [None]:
b, c  = s1.align(s2, join='outer')
side_by_side(b, c)

b, c  = s1.align(s2, join='right')
side_by_side(b, c)

In [None]:
df = close_px.ix[-10:, ['AAPL', 'IBM', 'MSFT']]
df

In [None]:
df2 = df.ix[::2, ['IBM', 'MSFT']]
side_by_side(df, df2)

In [None]:
df + df2

In [None]:
b, c = df.align(df2, join='inner')
side_by_side(b, c) 

Transposing: no copy if all columns are same type
-------------------------------------------------

In [None]:
# Transpose the first five rows: rows and columns swap places.
df[:5].T

Columns can be any type
-----------------------

In [None]:
# Build a frame whose columns each hold a different kind of value.
n = 10
foo = DataFrame(index=range(n))
foo['floats'] = np.random.randn(n)
foo['ints'] = np.arange(n)
# Floor division: a list can only be repeated an integer number of times, and
# `n / 2` is a float on Python 3.
foo['strings'] = ['foo', 'bar'] * (n // 2)
foo['bools'] = foo['floats'] > 0
foo['objects'] = pandas.date_range('1/1/2000', periods=n)
foo

In [None]:
# The dtype of each column of foo.
foo.dtypes

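N.B. Transposing does not round-trip in this case. A DataFrame stores its data column by column, so transposing columns of mixed types forces them into a common type, and transposing back does not restore the original dtypes.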

In [None]:
# Transpose twice to get back a frame of the original shape.
foo.T.T

In [None]:
# Compare with foo.dtypes to see what the round trip did to the column types.
foo.T.T.dtypes

Function application
====================

You can apply arbitrary functions to the rows or columns of a DataFrame

In [None]:
df

In [None]:
df.apply(np.mean)

In [None]:
df.mean()   # same as above.   df.mean? to look up the function usage.

In [None]:
df.mean(1)   # get the mean of the rows based on a column. Also excludes missing data (NaN: not a number).

In [None]:
df.mean(1, skipna=False)   # flag skipna: to include NaN. In this data, no NaN.

In [None]:
df.apply(np.mean, axis=1)   # same as above.

You can get as fancy as you want

In [None]:
close_px    # Output: 1000 rows.

In [None]:
close_px.AAPL   # Attribute access to a column; after the dot, Tab completion offers the names.

In [None]:
close_px.AAPL.idxmax() # index label at which the column reaches its maximum value.

In [None]:
# The index label at position 1.
close_px.AAPL.index[1]

In [None]:
#close_px.AAPL.index[close_px.AAPL.argmax()]    # If you are numpy aficionado, does the same thing.

In [None]:
# NOTE(review): a function definition follows in the same cell, so this value
# is computed but not displayed.
close_px.AA.idxmax()

def peak_date(series):
    """Return the index label at which ``series`` reaches its maximum."""
    peak_label = series.idxmax()
    return peak_label

In [None]:
close_px.apply(peak_date)

In [None]:
for column in close_px:
    print close_px[column].max()

In [None]:
# peak_date (a thin wrapper around series.idxmax()) is already defined in an
# earlier cell; the identical redefinition that used to sit here only shadowed
# it. axis=0 applies the function to each column.
close_px.apply(peak_date, axis=0)

In [None]:
# apply also accepts an inline function; this one returns each column's maximum.
df.apply(lambda x: x.max())

In [None]:
df.apply(lambda x: x.max() - x.min()) # spread (max - min), the quantity np.ptp computes

In [None]:
# A NumPy function called directly on the whole frame.
np.log(close_px)

Hierarchical indexing
------------------------

In [None]:
# Two-level row index. Each integer in `codes` picks an entry from the matching
# list in `levels`. (`codes` is the current name of the argument that older
# pandas releases called `labels`; the old name was removed in pandas 1.0.)
index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                           ['one', 'two', 'three']],
                   codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                          [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]])
hdf = DataFrame(np.random.randn(10, 3), index=index,
                columns=['A', 'B', 'C'])
hdf

In [None]:
hdf.ix['foo']

In [None]:
hdf.ix['foo'] = 0
hdf

In [None]:
hdf.ix['bar','one']['A']

Stacking and unstacking
-----------------------

In [None]:
# Pair up the two label lists. On Python 3 zip() is a one-shot iterator, so it
# is materialised into a list; `tuples` can then be inspected or reused after
# the index has been built.
tuples = list(zip(*[['bar', 'bar', 'baz', 'baz',
                     'foo', 'foo', 'qux', 'qux'],
                    ['one', 'two', 'one', 'two',
                     'one', 'two', 'one', 'two']]))
index = MultiIndex.from_tuples(tuples)
columns = MultiIndex.from_tuples([('A', 'cat'), ('B', 'dog'),
                                  ('B', 'cat'), ('A', 'dog')])
# Frame with a two-level index on both the rows and the columns.
df = DataFrame(np.random.randn(8, 4), index=index, columns=columns)
df

In [None]:
df2 = df.ix[[0, 1, 2, 4, 5, 7]]
df2

In [None]:
df.unstack()['B']

GroupBy
=======

In [None]:
# Two text key columns (A, B) plus two random numeric columns (C, D), eight rows.
df = DataFrame(dict(
    A=['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
    B=['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
    C=np.random.randn(8),
    D=np.random.randn(8),
))
df

In [None]:
for key, group in df.groupby('A'):
    print key
    print group

In [None]:
df.groupby('A')['C'].describe().T

In [None]:
df.groupby('A').mean()

In [None]:
for key, group in df.groupby('A'):
    print key
    print group

In [None]:
df.groupby(['A', 'B']).mean()

In [None]:
#df.groupby(['A', 'B'], as_index=False).mean()

In [None]:
df.stack()

In [None]:
#df.stack().mean(1).unstack()

In [None]:
#df.groupby(level=1, axis=1).mean()