# Pandas


In [None]:
import numpy as np
import pandas as pd
import matplotlib
from  matplotlib import pyplot
import sys

%matplotlib inline

In [None]:
# Report the interpreter and pandas versions this notebook is running on.
# print() joins its arguments with a single space, giving the same text
# as concatenating onto a string that ends in a space.
print('Python version', sys.version)
print('Pandas version', pd.__version__)

### Let's start with the basics

In [None]:
# A DataFrame can be built in many ways; here the data is a mapping of
# {column name: column values}.
np.random.seed(42)  # seed first so the random column 'C' is reproducible
df = pd.DataFrame(
    data=dict(
        A=[1, 2, 3],
        B=[True, True, False],
        C=np.random.randn(3),
    ),
    index=list('abc'),  # explicit row labels 'a', 'b', 'c'
)
df

## Selecting
Our first improvement over NumPy arrays is labeled indexing. We can select subsets by column, row, or both. Column selection uses Python's regular `[]` indexing syntax. Pass in a single column label `'A'` or a list of labels `['A', 'C']` to select subsets of the original DataFrame.

In [None]:
# Single column, reduces to a Series
df['A']

In [None]:
cols = ['A', 'C']
df[cols]

For row-wise selection, use the special .loc accessor.

*Purely label-location based indexer for selection by label.*

In [None]:
df.loc[['a', 'b']]


When your index labels are ordered, you can use ranges to select rows or columns.

In [None]:
df.loc['a':'b']

In [None]:
df.loc['a':'c']

Notice that a `.loc` slice is inclusive on both ends, unlike the usual slicing of a Python list. Sometimes you'd rather slice by position instead of by label. `.iloc` has you covered; its slices follow the usual Python convention and exclude the end position:

*Purely integer-location based indexing for selection by position.*

In [None]:
df.iloc[0:2]

In [None]:
df.loc['a':'b', ['A', 'C']]

In [None]:
df['A']

In [None]:
df.loc[:, 'A']

In [None]:
df.index

### Read CSV files

In [None]:
pd.read_csv?

In [None]:
# Load the Apple price history; the 'Date' column is parsed as dates and
# becomes the row index.
# NOTE(review): squeeze() only collapses axes of length 1, so it should be a
# no-op for a multi-row, multi-column file — confirm it is still wanted.
apple_stocks = pd.read_csv(
    r'Data/AAPL.csv',
    parse_dates=['Date'],
    index_col='Date',
).squeeze()

In [None]:
apple_stocks

In [None]:
apple_stocks.describe()

In [None]:
apple_stocks.Open > 150.0

In [None]:
apple_stocks[apple_stocks.Open > 150.0].head()

Notice that we just used `[]` there. We can pass the boolean indexer into `.loc` as well, which also lets us pick columns in the same step.

In [None]:
apple_stocks.axes

In [None]:
apple_stocks.loc[apple_stocks.Open > 150.0, ['High', 'Volume']].head(n=10)

In [None]:
# Keep rows with heavy volume, or with both a high open and a high close.
# & and | work element-wise on the boolean Series, so every comparison
# needs its own parentheses.
apple_stocks[
    (apple_stocks['Volume'] >= 22966400)
    | ((apple_stocks['Open'] > 150) & (apple_stocks['Close'] > 156.100006))
]

In [None]:
df = apple_stocks[['Open', 'Close', 'Volume']]
df.head()

In [None]:
apple_stocks[['Open']].hist()

In [None]:
def plot_by(dataset, column='Open', bins_count=10):
    """Draw a histogram of one column of ``dataset``.

    Parameters
    ----------
    dataset : object indexable by column label (e.g. a DataFrame)
        Data to plot; ``dataset[column]`` must provide ``.hist(bins=...)``.
    column : str, default 'Open'
        Label of the column to plot; also used in the plot title.
    bins_count : int, default 10
        Number of histogram bins.

    Returns
    -------
    None
        The histogram is drawn as a side effect.
    """
    # Plot the data that was passed in rather than the notebook-level
    # ``apple_stocks`` global, so the ``dataset`` argument is honoured.
    dataset[column].hist(bins=bins_count)

    # Plot settings.
    pyplot.title('%s Values' % column)
    pyplot.ylabel('N')

# Interactive front-end for plot_by; requires the third-party ipywidgets
# package.
from ipywidgets import interact, fixed
# NOTE(review): this relies on ipywidgets' widget abbreviations — fixed(...)
# is expected to pass the value through without creating a control, a list
# to render as a dropdown and an integer (min, max) tuple as a slider.
# Confirm against the installed ipywidgets version.
interact(
    plot_by,
    dataset=fixed(apple_stocks),
    # One choice per column of the loaded data.
    column=apple_stocks.columns.tolist(),
    # Range of bin counts offered, 5 to 50.
    bins_count=(5,50)
);  # trailing semicolon: presumably suppresses the cell's output repr