# Python for Data Science


### Pandas 

In [2]:
import pandas as pd
pd.__version__

'0.23.0'

In [3]:
import numpy as np
import matplotlib.pyplot as plt

### Object Creation

Creating a `Series` by passing a list of values, letting pandas create a default integer index:

In [None]:
s = pd.Series([1,3,5,np.nan,6,8])

In [None]:
s

Creating a `DataFrame` by passing a NumPy array, with a datetime index and labeled columns:

In [4]:
dates = pd.date_range('20180101', periods=6)

In [5]:
dates

DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
               '2018-01-05', '2018-01-06'],
              dtype='datetime64[ns]', freq='D')

In [6]:
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))

In [7]:
df

Unnamed: 0,A,B,C,D
2018-01-01,0.476278,0.978263,-0.98864,-0.413071
2018-01-02,1.495812,0.262722,0.238888,-0.563323
2018-01-03,0.32463,1.254385,0.294734,-0.574882
2018-01-04,1.417684,0.523339,1.26308,2.302291
2018-01-05,1.627111,1.163846,-0.576795,0.873408
2018-01-06,0.556139,-0.021522,-0.801073,0.788913


Creating a `DataFrame` by passing a dict of objects that can be converted to series-like.

In [35]:
df2 = pd.DataFrame({ 'A' : 1.,
            'B' : pd.Timestamp('20180102'),
            'C' : pd.Series(1,index=list(range(4)),dtype='float32'),
            'D' : np.array([3] * 4,dtype='int32'),
            'E' : pd.Categorical(["test","train","test","train"]),
            'F' : 'foo' })

In [36]:
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2018-01-02,1.0,3,test,foo
1,1.0,2018-01-02,1.0,3,train,foo
2,1.0,2018-01-02,1.0,3,test,foo
3,1.0,2018-01-02,1.0,3,train,foo


In [12]:
#The columns of the resulting DataFrame have different dtypes.
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [17]:
#Tab completion for column names
df2.<TAB>
df2.A                  df2.bool
df2.abs                df2.boxplot
df2.add                df2.C
df2.add_prefix         df2.clip
df2.add_suffix         df2.clip_lower
df2.align              df2.clip_upper
df2.all                df2.columns
df2.any                df2.combine
df2.append             df2.combine_first
df2.apply              df2.compound
df2.applymap           df2.consolidate
df2.D

SyntaxError: invalid syntax (<ipython-input-17-075c4b77261d>, line 2)

### Viewing Data
Here is how to view the top and bottom rows of the frame:

In [23]:
df2.head()

Unnamed: 0,A,B,C,D,E,F
0,1.0,2018-01-02,1.0,3,test,foo
1,1.0,2018-01-02,1.0,3,train,foo
2,1.0,2018-01-02,1.0,3,test,foo
3,1.0,2018-01-02,1.0,3,train,foo


In [24]:
df2.tail()

Unnamed: 0,A,B,C,D,E,F
0,1.0,2018-01-02,1.0,3,test,foo
1,1.0,2018-01-02,1.0,3,train,foo
2,1.0,2018-01-02,1.0,3,test,foo
3,1.0,2018-01-02,1.0,3,train,foo


Display the index, columns, and the underlying NumPy data:

In [37]:
df.index

DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
               '2018-01-05', '2018-01-06'],
              dtype='datetime64[ns]', freq='D')

In [39]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [40]:
df.values

array([[ 0.4762781 ,  0.97826288, -0.98863989, -0.41307088],
       [ 1.4958118 ,  0.2627219 ,  0.23888764, -0.56332261],
       [ 0.32463029,  1.25438548,  0.29473421, -0.57488154],
       [ 1.41768441,  0.52333907,  1.26307961,  2.3022914 ],
       [ 1.62711073,  1.16384588, -0.57679504,  0.8734081 ],
       [ 0.55613898, -0.02152209, -0.80107288,  0.78891272]])

`describe()` shows a quick statistic summary of your data:

In [41]:
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.982942,0.693506,-0.094968,0.402223
std,0.589785,0.518206,0.852879,1.143112
min,0.32463,-0.021522,-0.98864,-0.574882
25%,0.496243,0.327876,-0.745003,-0.52576
50%,0.986912,0.750801,-0.168954,0.187921
75%,1.47628,1.11745,0.280773,0.852284
max,1.627111,1.254385,1.26308,2.302291


In [42]:
df2.describe()

Unnamed: 0,A,C,D
count,4.0,4.0,4.0
mean,1.0,1.0,3.0
std,0.0,0.0,0.0
min,1.0,1.0,3.0
25%,1.0,1.0,3.0
50%,1.0,1.0,3.0
75%,1.0,1.0,3.0
max,1.0,1.0,3.0


Transposing your data:

In [43]:
df.T

Unnamed: 0,2018-01-01 00:00:00,2018-01-02 00:00:00,2018-01-03 00:00:00,2018-01-04 00:00:00,2018-01-05 00:00:00,2018-01-06 00:00:00
A,0.476278,1.495812,0.32463,1.417684,1.627111,0.556139
B,0.978263,0.262722,1.254385,0.523339,1.163846,-0.021522
C,-0.98864,0.238888,0.294734,1.26308,-0.576795,-0.801073
D,-0.413071,-0.563323,-0.574882,2.302291,0.873408,0.788913


Sorting by an axis:

In [46]:
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2018-01-01,-0.413071,-0.98864,0.978263,0.476278
2018-01-02,-0.563323,0.238888,0.262722,1.495812
2018-01-03,-0.574882,0.294734,1.254385,0.32463
2018-01-04,2.302291,1.26308,0.523339,1.417684
2018-01-05,0.873408,-0.576795,1.163846,1.627111
2018-01-06,0.788913,-0.801073,-0.021522,0.556139


In [47]:
df.sort_values(by='B')

Unnamed: 0,A,B,C,D
2018-01-06,0.556139,-0.021522,-0.801073,0.788913
2018-01-02,1.495812,0.262722,0.238888,-0.563323
2018-01-04,1.417684,0.523339,1.26308,2.302291
2018-01-01,0.476278,0.978263,-0.98864,-0.413071
2018-01-05,1.627111,1.163846,-0.576795,0.873408
2018-01-03,0.32463,1.254385,0.294734,-0.574882


In [None]:
### Selection
Here is how to view the top and bottom rows of the frame: