# Pandas Viewing

Practice exploring and inspecting data with pandas.

In [2]:
import numpy as np
import pandas as pd

In [3]:
# A Series built from a plain list: pandas supplies a default integer index,
# and the NaN entry means the values are stored as float64.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [6]:
# Build a daily DatetimeIndex of six consecutive days starting 2013-01-01.
dates = pd.date_range(start='20130101', periods=6, freq='D')
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [7]:
# Six rows (one per entry in `dates`) by four columns of uniform random
# values in [0, 1). No seed is set, so the numbers differ on every run.
df = pd.DataFrame(
    data=np.random.rand(6, 4),
    index=dates,
    columns=['A', 'B', 'C', 'D'],
)
df

Unnamed: 0,A,B,C,D
2013-01-01,0.288387,0.616599,0.656129,0.519211
2013-01-02,0.575596,0.093279,0.790642,0.670111
2013-01-03,0.984154,0.234529,0.835678,0.90521
2013-01-04,0.208888,0.365048,0.108914,0.226973
2013-01-05,0.584147,0.902622,0.313074,0.860921
2013-01-06,0.626268,0.914701,0.244343,0.658814


In [12]:
# A frame built from a dict of columns: scalar entries are broadcast to the
# length of the array-like entries (four rows here), and every column keeps
# its own dtype.
df2 = pd.DataFrame(
    data={
        'A': 1.0,
        'B': pd.Timestamp('20130102'),
        'C': pd.Series(1, index=list(range(4)), dtype='float32'),
        'D': np.full(4, 3, dtype='int32'),
        'E': pd.Categorical(['test', 'train', 'test', 'train']),
        'F': 'foo',
    }
)
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [13]:
# Show the dtype of each column; unlike a NumPy array, columns may differ.
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [14]:
# Preview the first five rows (five is the default for head()).
df.head(n=5)

Unnamed: 0,A,B,C,D
2013-01-01,0.288387,0.616599,0.656129,0.519211
2013-01-02,0.575596,0.093279,0.790642,0.670111
2013-01-03,0.984154,0.234529,0.835678,0.90521
2013-01-04,0.208888,0.365048,0.108914,0.226973
2013-01-05,0.584147,0.902622,0.313074,0.860921


In [15]:
# Preview the last three rows.
df.tail(n=3)

Unnamed: 0,A,B,C,D
2013-01-04,0.208888,0.365048,0.108914,0.226973
2013-01-05,0.584147,0.902622,0.313074,0.860921
2013-01-06,0.626268,0.914701,0.244343,0.658814


In [16]:
# Row labels of the frame (the DatetimeIndex it was built with).
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [17]:
# Column labels of the frame.
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [18]:
# Convert the frame's values to a NumPy array; the index and column
# labels are not part of the result.
df.to_numpy()

array([[0.28838669, 0.61659898, 0.65612857, 0.51921076],
       [0.57559578, 0.09327901, 0.79064184, 0.67011052],
       [0.98415396, 0.23452916, 0.8356778 , 0.90520962],
       [0.2088878 , 0.36504818, 0.10891404, 0.22697344],
       [0.58414704, 0.90262166, 0.31307378, 0.86092102],
       [0.62626827, 0.91470069, 0.24434333, 0.65881377]])

NumPy arrays have one dtype for the entire array, while pandas DataFrames have one dtype per column. When you call `DataFrame.to_numpy()`, pandas will find the NumPy dtype that can hold all of the dtypes in the DataFrame. This may end up being `object`, which requires casting every value to a Python object. For large DataFrames, that conversion can be very expensive in both time and memory.

In [19]:
# Per-column summary statistics: count, mean, std, min, quartiles, and max.
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.544573,0.52113,0.491463,0.640207
std,0.275953,0.346152,0.308007,0.247218
min,0.208888,0.093279,0.108914,0.226973
25%,0.360189,0.267159,0.261526,0.554112
50%,0.579871,0.490824,0.484601,0.664462
75%,0.615738,0.831116,0.757014,0.813218
max,0.984154,0.914701,0.835678,0.90521


In [20]:
# Transpose the frame: column labels become the rows and dates the columns.
df.transpose()

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,0.288387,0.575596,0.984154,0.208888,0.584147,0.626268
B,0.616599,0.093279,0.234529,0.365048,0.902622,0.914701
C,0.656129,0.790642,0.835678,0.108914,0.313074,0.244343
D,0.519211,0.670111,0.90521,0.226973,0.860921,0.658814


In [21]:
# Sort by column label in descending order (D, C, B, A); rows are untouched.
df.sort_index(axis='columns', ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,0.519211,0.656129,0.616599,0.288387
2013-01-02,0.670111,0.790642,0.093279,0.575596
2013-01-03,0.90521,0.835678,0.234529,0.984154
2013-01-04,0.226973,0.108914,0.365048,0.208888
2013-01-05,0.860921,0.313074,0.902622,0.584147
2013-01-06,0.658814,0.244343,0.914701,0.626268
