# Pandas Introduction

In [2]:
import numpy as np
import pandas as pd

## Object creation

Create a Series by passing a list of values

In [3]:
# Build a Series from a plain Python list; pandas supplies a default
# integer index, and the np.nan entry shows up as NaN in a float64 Series.
s = pd.Series(data=[1, 3, 4, np.nan, 6, 8])
print(s)

0    1.0
1    3.0
2    4.0
3    NaN
4    6.0
5    8.0
dtype: float64


Create a DataFrame by passing a NumPy array, with a datetime index built using `date_range()` and labeled columns

In [4]:
# Six consecutive daily timestamps starting 2013-01-01, used as the row index.
dates = pd.date_range("20130101", periods=6)
print(dates)

# Draw the values from a seeded Generator so that Restart & Run All
# reproduces the same frame on every run (uniform floats in [0, 1)).
# NOTE: outputs stored from earlier, unseeded runs will differ until the
# notebook is re-executed.
rng = np.random.default_rng(42)
df = pd.DataFrame(rng.random((6, 4)), index=dates, columns=list("ABCD"))
print(df)

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')
                   A         B         C         D
2013-01-01  0.572308  0.920845  0.525894  0.931884
2013-01-02  0.768634  0.644030  0.225700  0.727291
2013-01-03  0.944050  0.748139  0.990291  0.381071
2013-01-04  0.906818  0.965129  0.085156  0.251212
2013-01-05  0.566287  0.397570  0.419578  0.817292
2013-01-06  0.176934  0.703657  0.532107  0.751935


Create a DataFrame by passing a dictionary of objects that can be converted into a series-like structure:

In [5]:
# Each dict key becomes a column. The scalar entries (A, B, F) are repeated
# on every row; the array-like entries (C, D, E) each hold four values.
df2 = pd.DataFrame(
    {
        "A": 1.0,
        "B": pd.Timestamp("20130102"),
        "C": pd.Series(1, index=list(range(4)), dtype="float32"),
        "D": np.full(4, 3, dtype="int32"),
        "E": pd.Categorical(["test", "train", "test", "train"]),
        "F": "foo",
    }
)
print(df2)
print("-" * 40)
# Every column keeps its own dtype.
print(df2.dtypes)

     A          B    C  D      E    F
0  1.0 2013-01-02  1.0  3   test  foo
1  1.0 2013-01-02  1.0  3  train  foo
2  1.0 2013-01-02  1.0  3   test  foo
3  1.0 2013-01-02  1.0  3  train  foo
----------------------------------------
A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object


## Viewing data

Use:
- DataFrame.head()
- DataFrame.tail()
<br>to view top and bottom rows respectively.

In [6]:
# First three rows, a 50-dash separator line, then the last three rows.
print(df.head(3), "-" * 50, df.tail(3), sep="\n")

                   A         B         C         D
2013-01-01  0.572308  0.920845  0.525894  0.931884
2013-01-02  0.768634  0.644030  0.225700  0.727291
2013-01-03  0.944050  0.748139  0.990291  0.381071
--------------------------------------------------
                   A         B         C         D
2013-01-04  0.906818  0.965129  0.085156  0.251212
2013-01-05  0.566287  0.397570  0.419578  0.817292
2013-01-06  0.176934  0.703657  0.532107  0.751935


Display df index or columns

In [7]:
# Row labels (the index), a 50-dash separator line, then the column labels.
print(df.index, "-" * 50, df.columns, sep="\n")

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')
--------------------------------------------------
Index(['A', 'B', 'C', 'D'], dtype='object')


__Note:__ NumPy arrays have one dtype for the entire array, while pandas DataFrames have one dtype per column.

In [8]:
# Convert the frame's values to a NumPy array; as the last expression in the
# cell, the resulting array is shown as the cell output.
df.to_numpy()

array([[0.57230816, 0.92084451, 0.52589381, 0.93188374],
       [0.76863413, 0.64403013, 0.2257003 , 0.72729084],
       [0.94404968, 0.74813908, 0.99029115, 0.38107145],
       [0.9068183 , 0.96512942, 0.08515571, 0.2512122 ],
       [0.56628723, 0.39756973, 0.41957786, 0.81729219],
       [0.17693361, 0.70365709, 0.53210651, 0.75193522]])

__Note:__ DataFrame.to_numpy() does not include the index or column labels in the output.

In [9]:
# df2's columns have different dtypes, so the resulting array is displayed
# with dtype=object.
df2.to_numpy()

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

`describe()` shows a quick statistical summary of the data

In [10]:
# Per-column summary: count, mean, std, min, 25%/50%/75% quantiles, and max.
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.655839,0.729895,0.463121,0.643448
std,0.283946,0.205247,0.312324,0.266423
min,0.176934,0.39757,0.085156,0.251212
25%,0.567792,0.658937,0.27417,0.467626
50%,0.670471,0.725898,0.472736,0.739613
75%,0.872272,0.877668,0.530553,0.800953
max,0.94405,0.965129,0.990291,0.931884


Transpose your data

In [11]:
# Transposed view: the column labels A-D become rows and the dates become columns.
df.T

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,0.572308,0.768634,0.94405,0.906818,0.566287,0.176934
B,0.920845,0.64403,0.748139,0.965129,0.39757,0.703657
C,0.525894,0.2257,0.990291,0.085156,0.419578,0.532107
D,0.931884,0.727291,0.381071,0.251212,0.817292,0.751935


Sort by an axis

In [12]:
# axis=1 sorts by the column labels; ascending=False yields the order D, C, B, A.
# The result is only displayed; df is not reassigned here.
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,0.931884,0.525894,0.920845,0.572308
2013-01-02,0.727291,0.2257,0.64403,0.768634
2013-01-03,0.381071,0.990291,0.748139,0.94405
2013-01-04,0.251212,0.085156,0.965129,0.906818
2013-01-05,0.817292,0.419578,0.39757,0.566287
2013-01-06,0.751935,0.532107,0.703657,0.176934


Sort by values

In [14]:
# Reorder the rows by the values in column "B", smallest first.
# The result is only displayed; df is not reassigned here.
df.sort_values(by="B")

Unnamed: 0,A,B,C,D
2013-01-05,0.566287,0.39757,0.419578,0.817292
2013-01-02,0.768634,0.64403,0.2257,0.727291
2013-01-06,0.176934,0.703657,0.532107,0.751935
2013-01-03,0.94405,0.748139,0.990291,0.381071
2013-01-01,0.572308,0.920845,0.525894,0.931884
2013-01-04,0.906818,0.965129,0.085156,0.251212
