In [1]:
import numpy as np
import pandas as pd

### Object Creation

Create a *Series* by passeing a list of values, letting pandas create a default integer index

In [2]:
s = pd.Series([1,3,5,np.nan,6,8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

Create a *DataFrame* by passing a numpy array, with datetime index and columns

In [3]:
dates = pd.date_range("20190101",periods=6)
print(dates)
df = pd.DataFrame(np.random.randn(6,4),index=dates,columns=list("ABCD"))
df

DatetimeIndex(['2019-01-01', '2019-01-02', '2019-01-03', '2019-01-04',
               '2019-01-05', '2019-01-06'],
              dtype='datetime64[ns]', freq='D')


Unnamed: 0,A,B,C,D
2019-01-01,0.377062,1.320765,-0.306338,-0.773348
2019-01-02,-1.853365,-0.874267,1.241223,-0.490473
2019-01-03,-0.332513,2.442822,0.016876,-1.228647
2019-01-04,-1.154273,-0.423022,0.528699,-0.043597
2019-01-05,0.685772,0.321247,0.951732,-0.477273
2019-01-06,-1.299573,-0.532645,-0.528329,-0.593916


Create a *DataFrame* by passing a dict

In [4]:
df_ = pd.DataFrame({"A":1.0,
                    "B":pd.Timestamp("20190920"),
                    "C":pd.Series(1,index=list(range(4)),dtype="float32"),
                    "D":np.array([3]*4,dtype="int32"),
                    "E":pd.Categorical(["test","train","test","train"]),
                    "F":"foo"
                   })
print(df_.dtypes)
df_

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object


Unnamed: 0,A,B,C,D,E,F
0,1.0,2019-09-20,1.0,3,test,foo
1,1.0,2019-09-20,1.0,3,train,foo
2,1.0,2019-09-20,1.0,3,test,foo
3,1.0,2019-09-20,1.0,3,train,foo


In [5]:
print(df_.A)
print("---------------------------------")
print(df_.columns)
print("---------------------------------")
print(df_.compound)
print("---------------------------------")

0    1.0
1    1.0
2    1.0
3    1.0
Name: A, dtype: float64
---------------------------------
Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')
---------------------------------
<bound method NDFrame._add_numeric_operations.<locals>.compound of      A          B    C  D      E    F
0  1.0 2019-09-20  1.0  3   test  foo
1  1.0 2019-09-20  1.0  3  train  foo
2  1.0 2019-09-20  1.0  3   test  foo
3  1.0 2019-09-20  1.0  3  train  foo>
---------------------------------


### Viewing Data

In [6]:
df

Unnamed: 0,A,B,C,D
2019-01-01,0.377062,1.320765,-0.306338,-0.773348
2019-01-02,-1.853365,-0.874267,1.241223,-0.490473
2019-01-03,-0.332513,2.442822,0.016876,-1.228647
2019-01-04,-1.154273,-0.423022,0.528699,-0.043597
2019-01-05,0.685772,0.321247,0.951732,-0.477273
2019-01-06,-1.299573,-0.532645,-0.528329,-0.593916


In [7]:
df.head(2)

Unnamed: 0,A,B,C,D
2019-01-01,0.377062,1.320765,-0.306338,-0.773348
2019-01-02,-1.853365,-0.874267,1.241223,-0.490473


In [8]:
df.tail(3)

Unnamed: 0,A,B,C,D
2019-01-04,-1.154273,-0.423022,0.528699,-0.043597
2019-01-05,0.685772,0.321247,0.951732,-0.477273
2019-01-06,-1.299573,-0.532645,-0.528329,-0.593916


In [9]:
df.index

DatetimeIndex(['2019-01-01', '2019-01-02', '2019-01-03', '2019-01-04',
               '2019-01-05', '2019-01-06'],
              dtype='datetime64[ns]', freq='D')

In [10]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

DataFrame.to_numpy() creates a numpy representation of the pandas data

In [11]:
df.to_numpy()

array([[ 0.37706182,  1.32076451, -0.30633841, -0.77334824],
       [-1.85336527, -0.87426722,  1.24122334, -0.49047317],
       [-0.33251269,  2.44282151,  0.01687575, -1.22864675],
       [-1.15427265, -0.42302165,  0.52869945, -0.04359742],
       [ 0.68577173,  0.32124674,  0.95173176, -0.47727275],
       [-1.29957309, -0.5326452 , -0.52832945, -0.59391619]])

DataFrame.describe() shows the statistic summary of the data

In [12]:
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.596148,0.375816,0.31731,-0.601209
std,1.004676,1.280201,0.706413,0.390363
min,-1.853365,-0.874267,-0.528329,-1.228647
25%,-1.263248,-0.505239,-0.225535,-0.72849
50%,-0.743393,-0.050887,0.272788,-0.542195
75%,0.199668,1.070885,0.845974,-0.480573
max,0.685772,2.442822,1.241223,-0.043597


Transposing data is easy as matrix

In [13]:
df.T

Unnamed: 0,2019-01-01,2019-01-02,2019-01-03,2019-01-04,2019-01-05,2019-01-06
A,0.377062,-1.853365,-0.332513,-1.154273,0.685772,-1.299573
B,1.320765,-0.874267,2.442822,-0.423022,0.321247,-0.532645
C,-0.306338,1.241223,0.016876,0.528699,0.951732,-0.528329
D,-0.773348,-0.490473,-1.228647,-0.043597,-0.477273,-0.593916


Sorting data by axis

In [14]:
df.sort_index(axis=0,ascending=False)

Unnamed: 0,A,B,C,D
2019-01-06,-1.299573,-0.532645,-0.528329,-0.593916
2019-01-05,0.685772,0.321247,0.951732,-0.477273
2019-01-04,-1.154273,-0.423022,0.528699,-0.043597
2019-01-03,-0.332513,2.442822,0.016876,-1.228647
2019-01-02,-1.853365,-0.874267,1.241223,-0.490473
2019-01-01,0.377062,1.320765,-0.306338,-0.773348


In [15]:
df.sort_index(axis=1,ascending=False)

Unnamed: 0,D,C,B,A
2019-01-01,-0.773348,-0.306338,1.320765,0.377062
2019-01-02,-0.490473,1.241223,-0.874267,-1.853365
2019-01-03,-1.228647,0.016876,2.442822,-0.332513
2019-01-04,-0.043597,0.528699,-0.423022,-1.154273
2019-01-05,-0.477273,0.951732,0.321247,0.685772
2019-01-06,-0.593916,-0.528329,-0.532645,-1.299573


Sorting by values

In [16]:
df.sort_values(by="B")

Unnamed: 0,A,B,C,D
2019-01-02,-1.853365,-0.874267,1.241223,-0.490473
2019-01-06,-1.299573,-0.532645,-0.528329,-0.593916
2019-01-04,-1.154273,-0.423022,0.528699,-0.043597
2019-01-05,0.685772,0.321247,0.951732,-0.477273
2019-01-01,0.377062,1.320765,-0.306338,-0.773348
2019-01-03,-0.332513,2.442822,0.016876,-1.228647
