SERIES:
Create a Series from a Python list

In [None]:
#Third-party packages must be imported before they can be used in Python. The alias 'pd' saves typing 'pandas' every time.
#You can choose another alias as well.
import pandas as pd
import numpy as np

# Build a Series from a plain Python list.
labels = ['a', 'b', 'c']   # row labels (the index); they are not part of the data itself
my_list = [10, 20, 30]     # the values, held in an ordinary Python list

# The first positional argument is the data; `index` attaches one label per value.
series_variable = pd.Series(my_list, index=labels)

print(series_variable)


In [2]:
# A Series is one-dimensional, so its shape is a 1-tuple holding the number of elements.
series_variable.shape

(3,)

In [5]:
# Build a Series from a dict: the keys become the index labels, the values become the data.
python_dict = dict(a=15, b=42, c=36)

series_dic = pd.Series(data=python_dict)

print(series_dic)


a    15
b    42
c    36
dtype: int64


In [6]:
# Build a Series from a NumPy array. No index is given, so the rows are
# labelled with the default integer positions 0, 1, 2.
arr = np.array([55, 35, 22])

series_arr = pd.Series(arr)

print(series_arr)

0    55
1    35
2    22
dtype: int64


Operations between Series

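NOTE: Operations between Series objects are aligned on their index labels. The Series do not need to be the same length; the index of the result is the union of both indexes, and any label that is missing from one of the Series produces NaN.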


In [7]:
# Two Series whose indexes only partly overlap: 'a' and 'c' appear in both.
ser_1 = pd.Series(data=[1, 2, 3, 4], index=list('abcd'))
ser_2 = pd.Series(data=[4, 5, 6, 7], index=list('acef'))

# Element-wise sum aligned on the index labels; a label present in only one
# of the two Series yields NaN in the result.
ser_res = ser_1.add(ser_2)

# Show the aligned sum
print(ser_res)

a    5.0
b    NaN
c    8.0
d    NaN
e    NaN
f    NaN
dtype: float64


DATAFRAMES

A DataFrame is a two-dimensional tabular data structure with labeled axes (rows and columns).

In [8]:
# Build a DataFrame from a dict: each key is a column name and each list
# holds that column's values, one per row.
d = {'col1': [1, 2], 'col2': [3, 4]}

df = pd.DataFrame(d)

print(df)

   col1  col2
0     1     3
1     2     4


DataFrame from numpy ndarray

In [9]:
# Build a DataFrame from a 5 x 4 NumPy ndarray of random samples.
from numpy.random import randn

np.random.seed(101)  # fixed seed, so the same numbers are drawn on every fresh run

# Rows are labelled A-E and columns W-Z.
df = pd.DataFrame(randn(5, 4), index=list('ABCDE'), columns=list('WXYZ'))

print(df)

          W         X         Y         Z
A  2.706850  0.628133  0.907969  0.503826
B  0.651118 -0.319318 -0.848077  0.605965
C -2.018168  0.740122  0.528813 -0.589001
D  0.188695 -0.758872 -0.933237  0.955057
E  0.190794  1.978757  2.605967  0.683509


In [10]:
#Create a DataFrame by passing a dict of objects.
# The scalar entries ('A', 'B' and 'F') are repeated on every row; the row index
# 0..3 comes from the Series passed as column 'C'.
df = pd.DataFrame({
    'A': 1.,
    'B': pd.Timestamp('20130102'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.array([3] * 4, dtype='int32'),
    'E': pd.Categorical(["test", "train", "test", "train"]),
    'F': 'foo',
})

print(df)

# Print a blank separator line, then the dtypes, as two single-argument calls.
# Passing both to one print() shows a tuple repr on a Python 2 kernel and shifts
# the first dtype row by one space on Python 3.
print('')
print(df.dtypes)

#The columns of the resulting DataFrame have different dtypes.

     A          B    C  D      E    F
0  1.0 2013-01-02  1.0  3   test  foo
1  1.0 2013-01-02  1.0  3  train  foo
2  1.0 2013-01-02  1.0  3   test  foo
3  1.0 2013-01-02  1.0  3  train  foo
('\n', A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object)


In [11]:
# Dimensions of df as a (rows, columns) tuple.
df.shape

(4, 6)

In [18]:
# Build a 6 x 4 frame of random values: six consecutive days starting on
# 2018-01-01 form the index, and the columns are named A to D.
df = pd.DataFrame(
    np.random.randn(6, 4),
    index=pd.date_range(start='20180101', periods=6),
    columns=['A', 'B', 'C', 'D'],
)

# Show the frame
print(df)

                   A         B         C         D
2018-01-01  0.641806 -0.905100 -0.391157  1.028293
2018-01-02 -1.972605 -0.866885  0.720788 -1.223082
2018-01-03  1.606780 -1.115710 -1.385379 -1.329660
2018-01-04  0.041460 -0.411055 -0.771329  0.110477
2018-01-05 -0.804652  0.253548  0.649148  0.358941
2018-01-06 -1.080471  0.902398  0.161781  0.833029


In [13]:
# Dimensions of df as a (rows, columns) tuple.
df.shape

(6, 4)

Transposing your DataFrame

In [16]:
print(df.transpose())     # rows and columns swapped in the result; df itself is left unchanged

   2018-01-01  2018-01-02  2018-01-03  2018-01-04  2018-01-05  2018-01-06
A    0.641806   -1.972605    1.606780    0.041460   -0.804652   -1.080471
B   -0.905100   -0.866885   -1.115710   -0.411055    0.253548    0.902398
C   -0.391157    0.720788   -1.385379   -0.771329    0.649148    0.161781
D    1.028293   -1.223082   -1.329660    0.110477    0.358941    0.833029


In [42]:
#Sorting by an axis

# NOTE: axis='index' (the same as axis=0) sorts by the row labels. sort_index() orders
# by labels, which is different from sort_values(), which orders by the data.
# ascending=False puts the latest date first.
df.sort_index(axis='index', ascending=False)

Unnamed: 0,A,B,C,D
2018-01-06,-1.080471,0.902398,0.161781,0.833029
2018-01-05,-0.804652,0.253548,0.649148,0.358941
2018-01-04,0.04146,-0.411055,-0.771329,0.110477
2018-01-03,1.60678,-1.11571,-1.385379,-1.32966
2018-01-02,-1.972605,-0.866885,0.720788,-1.223082
2018-01-01,0.641806,-0.9051,-0.391157,1.028293


Sorting by values in a column

In [36]:
# Order the rows by the values in column 'B', smallest first (the default order).
# The sorted frame is returned as the cell's result; df is not reassigned here.
df.sort_values(by='B')

Unnamed: 0,A,B,C,D
2018-01-03,1.60678,-1.11571,-1.385379,-1.32966
2018-01-01,0.641806,-0.9051,-0.391157,1.028293
2018-01-02,-1.972605,-0.866885,0.720788,-1.223082
2018-01-04,0.04146,-0.411055,-0.771329,0.110477
2018-01-05,-0.804652,0.253548,0.649148,0.358941
2018-01-06,-1.080471,0.902398,0.161781,0.833029


In [43]:
# Select a single column by its label; the result is a Series named 'A'
# that keeps the frame's date index.
print(df['A'])

2018-01-01    0.641806
2018-01-02   -1.972605
2018-01-03    1.606780
2018-01-04    0.041460
2018-01-05   -0.804652
2018-01-06   -1.080471
Freq: D, Name: A, dtype: float64


In [45]:
#Selecting a range of rows by date
# The slice applies to the date index (rows), not to the columns. Using .loc makes
# the row selection explicit; label-based slicing includes both end dates.

print(df.loc['20180102':'20180104'])

                   A         B         C         D
2018-01-02 -1.972605 -0.866885  0.720788 -1.223082
2018-01-03  1.606780 -1.115710 -1.385379 -1.329660
2018-01-04  0.041460 -0.411055 -0.771329  0.110477


In [None]:
# Show columns 'A' and 'B' for the rows between two dates.
# .loc selects by label: the first argument slices the date index (both end
# dates are included), the second lists the columns to keep.
print(df.loc['20180102':'20180104',['A','B']])

                   A         B
2018-01-02 -1.972605 -0.866885
2018-01-03  1.606780 -1.115710
2018-01-04  0.041460 -0.411055


In [53]:
# Show the first three columns by integer position: iloc takes all rows (:)
# and column positions 0, 1 and 2.
print(df.iloc[:, :3])

                   A         B         C
2018-01-01  0.641806 -0.905100 -0.391157
2018-01-02 -1.972605 -0.866885  0.720788
2018-01-03  1.606780 -1.115710 -1.385379
2018-01-04  0.041460 -0.411055 -0.771329
2018-01-05 -0.804652  0.253548  0.649148
2018-01-06 -1.080471  0.902398  0.161781
