In [1]:
import pandas as pd

1. Creating DataFrames

In [10]:
# Option 1: Using Dictionary as data source
df_dictionary = pd.DataFrame(
{"a" : [4, 5, 6], 
"b" : [7, 8, 9], 
"c" : [10, 11, 12]}, 
index = [1, 2, 3])
df_dictionary

Unnamed: 0,a,b,c
1,4,7,10
2,5,8,11
3,6,9,12


In [11]:
# Option 1b: without index: row number will by default start from 0
df_default = pd.DataFrame(
{"a" : [4, 5, 6], 
"b" : [7, 8, 9], 
"c" : [10, 11, 12]})
df_default

Unnamed: 0,a,b,c
0,4,7,10
1,5,8,11
2,6,9,12


In [12]:
# Option 2: Using list of lists
df_list = pd.DataFrame(
[[4, 5, 6], 
[7, 8, 9], 
[10, 11, 12]], 
index = [1, 2, 3],
columns = ['a', 'b', 'c'])
df_list

Unnamed: 0,a,b,c
1,4,5,6
2,7,8,9
3,10,11,12


In [16]:
# Option 3: multi-index
df_multiindex = pd.DataFrame(
{"a" : [4 ,5, 6], 
"b" : [7, 8, 9], 
"c" : [10, 11, 12]}, 
index = pd.MultiIndex.from_tuples(
[('d', 1), ('d', 2),
('e', 2)], names=['n', 'v']))
df_multiindex

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d,1,4,7,10
d,2,5,8,11
e,2,6,9,12


2. Reading a CSV file

In [3]:
df=pd.read_csv('../data/Demo_sales_data.csv')
df.head()

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,...,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit
0,1,CA-2016-152156,08/11/2021,11/11/2021,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.91
1,2,CA-2016-152156,08/11/2021,11/11/2021,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3,0.0,219.58
2,3,CA-2016-138688,12/06/2021,16/06/2021,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,...,90036,West,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62,2,0.0,6.87
3,14,CA-2016-161389,05/12/2021,10/12/2021,Standard Class,IM-15070,Irene Maddox,Consumer,United States,Seattle,...,98103,West,OFF-BI-10003656,Office Supplies,Binders,Fellowes PB200 Plastic Comb Binding Machine,407.98,3,0.2,132.59
4,22,CA-2016-137330,09/12/2021,13/12/2021,Standard Class,KB-16585,Ken Black,Corporate,United States,Fremont,...,68025,Central,OFF-AR-10000246,Office Supplies,Art,Newell 318,19.46,7,0.0,5.06


3. Selecting Subset of a Dataset

In [165]:
# Selecting a column as data series
df_segment=df['Segment']
df_segment.head()

0     Consumer
1     Consumer
2    Corporate
3     Consumer
4    Corporate
Name: Segment, dtype: object

In [166]:
# Selecting a column: alternative
df.Segment

0          Consumer
1          Consumer
2         Corporate
3          Consumer
4         Corporate
           ...     
2582    Home Office
2583    Home Office
2584       Consumer
2585       Consumer
2586       Consumer
Name: Segment, Length: 2587, dtype: object

In [46]:
# Selecting columns as dataframe along with column names
df_subset=df[['Row ID','Segment']]
df_subset.head()

Unnamed: 0,Row ID,Segment
0,1,Consumer
1,2,Consumer
2,3,Corporate
3,14,Consumer
4,22,Corporate


In [66]:
# Selecting rows using query, loc or dataframe
x=df_subset.query("Segment=='Consumer'").head()
y=df_subset[df_subset["Segment"]=='Corporate'].head()
z=df_subset.loc[df_subset['Segment']=='Home Office'].head()
print("Filtering using query")
print(x)
print("Filtering using dataframe")
print(y)
print("Filtering using loc")
print(z)

Filtering using query
   Row ID   Segment
0       1  Consumer
1       2  Consumer
3      14  Consumer
6      26  Consumer
7      27  Consumer
Filtering using dataframe
   Row ID    Segment
2       3  Corporate
4      22  Corporate
5      23  Corporate
8      36  Corporate
9      37  Corporate
Filtering using loc
    Row ID      Segment
28      89  Home Office
33     100  Home Office
34     101  Home Office
35     102  Home Office
44     129  Home Office


In [148]:
df['Segment']

0          Consumer
1          Consumer
2         Corporate
3          Consumer
4         Corporate
           ...     
2582    Home Office
2583    Home Office
2584       Consumer
2585       Consumer
2586       Consumer
Name: Segment, Length: 2587, dtype: object

In [161]:
df.query("Segment=='Corporate'")
df_subset.query("Segment=='Consumer'" or "Segment=='Corporate'")

Unnamed: 0,Row ID,Segment
0,1,Consumer
1,2,Consumer
3,14,Consumer
6,26,Consumer
7,27,Consumer
...,...,...
2568,9936,Consumer
2576,9973,Consumer
2584,9983,Consumer
2585,9984,Consumer


In [116]:
df[df["Postal Code"].isin(range(10000,50000))].head()

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,...,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit
0,1,CA-2016-152156,08/11/2021,11/11/2021,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.91
1,2,CA-2016-152156,08/11/2021,11/11/2021,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3,0.0,219.58
13,48,CA-2016-169194,20/06/2021,25/06/2021,Standard Class,LH-16900,Lena Hernandez,Consumer,United States,Dover,...,19901,East,TEC-AC-10002167,Technology,Accessories,Imation 8gb Micro Traveldrive Usb 2.0 Flash Drive,45.0,3,0.0,4.95
14,49,CA-2016-169194,20/06/2021,25/06/2021,Standard Class,LH-16900,Lena Hernandez,Consumer,United States,Dover,...,19901,East,TEC-PH-10003988,Technology,Phones,"LF Elite 3D Dazzle Designer Hard Case Cover, L...",21.8,2,0.0,6.1
15,54,CA-2016-105816,11/12/2021,17/12/2021,Standard Class,JM-15265,Janet Molinari,Corporate,United States,New York City,...,10024,East,OFF-FA-10000304,Office Supplies,Fasteners,Advantus Push Pins,15.26,7,0.0,6.26


In [74]:
# Selecting all rows, and all columns using loc
df.loc[:,:].head()

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,...,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit
0,1,CA-2016-152156,08/11/2021,11/11/2021,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.91
1,2,CA-2016-152156,08/11/2021,11/11/2021,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3,0.0,219.58
2,3,CA-2016-138688,12/06/2021,16/06/2021,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,...,90036,West,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62,2,0.0,6.87
3,14,CA-2016-161389,05/12/2021,10/12/2021,Standard Class,IM-15070,Irene Maddox,Consumer,United States,Seattle,...,98103,West,OFF-BI-10003656,Office Supplies,Binders,Fellowes PB200 Plastic Comb Binding Machine,407.98,3,0.2,132.59
4,22,CA-2016-137330,09/12/2021,13/12/2021,Standard Class,KB-16585,Ken Black,Corporate,United States,Fremont,...,68025,Central,OFF-AR-10000246,Office Supplies,Art,Newell 318,19.46,7,0.0,5.06


In [70]:
# Selecting all rows, and selected columns using loc
df.loc[:,('Segment','Country')].head()

Unnamed: 0,Segment,Country
0,Consumer,United States
1,Consumer,United States
2,Corporate,United States
3,Consumer,United States
4,Corporate,United States


In [79]:
# Selected rows and columns between a range of indexes and coulmns=>segment and country
df.loc[1:4,'Segment':'City'].head()

Unnamed: 0,Segment,Country,City
1,Consumer,United States,Henderson
2,Corporate,United States,Los Angeles
3,Consumer,United States,Seattle
4,Corporate,United States,Fremont


In [77]:
# Selecting all rows, and selected columns using iloc
# This is used to locate rows and columns using index
# Note that it is not inclusive of right hand boundary
df.iloc[1:4,[2,4]].head()

Unnamed: 0,Order Date,Ship Mode
1,08/11/2021,Second Class
2,12/06/2021,Second Class
3,05/12/2021,Standard Class


In [89]:
# Selected row and column indexes
df.iloc[[1,4],[2,4,5,8]].head()

Unnamed: 0,Order Date,Ship Mode,Customer ID,Country
1,08/11/2021,Second Class,CG-12520,United States
4,09/12/2021,Standard Class,KB-16585,United States


In [86]:
# Range of row and column indexes
df.iloc[1:4,2:5].head()

Unnamed: 0,Order Date,Ship Date,Ship Mode
1,08/11/2021,11/11/2021,Second Class
2,12/06/2021,16/06/2021,Second Class
3,05/12/2021,10/12/2021,Standard Class


In [None]:
# Row indexing and filtering not allowed
df.loc[2:7,df['Segment']=="Corporate"]
# Output throughs an error

In [111]:
# Row level filtering on more than one column with column selection
df.loc[(df['Segment']=="Corporate") & (df['Postal Code']<50000),'Segment':'City'].head()

Unnamed: 0,Segment,Country,City
15,Corporate,United States,New York City
16,Corporate,United States,New York City
26,Corporate,United States,Decatur
27,Corporate,United States,Decatur
53,Corporate,United States,Philadelphia


In [113]:
# Row level filtering on more than one column with column selection
df.iloc[(df['Segment']=="Corporate") & (df['Postal Code']<50000),5:8].head()

NotImplementedError: iLocation based boolean indexing on an integer type is not available