# A Deep Dive into Data Wrangling with Python

In [1]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xlrd

In [2]:
# import data
df = pd.read_excel('datasets/Sample - Superstore.xls')
df.head()

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,...,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit
0,1,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.9136
1,2,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3,0.0,219.582
2,3,CA-2016-138688,2016-06-12,2016-06-16,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,...,90036,West,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62,2,0.0,6.8714
3,4,US-2015-108966,2015-10-11,2015-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,FUR-TA-10000577,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table,957.5775,5,0.45,-383.031
4,5,US-2015-108966,2015-10-11,2015-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,OFF-ST-10000760,Office Supplies,Storage,Eldon Fold 'N Roll Cart System,22.368,2,0.2,2.5164


In [3]:
# Drop the column 'Row ID'
df.drop('Row ID', axis=1, inplace=True)
df.head()

Unnamed: 0,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,State,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit
0,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,Kentucky,42420,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.9136
1,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,Kentucky,42420,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3,0.0,219.582
2,CA-2016-138688,2016-06-12,2016-06-16,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,California,90036,West,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62,2,0.0,6.8714
3,US-2015-108966,2015-10-11,2015-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,Florida,33311,South,FUR-TA-10000577,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table,957.5775,5,0.45,-383.031
4,US-2015-108966,2015-10-11,2015-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,Florida,33311,South,OFF-ST-10000760,Office Supplies,Storage,Eldon Fold 'N Roll Cart System,22.368,2,0.2,2.5164


In [4]:
# Check the number of rows and columns
df.shape

(9994, 20)

In [5]:
# Extract states for which the information is in the database
df['State'].unique()

array(['Kentucky', 'California', 'Florida', 'North Carolina',
       'Washington', 'Texas', 'Wisconsin', 'Utah', 'Nebraska',
       'Pennsylvania', 'Illinois', 'Minnesota', 'Michigan', 'Delaware',
       'Indiana', 'New York', 'Arizona', 'Virginia', 'Tennessee',
       'Alabama', 'South Carolina', 'Oregon', 'Colorado', 'Iowa', 'Ohio',
       'Missouri', 'Oklahoma', 'New Mexico', 'Louisiana', 'Connecticut',
       'New Jersey', 'Massachusetts', 'Georgia', 'Nevada', 'Rhode Island',
       'Mississippi', 'Arkansas', 'Montana', 'New Hampshire', 'Maryland',
       'District of Columbia', 'Kansas', 'Vermont', 'Maine',
       'South Dakota', 'Idaho', 'North Dakota', 'Wyoming',
       'West Virginia'], dtype=object)

In [6]:
# Use the nunique method to count the number of unique values in the State column
df['State'].nunique()

49

In [7]:
# Create the matrix_data, row_labels, and column_headings functions
matrix_data = np.matrix('22,66,140;42,70,148;30,62,125;35,68,160;25,62,152')
row_labels = ['A','B','C','D','E']
column_headings = ['Age', 'Height', 'Weight']

In [8]:
# create a dataframe
df1 = pd.DataFrame(data=matrix_data,
                  index=row_labels,
                  columns=column_headings)
df1

Unnamed: 0,Age,Height,Weight
A,22,66,140
B,42,70,148
C,30,62,125
D,35,68,160
E,25,62,152


In [9]:
# Reset the index
df1.reset_index()

Unnamed: 0,index,Age,Height,Weight
0,A,22,66,140
1,B,42,70,148
2,C,30,62,125
3,D,35,68,160
4,E,25,62,152


In [10]:
# Reset the index with drop set to True
df1.reset_index(drop=True)

Unnamed: 0,Age,Height,Weight
0,22,66,140
1,42,70,148
2,30,62,125
3,35,68,160
4,25,62,152


In [11]:
# Add a new column
df1['Professional'] = "Student Teacher Engineer Doctor Nurse".split()
df1

Unnamed: 0,Age,Height,Weight,Professional
A,22,66,140,Student
B,42,70,148,Teacher
C,30,62,125,Engineer
D,35,68,160,Doctor
E,25,62,152,Nurse


In [12]:
# set the Profession column as an index 
df1.set_index('Professional')

Unnamed: 0_level_0,Age,Height,Weight
Professional,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Student,22,66,140
Teacher,42,70,148
Engineer,30,62,125
Doctor,35,68,160
Nurse,25,62,152


In [13]:
# Create a 10-record subset
df_subset = df.loc[:9, ['Ship Mode', 'State', 'Sales']]
df_subset

Unnamed: 0,Ship Mode,State,Sales
0,Second Class,Kentucky,261.96
1,Second Class,Kentucky,731.94
2,Second Class,California,14.62
3,Standard Class,Florida,957.5775
4,Standard Class,Florida,22.368
5,Standard Class,California,48.86
6,Standard Class,California,7.28
7,Standard Class,California,907.152
8,Standard Class,California,18.504
9,Standard Class,California,114.9


In [14]:
# Create a pandas DataFrame using the groupby method
byState = df_subset.groupby('State')
print(byState)

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000271BA249408>


In [15]:
# Calculate the mean sales figure by State
byState.mean()

Unnamed: 0_level_0,Sales
State,Unnamed: 1_level_1
California,185.219333
Florida,489.97275
Kentucky,496.95


In [16]:
# Calculate the total sales figure by State
byState.sum()

Unnamed: 0_level_0,Sales
State,Unnamed: 1_level_1
California,1111.316
Florida,979.9455
Kentucky,993.9


In [17]:
# Subset that DataFrame for a particular state and show the statistics
pd.DataFrame(byState.describe().loc['California'])

Unnamed: 0,Unnamed: 1,California
Sales,count,6.0
Sales,mean,185.219333
Sales,std,355.889307
Sales,min,7.28
Sales,25%,15.591
Sales,50%,33.682
Sales,75%,98.39
Sales,max,907.152


In [18]:
# Perform a similar summarization by using the Ship Mode attribute
df_subset.groupby('Ship Mode').describe().loc[['Second Class', 'Standard Class']]

Unnamed: 0_level_0,Sales,Sales,Sales,Sales,Sales,Sales,Sales,Sales
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
Ship Mode,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Second Class,3.0,336.173333,364.373037,14.62,138.29,261.96,496.95,731.94
Standard Class,7.0,296.663071,435.947552,7.28,20.436,48.86,511.026,957.5775


In [19]:
# Display the complete summary statistics of sales by every city in each state
byStateCity = df.groupby(['State', 'City'])
byStateCity.describe()['Sales']

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
State,City,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama,Auburn,6.0,294.471667,361.914543,3.760,8.8050,182.030,456.4075,900.080
Alabama,Decatur,13.0,259.601538,385.660903,14.940,23.9200,44.950,239.9200,1215.920
Alabama,Florence,5.0,399.470000,796.488863,4.980,7.2700,12.480,152.7600,1819.860
Alabama,Hoover,4.0,131.462500,230.646923,7.160,13.3925,20.725,138.7950,477.240
Alabama,Huntsville,10.0,248.437000,419.576667,3.620,26.8700,81.920,171.8075,1319.960
...,...,...,...,...,...,...,...,...,...
Wisconsin,Superior,9.0,144.414444,213.394065,5.560,17.1200,47.400,125.9900,629.100
Wisconsin,Waukesha,1.0,54.500000,,54.500,54.5000,54.500,54.5000,54.500
Wisconsin,Wausau,4.0,79.370000,111.450605,12.390,20.0325,29.605,88.9425,245.880
Wisconsin,West Allis,2.0,125.240000,165.067007,8.520,66.8800,125.240,183.6000,241.960


In [20]:
# create dataset with missing values
df_missing = pd.read_excel('datasets/Sample - Superstore.xls', sheet_name="Missing")
df_missing

Unnamed: 0,Customer,Product,Sales,Quantity,Discount,Profit
0,Brosina Hoffman,,1706.184,9.0,0.2,85.3092
1,Brosina Hoffman,Phones,911.424,4.0,0.2,68.3568
2,Zuschuss Donatelli,Art,8.56,2.0,0.0,2.4824
3,Zuschuss Donatelli,Phones,,3.0,0.2,16.011
4,Zuschuss Donatelli,Binders,22.72,4.0,0.2,7.384
5,Eric Hoffmann,Binders,11.648,,0.2,4.2224
6,Eric Hoffmann,Accessories,90.57,3.0,0.0,11.7741
7,Ruben Ausman,,77.88,2.0,0.0,
8,,Accessories,13.98,2.0,0.0,6.1512
9,Kunst Miller,Binders,25.824,6.0,0.2,9.3612


In [21]:
# Fill in all the missing values with the FILL string
df_missing.fillna('FILL')

Unnamed: 0,Customer,Product,Sales,Quantity,Discount,Profit
0,Brosina Hoffman,FILL,1706.18,9,0.2,85.3092
1,Brosina Hoffman,Phones,911.424,4,0.2,68.3568
2,Zuschuss Donatelli,Art,8.56,2,0.0,2.4824
3,Zuschuss Donatelli,Phones,FILL,3,0.2,16.011
4,Zuschuss Donatelli,Binders,22.72,4,0.2,7.384
5,Eric Hoffmann,Binders,11.648,FILL,0.2,4.2224
6,Eric Hoffmann,Accessories,90.57,3,0.0,11.7741
7,Ruben Ausman,FILL,77.88,2,0.0,FILL
8,FILL,Accessories,13.98,2,0.0,6.1512
9,Kunst Miller,Binders,25.824,6,0.2,9.3612


In [22]:
# Fill in the values using ffill
df_missing['Sales'].fillna(method='ffill')

0     1706.184
1      911.424
2        8.560
3        8.560
4       22.720
5       11.648
6       90.570
7       77.880
8       13.980
9       25.824
10     146.730
Name: Sales, dtype: float64

In [23]:
# Use bfill to fill backward
df_missing['Sales'].fillna(method='bfill')

0     1706.184
1      911.424
2        8.560
3       22.720
4       22.720
5       11.648
6       90.570
7       77.880
8       13.980
9       25.824
10     146.730
Name: Sales, dtype: float64

In [24]:
# Fill the missing values in Sales by the average sales amount
df_missing['Sales'].fillna(df_missing.mean()['Sales'])

0     1706.184
1      911.424
2        8.560
3      301.552
4       22.720
5       11.648
6       90.570
7       77.880
8       13.980
9       25.824
10     146.730
Name: Sales, dtype: float64

In [25]:
# To set the axis parameter to zero and drop all missing rows
df_missing.dropna(axis=0)

Unnamed: 0,Customer,Product,Sales,Quantity,Discount,Profit
1,Brosina Hoffman,Phones,911.424,4.0,0.2,68.3568
2,Zuschuss Donatelli,Art,8.56,2.0,0.0,2.4824
4,Zuschuss Donatelli,Binders,22.72,4.0,0.2,7.384
6,Eric Hoffmann,Accessories,90.57,3.0,0.0,11.7741
9,Kunst Miller,Binders,25.824,6.0,0.2,9.3612
10,Kunst Miller,Paper,146.73,3.0,0.0,68.9631


In [26]:
# To set the axis parameter to 1 and drop all missing rows
df_missing.dropna(axis=1)

Unnamed: 0,Discount
0,0.2
1,0.2
2,0.0
3,0.2
4,0.2
5,0.2
6,0.0
7,0.0
8,0.0
9,0.2


In [27]:
# Drop the values with axis set to 1 and thresh set to 10
df_missing.dropna(axis=1, thresh=10)

Unnamed: 0,Customer,Sales,Quantity,Discount,Profit
0,Brosina Hoffman,1706.184,9.0,0.2,85.3092
1,Brosina Hoffman,911.424,4.0,0.2,68.3568
2,Zuschuss Donatelli,8.56,2.0,0.0,2.4824
3,Zuschuss Donatelli,,3.0,0.2,16.011
4,Zuschuss Donatelli,22.72,4.0,0.2,7.384
5,Eric Hoffmann,11.648,,0.2,4.2224
6,Eric Hoffmann,90.57,3.0,0.0,11.7741
7,Ruben Ausman,77.88,2.0,0.0,
8,,13.98,2.0,0.0,6.1512
9,Kunst Miller,25.824,6.0,0.2,9.3612


In [28]:
# import data
df = pd.read_excel("datasets/Sample - Superstore.xls")
df.head()

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,...,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit
0,1,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.9136
1,2,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3,0.0,219.582
2,3,CA-2016-138688,2016-06-12,2016-06-16,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,...,90036,West,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62,2,0.0,6.8714
3,4,US-2015-108966,2015-10-11,2015-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,FUR-TA-10000577,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table,957.5775,5,0.45,-383.031
4,5,US-2015-108966,2015-10-11,2015-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,OFF-ST-10000760,Office Supplies,Storage,Eldon Fold 'N Roll Cart System,22.368,2,0.2,2.5164


In [29]:
# Sample 4 records each to create three DataFrames at random from the original sales dataset
df_1 = df[['Customer Name','State','Sales','Profit']].sample(n=4)
df_2 = df[['Customer Name','State','Sales','Profit']].sample(n=4)
df_3 = df[['Customer Name','State','Sales','Profit']].sample(n=4)

In [30]:
# Create a combined DataFrame with all the rows concatenated
df_cat1 = pd.concat([df_1, df_2, df_3], axis=0)
df_cat1

Unnamed: 0,Customer Name,State,Sales,Profit
4833,Maribeth Schnelling,Wisconsin,12.39,5.8233
3620,Sally Hughsby,Tennessee,40.776,4.5873
7808,Vivek Sundaresam,North Carolina,47.616,5.952
386,Dean Katz,Pennsylvania,482.34,-337.638
3102,Jason Gross,Rhode Island,220.98,50.8254
7088,Jennifer Ferguson,Nevada,79.36,23.808
5400,Muhammed Lee,Washington,71.98,15.1158
1962,Robert Waldorf,Missouri,13.71,6.5808
804,Gary McGarr,California,18.28,6.2152
1238,Greg Matthias,Washington,12.35,5.434


In [31]:
# Create a combined DataFrame with all the columns concatenated
df_cat2 = pd.concat([df_1, df_2, df_3], axis=1)
df_cat2

Unnamed: 0,Customer Name,State,Sales,Profit,Customer Name.1,State.1,Sales.1,Profit.1,Customer Name.2,State.2,Sales.2,Profit.2
67,,,,,,,,,Brendan Sweed,Arizona,1113.024,111.3024
386,Dean Katz,Pennsylvania,482.34,-337.638,,,,,,,,
804,,,,,,,,,Gary McGarr,California,18.28,6.2152
1238,,,,,,,,,Greg Matthias,Washington,12.35,5.434
1950,,,,,,,,,Ann Chong,New York,26.432,8.9208
1962,,,,,Robert Waldorf,Missouri,13.71,6.5808,,,,
3102,,,,,Jason Gross,Rhode Island,220.98,50.8254,,,,
3620,Sally Hughsby,Tennessee,40.776,4.5873,,,,,,,,
4833,Maribeth Schnelling,Wisconsin,12.39,5.8233,,,,,,,,
5400,,,,,Muhammed Lee,Washington,71.98,15.1158,,,,


In [32]:
# Create the df1 DataFrame with the Customer Name common key
df_1 = df[['Ship Date','Ship Mode','Customer Name']][0:4]
df_1

Unnamed: 0,Ship Date,Ship Mode,Customer Name
0,2016-11-11,Second Class,Claire Gute
1,2016-11-11,Second Class,Claire Gute
2,2016-06-16,Second Class,Darrin Van Huff
3,2015-10-18,Standard Class,Sean O'Donnell


In [33]:
# Create the second DataFrame, df2, with the Customer Name common key
df_2 = df[['Customer Name','Product Name','Quantity']][0:4]
df_2

Unnamed: 0,Customer Name,Product Name,Quantity
0,Claire Gute,Bush Somerset Collection Bookcase,2
1,Claire Gute,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",3
2,Darrin Van Huff,Self-Adhesive Address Labels for Typewriters b...,2
3,Sean O'Donnell,Bretford CR4500 Series Slim Rectangular Table,5


In [34]:
# Join these two tables with an inner join
pd.merge(df_1, df_2, on='Customer Name', how='inner')

Unnamed: 0,Ship Date,Ship Mode,Customer Name,Product Name,Quantity
0,2016-11-11,Second Class,Claire Gute,Bush Somerset Collection Bookcase,2
1,2016-11-11,Second Class,Claire Gute,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",3
2,2016-11-11,Second Class,Claire Gute,Bush Somerset Collection Bookcase,2
3,2016-11-11,Second Class,Claire Gute,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",3
4,2016-06-16,Second Class,Darrin Van Huff,Self-Adhesive Address Labels for Typewriters b...,2
5,2015-10-18,Standard Class,Sean O'Donnell,Bretford CR4500 Series Slim Rectangular Table,5


In [35]:
# Drop the duplicates
pd.merge(df_1,df_2,on='Customer Name', how='inner').drop_duplicates()

Unnamed: 0,Ship Date,Ship Mode,Customer Name,Product Name,Quantity
0,2016-11-11,Second Class,Claire Gute,Bush Somerset Collection Bookcase,2
1,2016-11-11,Second Class,Claire Gute,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",3
4,2016-06-16,Second Class,Darrin Van Huff,Self-Adhesive Address Labels for Typewriters b...,2
5,2015-10-18,Standard Class,Sean O'Donnell,Bretford CR4500 Series Slim Rectangular Table,5


In [36]:
# Extract another small table called df_3
df_3=df[['Customer Name','Product Name','Quantity']][2:6]
df_3

Unnamed: 0,Customer Name,Product Name,Quantity
2,Darrin Van Huff,Self-Adhesive Address Labels for Typewriters b...,2
3,Sean O'Donnell,Bretford CR4500 Series Slim Rectangular Table,5
4,Sean O'Donnell,Eldon Fold 'N Roll Cart System,2
5,Brosina Hoffman,Eldon Expressions Wood and Plastic Desk Access...,7


In [37]:
# Perform an inner join on df_1 and df_3
pd.merge(df_1,df_3,on='Customer Name', how='inner').drop_duplicates()

Unnamed: 0,Ship Date,Ship Mode,Customer Name,Product Name,Quantity
0,2016-06-16,Second Class,Darrin Van Huff,Self-Adhesive Address Labels for Typewriters b...,2
1,2015-10-18,Standard Class,Sean O'Donnell,Bretford CR4500 Series Slim Rectangular Table,5
2,2015-10-18,Standard Class,Sean O'Donnell,Eldon Fold 'N Roll Cart System,2


In [38]:
# perform an outer join on df_1 and df_3
pd.merge(df_1,df_3,on='Customer Name', how='outer').drop_duplicates()

Unnamed: 0,Ship Date,Ship Mode,Customer Name,Product Name,Quantity
0,2016-11-11,Second Class,Claire Gute,,
2,2016-06-16,Second Class,Darrin Van Huff,Self-Adhesive Address Labels for Typewriters b...,2.0
3,2015-10-18,Standard Class,Sean O'Donnell,Bretford CR4500 Series Slim Rectangular Table,5.0
4,2015-10-18,Standard Class,Sean O'Donnell,Eldon Fold 'N Roll Cart System,2.0
5,NaT,,Brosina Hoffman,Eldon Expressions Wood and Plastic Desk Access...,7.0


In [39]:
# Create df1 with Customer Name as the index
df_1 = df[['Customer Name','Ship Date','Ship Mode']][0:4]
df_1.set_index(['Customer Name'],inplace=True)
df_1

Unnamed: 0_level_0,Ship Date,Ship Mode
Customer Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Claire Gute,2016-11-11,Second Class
Claire Gute,2016-11-11,Second Class
Darrin Van Huff,2016-06-16,Second Class
Sean O'Donnell,2015-10-18,Standard Class


In [40]:
# Create df2 with Customer Name as the index
df_2 = df[['Customer Name','Product Name','Quantity']][2:6]
df_2.set_index(['Customer Name'],inplace=True) 
df_2

Unnamed: 0_level_0,Product Name,Quantity
Customer Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Darrin Van Huff,Self-Adhesive Address Labels for Typewriters b...,2
Sean O'Donnell,Bretford CR4500 Series Slim Rectangular Table,5
Sean O'Donnell,Eldon Fold 'N Roll Cart System,2
Brosina Hoffman,Eldon Expressions Wood and Plastic Desk Access...,7


In [41]:
# Perform a left join on df_1 and df_2
df_1.join(df_2, how='left').drop_duplicates()

Unnamed: 0_level_0,Ship Date,Ship Mode,Product Name,Quantity
Customer Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Claire Gute,2016-11-11,Second Class,,
Darrin Van Huff,2016-06-16,Second Class,Self-Adhesive Address Labels for Typewriters b...,2.0
Sean O'Donnell,2015-10-18,Standard Class,Bretford CR4500 Series Slim Rectangular Table,5.0
Sean O'Donnell,2015-10-18,Standard Class,Eldon Fold 'N Roll Cart System,2.0


In [42]:
# Perform a right join on df_1 and df_2
df_1.join(df_2, how='right').drop_duplicates()

Unnamed: 0_level_0,Ship Date,Ship Mode,Product Name,Quantity
Customer Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Brosina Hoffman,NaT,,Eldon Expressions Wood and Plastic Desk Access...,7
Darrin Van Huff,2016-06-16,Second Class,Self-Adhesive Address Labels for Typewriters b...,2
Sean O'Donnell,2015-10-18,Standard Class,Bretford CR4500 Series Slim Rectangular Table,5
Sean O'Donnell,2015-10-18,Standard Class,Eldon Fold 'N Roll Cart System,2


In [43]:
# Perform an inner join on df_1 and df_2
df_1.join(df_2, how='inner').drop_duplicates()

Unnamed: 0_level_0,Ship Date,Ship Mode,Product Name,Quantity
Customer Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Darrin Van Huff,2016-06-16,Second Class,Self-Adhesive Address Labels for Typewriters b...,2
Sean O'Donnell,2015-10-18,Standard Class,Bretford CR4500 Series Slim Rectangular Table,5
Sean O'Donnell,2015-10-18,Standard Class,Eldon Fold 'N Roll Cart System,2


In [44]:
# Perform an outer join on df_1 and df_2
df_1.join(df_2, how='outer').drop_duplicates()

Unnamed: 0_level_0,Ship Date,Ship Mode,Product Name,Quantity
Customer Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Brosina Hoffman,NaT,,Eldon Expressions Wood and Plastic Desk Access...,7.0
Claire Gute,2016-11-11,Second Class,,
Darrin Van Huff,2016-06-16,Second Class,Self-Adhesive Address Labels for Typewriters b...,2.0
Sean O'Donnell,2015-10-18,Standard Class,Bretford CR4500 Series Slim Rectangular Table,5.0
Sean O'Donnell,2015-10-18,Standard Class,Eldon Fold 'N Roll Cart System,2.0


In [45]:
# Specify the number of samples that we require from the DataFrame
df.sample(n=5)

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,...,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit
9025,9026,CA-2016-102127,2016-06-26,2016-07-02,Standard Class,VP-21760,Victoria Pisteka,Corporate,United States,New York City,...,10035,East,TEC-AC-10000397,Technology,Accessories,"Perixx PERIBOARD-512B, Ergonomic Split Keyboard",139.96,4,0.0,9.7972
3301,3302,US-2014-159926,2014-11-18,2014-11-22,Standard Class,CS-11950,Carlos Soltero,Consumer,United States,Philadelphia,...,19140,East,TEC-PH-10001128,Technology,Phones,Motorola Droid Maxx,539.964,6,0.4,-107.9928
8333,8334,CA-2016-153269,2016-03-09,2016-03-12,First Class,PS-18760,Pamela Stobb,Consumer,United States,Andover,...,1810,East,OFF-PA-10001801,Office Supplies,Paper,Xerox 193,17.94,3,0.0,8.7906
3490,3491,CA-2015-157322,2015-07-02,2015-07-06,Standard Class,RH-19600,Rob Haberlin,Consumer,United States,Carol Stream,...,60188,Central,OFF-ST-10004507,Office Supplies,Storage,Advantus Rolling Storage Box,68.6,5,0.2,6.0025
324,325,CA-2016-162138,2016-04-23,2016-04-27,Standard Class,GK-14620,Grace Kelly,Corporate,United States,Hesperia,...,92345,West,TEC-AC-10001908,Technology,Accessories,Logitech Wireless Headset h800,99.99,1,0.0,34.9965


In [46]:
# Specify a definite fraction of the data to be sampled
df.sample(frac=0.1)

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,...,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit
8564,8565,US-2017-108315,2017-04-30,2017-05-04,Standard Class,MH-18115,Mick Hernandez,Home Office,United States,Sanford,...,32771,South,FUR-FU-10000747,Furniture,Furnishings,Tenex B1-RE Series Chair Mats for Low Pile Car...,220.704,6,0.2,-8.2764
8438,8439,CA-2014-108861,2014-05-28,2014-06-01,Standard Class,MM-17260,Magdelene Morse,Consumer,United States,Seattle,...,98105,West,OFF-BI-10003876,Office Supplies,Binders,"Green Canvas Binder for 8-1/2"" x 14"" Sheets",136.960,4,0.2,51.3600
4872,4873,CA-2017-164042,2017-05-23,2017-05-27,Standard Class,KL-16645,Ken Lonsdale,Consumer,United States,Houston,...,77095,Central,OFF-AP-10001947,Office Supplies,Appliances,Acco 6 Outlet Guardian Premium Plus Surge Supp...,18.320,5,0.8,-46.7160
6044,6045,CA-2017-145702,2017-05-19,2017-05-24,Second Class,AH-10075,Adam Hart,Corporate,United States,Knoxville,...,37918,South,OFF-PA-10001526,Office Supplies,Paper,Xerox 1949,27.888,7,0.2,10.1094
347,348,CA-2017-134306,2017-07-08,2017-07-12,Standard Class,TD-20995,Tamara Dahlen,Consumer,United States,Lowell,...,1852,East,OFF-PA-10000249,Office Supplies,Paper,Easy-staple paper,24.560,2,0.0,11.5432
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8089,8090,US-2015-117492,2015-08-07,2015-08-09,First Class,NS-18640,Noel Staavos,Corporate,United States,Baltimore,...,21215,East,OFF-AP-10001492,Office Supplies,Appliances,"Acco Six-Outlet Power Strip, 4' Cord Length",77.580,9,0.0,20.1708
907,908,CA-2017-143259,2017-12-30,2018-01-03,Standard Class,PO-18865,Patrick O'Donnell,Consumer,United States,New York City,...,10009,East,TEC-PH-10004774,Technology,Phones,Gear Head AU3700S Headset,90.930,7,0.0,2.7279
9017,9018,CA-2016-161361,2016-03-26,2016-03-28,Second Class,MV-17485,Mark Van Huff,Consumer,United States,Pocatello,...,83201,West,OFF-PA-10001838,Office Supplies,Paper,Adams Telephone Message Book W/Dividers/Space ...,17.640,3,0.0,8.6436
9447,9448,CA-2017-136882,2017-05-27,2017-06-03,Standard Class,DN-13690,Duane Noonan,Consumer,United States,Tulsa,...,74133,Central,FUR-FU-10003664,Furniture,Furnishings,"Electrix Architect's Clamp-On Swing Arm Lamp, ...",477.300,5,0.0,138.4170


In [47]:
# Take a random sample of 15 records
df_sample = df[['Customer Name','State', 'Sales','Quantity']].sample(n=15)
df_sample

Unnamed: 0,Customer Name,State,Sales,Quantity
6597,Stewart Visinsky,Florida,1.365,1
9057,Stewart Visinsky,California,74.76,7
269,Christine Abelman,Ohio,57.576,3
9269,Pete Kriz,New York,32.088,7
1702,Natalie DeCherney,New York,9.96,2
812,Zuschuss Carroll,Washington,25.78,2
8272,Muhammed Yedwab,California,387.136,4
2392,Speros Goranitis,New York,113.92,4
1101,Paul Van Hugh,California,8.64,3
5921,Peter Fuller,New York,110.96,2


In [48]:
# Sort the values with respect to Sales
df_sample.sort_values(by='Sales')

Unnamed: 0,Customer Name,State,Sales,Quantity
6597,Stewart Visinsky,Florida,1.365,1
9873,Sara Luxemburg,New York,6.48,1
1101,Paul Van Hugh,California,8.64,3
1702,Natalie DeCherney,New York,9.96,2
5078,Catherine Glotzbach,Wisconsin,18.24,3
812,Zuschuss Carroll,Washington,25.78,2
7225,Raymond Buch,California,30.8,4
9269,Pete Kriz,New York,32.088,7
7105,Katharine Harms,Ohio,38.784,3
269,Christine Abelman,Ohio,57.576,3


In [49]:
# Sort the values with respect to Sales and State
df_sample.sort_values(by=['Sales', 'State'])

Unnamed: 0,Customer Name,State,Sales,Quantity
6597,Stewart Visinsky,Florida,1.365,1
9873,Sara Luxemburg,New York,6.48,1
1101,Paul Van Hugh,California,8.64,3
1702,Natalie DeCherney,New York,9.96,2
5078,Catherine Glotzbach,Wisconsin,18.24,3
812,Zuschuss Carroll,Washington,25.78,2
7225,Raymond Buch,California,30.8,4
9269,Pete Kriz,New York,32.088,7
7105,Katharine Harms,Ohio,38.784,3
269,Christine Abelman,Ohio,57.576,3


In [50]:
# Create a user-defined function
def categorize_sales(price):
    if price < 50:
        return "Low"
    elif price < 200:
        return "Medium"
    else:
        return "High"

In [51]:
# Sample 100 records randomly from the database
df_sample=df[['Customer Name','State','Sales']].sample(n=100)

In [52]:
# apply the categorization function to the Sales column
df_sample['Sales Price Category'] = df_sample['Sales'].apply(categorize_sales)
df_sample.head()

Unnamed: 0,Customer Name,State,Sales,Sales Price Category
8557,Laura Armstrong,Texas,453.576,High
8993,Victoria Pisteka,Tennessee,1875.258,High
568,Ellis Ballard,Washington,19.44,Low
854,Greg Tran,New York,242.94,High
1954,Ann Chong,New York,4.92,Low


In [53]:
# create another column for storing the length of the name of the customer
df_sample['Customer Name Length'] = df_sample['Customer Name'].apply(len)
df_sample.head()

Unnamed: 0,Customer Name,State,Sales,Sales Price Category,Customer Name Length
8557,Laura Armstrong,Texas,453.576,High,15
8993,Victoria Pisteka,Tennessee,1875.258,High,16
568,Ellis Ballard,Washington,19.44,Low,13
854,Greg Tran,New York,242.94,High,9
1954,Ann Chong,New York,4.92,Low,9


In [54]:
# Use a lambda function and the apply method
df_sample['Discounted Price'] = df_sample['Sales'].apply(lambda x: 0.85*x if x>200 else x)
df_sample.head()

Unnamed: 0,Customer Name,State,Sales,Sales Price Category,Customer Name Length,Discounted Price
8557,Laura Armstrong,Texas,453.576,High,15,385.5396
8993,Victoria Pisteka,Tennessee,1875.258,High,16,1593.9693
568,Ellis Ballard,Washington,19.44,Low,13,19.44
854,Greg Tran,New York,242.94,High,9,206.499
1954,Ann Chong,New York,4.92,Low,9,4.92
