In [1]:
import pandas as pd

In [2]:
# Load the four restaurant CSV files, in order, into their DataFrames.
week1, week2, customers, foods = map(
    pd.read_csv,
    (
        'Datasets/Restaurant - Week 1 Sales.csv',
        'Datasets/Restaurant - Week 2 Sales.csv',
        'Datasets/Restaurant - Customers.csv',
        'Datasets/Restaurant - Foods.csv',
    ),
)

# # pd.concat() method.

#### The pd.concat() method concatenates 2 or more DataFrames with the same column-headers, one below the other.
#### If the column-headers are not the same, it still concatenates row-wise, but the result contains the union of all the columns (which increases the number of columns) and the missing values are filled with NaN.
#### By default it concatenates row-wise; use the "axis" parameter to change this.
#### While concatenating row-wise, every row retains the row-header (index label) it had in its original DataFrame, so labels can repeat.
#### The "objs" parameter takes a list of DataFrames & concatenates them in the order they appear in the list.

In [8]:
# First frame: plain integer values, written one list per column.
k = pd.DataFrame({'units': [10, 11, 45], 'tense': [5, 20, 12]})

# Second frame: the same column-headers, but holding complex numbers.
j = pd.DataFrame({'units': [5 + 3j, 45, 5 + 4j], 'tense': [5 - 9j, 1, 4]})

# Stack j below k; the two frames share their column-headers.
pd.concat(objs=[k, j])

Unnamed: 0,units,tense
0,10.000000+0.000000j,5.000000+0.000000j
1,11.000000+0.000000j,20.000000+0.000000j
2,45.000000+0.000000j,12.000000+0.000000j
0,5.000000+3.000000j,5.000000-9.000000j
1,45.000000+0.000000j,1.000000+0.000000j
2,5.000000+4.000000j,4.000000+0.000000j


In [3]:
# Concatenating the DataFrames week1 & week2 row-wise (week2 is placed below week1).
# Each row keeps the index label it had in its own DataFrame.

pd.concat(objs=[week1, week2])

Unnamed: 0,Customer ID,Food ID
0,537,9
1,97,4
2,658,1
3,202,2
4,155,9
...,...,...
245,783,10
246,556,10
247,547,9
248,252,9


#### # Set "axis=1" to concatenate the DataFrames by columns (side by side).
#### # Rows are aligned on their index; if one DataFrame has index labels that the other lacks, those rows are filled with NaN values.

In [10]:
# Concatenating column-wise (axis=1): week2's columns are placed to the right of week1's.
# "keys" adds a top-level column-header naming the DataFrame each column came from.

pd.concat(objs=[week1, week2], axis=1, keys=['week1', 'week2'])

Unnamed: 0_level_0,week1,week1,week2,week2
Unnamed: 0_level_1,Customer ID,Food ID,Customer ID,Food ID
0,537,9,688,10
1,97,4,813,7
2,658,1,495,10
3,202,2,189,5
4,155,9,267,3
...,...,...,...,...
245,413,9,783,10
246,926,6,556,10
247,134,3,547,9
248,396,6,252,9


#### # One can set "ignore_index=True" to discard the original row-headers and give the output DataFrame a fresh 0 to n-1 sequence of row-headers.

In [7]:
# Concatenating the DataFrames while ignoring their original indexes,
# so the result is numbered with one continuous sequence of row-headers.

pd.concat(objs=[week1, week2], ignore_index=True)

Unnamed: 0,Customer ID,Food ID
0,537,9
1,97,4
2,658,1
3,202,2
4,155,9
...,...,...
495,783,10
496,556,10
497,547,9
498,252,9


#### # If 2 DataFrames are concatenated row-wise, the rows retain their previous index labels, so 2 or more rows can end up with the same row-index.
#### # A way to uniquely identify rows is to give each concatenated DataFrame a "key", which becomes the outermost level of a multi-index on the result.
#### # One can assign the keys with the "keys" parameter, which takes a list containing one key per DataFrame, in the same order as the "objs" list.

In [9]:
# Concatenating week1 & week2 one below the other, labelling each with a key.
# The keys form the outer level of the resulting multi-index.

pd.concat(objs=[week1, week2], keys=['week1', 'week2'])

Unnamed: 0,Unnamed: 1,Customer ID,Food ID
week1,0,537,9
week1,1,97,4
week1,2,658,1
week1,3,202,2
week1,4,155,9
...,...,...,...
week2,245,783,10
week2,246,556,10
week2,247,547,9
week2,248,252,9


# # df.merge() method.
## creating inner joins.

#### df.merge() can be used to create a joined table, similar to SQL joins. By default it performs an inner join.
#### The DataFrame on which merge is called is the left table, while the DataFrame passed to it is the right table.

In [11]:
# Extracting customers that arrived in both week1 & week2 (inner join on 'Customer ID').
# A 'Customer ID' can repeat in the output because it may appear more than once
# in either DataFrame, each time with a different value in the other columns.

week1.merge(week2, on='Customer ID')

Unnamed: 0,Customer ID,Food ID_x,Food ID_y
0,537,9,5
1,155,9,3
2,155,1,3
3,503,5,8
4,503,5,9
...,...,...,...
57,945,5,4
58,343,3,5
59,343,3,2
60,343,3,7


In [12]:
# Same join, with "suffixes" replacing the default '_x' / '_y' endings on the overlapping 'Food ID' columns.
week1.merge(week2, on='Customer ID', suffixes=['-W1', '-W2'])

Unnamed: 0,Customer ID,Food ID-W1,Food ID-W2
0,537,9,5
1,155,9,3
2,155,1,3
3,503,5,8
4,503,5,9
...,...,...,...
57,945,5,4
58,343,3,5
59,343,3,2
60,343,3,7


### # merging multiple columns.

In [13]:
# Extracting customers that arrived in both weeks & ordered the same food both times.
# Passing a list to "on" requires every listed column to match.

week1.merge(week2, on=['Customer ID', 'Food ID'])

Unnamed: 0,Customer ID,Food ID
0,304,3
1,540,3
2,937,10
3,233,3
4,21,4
5,21,4
6,922,1
7,578,5
8,578,5


## # Creating Outer Joins.

In [4]:
# Creating an outer joined DataFrame from week1 & week2.
# Customers present in only one week get NaN in the other week's 'Food ID' column.

week1.merge(week2, how='outer', on='Customer ID', suffixes=['-W1', '-W2'])

Unnamed: 0,Customer ID,Food ID-W1,Food ID-W2
0,537,9.0,5.0
1,97,4.0,
2,658,1.0,
3,202,2.0,
4,155,9.0,3.0
...,...,...,...
449,855,,4.0
450,559,,10.0
451,276,,4.0
452,556,,10.0


### # The "indicator" parameter adds a new "_merge" column that states where each row's key was found: only in the left DataFrame ("left_only"), only in the right DataFrame ("right_only"), or in both ("both").

In [8]:
# Same outer join, with indicator=True adding the '_merge' column; kept in `merged` for the cells below.
merged = week1.merge(week2, how='outer', on='Customer ID', suffixes=['-W1', '-W2'], indicator=True)
merged

Unnamed: 0,Customer ID,Food ID-W1,Food ID-W2,_merge
0,537,9.0,5.0,both
1,97,4.0,,left_only
2,658,1.0,,left_only
3,202,2.0,,left_only
4,155,9.0,3.0,both
...,...,...,...,...
449,855,,4.0,right_only
450,559,,10.0,right_only
451,276,,4.0,right_only
452,556,,10.0,right_only


### # Creating a mutually exclusive outer join.

In [10]:
# Counting the rows of `merged` by origin: week2 only, week1 only & both weeks.
# NOTE: these are row counts; a customer who ordered more than once contributes more than one row.

merged['_merge'].value_counts()

right_only    197
left_only     195
both           62
Name: _merge, dtype: int64

In [12]:
# Getting a boolean Series that is True for mutually exclusive customers,
# i.e. rows that came from only one of the two weeks.

mask = merged['_merge'].isin(['left_only', 'right_only'])
mask

0      False
1       True
2       True
3       True
4      False
       ...  
449     True
450     True
451     True
452     True
453     True
Name: _merge, Length: 454, dtype: bool

In [13]:
# Keep only the rows flagged by `mask`: customers seen in exactly one of the two weeks.

merged.loc[mask]

Unnamed: 0,Customer ID,Food ID-W1,Food ID-W2,_merge
1,97,4.0,,left_only
2,658,1.0,,left_only
3,202,2.0,,left_only
6,213,8.0,,left_only
7,600,1.0,,left_only
...,...,...,...,...
449,855,,4.0,right_only
450,559,,10.0,right_only
451,276,,4.0,right_only
452,556,,10.0,right_only


## # Creating a left join.

#### A left join attaches the right DataFrame to the left DataFrame using a common column. Every row of the left DataFrame is kept; for each of them the matching rows of the right DataFrame supply the extra columns, and right-hand rows with no match in the left DataFrame are ignored. If the right DataFrame has no row for a value in the left DataFrame's column, the columns coming from the right are filled with NaN.

In [16]:
# Joining foods DataFrame on week1 DataFrame.
# Every week1 row is kept; because each 'Food ID' appears only once in foods,
# the output DataFrame has the same number of rows as week1.
# validate='many_to_one' makes pandas raise a MergeError if foods ever contains
# a duplicated 'Food ID', which would otherwise silently add rows to the output.

week1.merge(foods, how='left', on='Food ID', validate='many_to_one')

Unnamed: 0,Customer ID,Food ID,Food Item,Price
0,537,9,Donut,0.99
1,97,4,Quesadilla,4.25
2,658,1,Sushi,3.99
3,202,2,Burrito,9.99
4,155,9,Donut,0.99
...,...,...,...,...
245,413,9,Donut,0.99
246,926,6,Pasta,13.99
247,134,3,Taco,2.99
248,396,6,Pasta,13.99


In [17]:
# Joining foods DataFrame on week1 DataFrame; sort=True orders the result by the join key ('Food ID').

week1.merge(foods, how='left', on='Food ID', sort=True)

Unnamed: 0,Customer ID,Food ID,Food Item,Price
0,658,1,Sushi,3.99
1,600,1,Sushi,3.99
2,155,1,Sushi,3.99
3,341,1,Sushi,3.99
4,20,1,Sushi,3.99
...,...,...,...,...
245,809,10,Drink,1.75
246,584,10,Drink,1.75
247,274,10,Drink,1.75
248,151,10,Drink,1.75


### # Creating a join on DataFrames with different column names.

#### Here, the "ID" column in the customers DataFrame holds the same values as the "Customer ID" column in the week1 DataFrame, but the two column-headers are not identical.

#### Use the "left_on" & "right_on" parameters of the df.merge() method to create joins on columns with different names.

In [20]:
# Previewing week1: the customer key is stored in the 'Customer ID' column.
week1.head(3)

Unnamed: 0,Customer ID,Food ID
0,537,9
1,97,4
2,658,1


In [21]:
# Previewing customers: the customer key is stored here in a column named 'ID'.
customers.head(3)

Unnamed: 0,ID,First Name,Last Name,Gender,Company,Occupation
0,1,Joseph,Perkins,Male,Dynazzy,Community Outreach Specialist
1,2,Jennifer,Alvarez,Female,DabZ,Senior Quality Engineer
2,3,Roger,Black,Male,Tagfeed,Account Executive


### # "left_on" takes the column-header of the left table as an argument and the "right_on" takes the column-header of the right table as an argument on which we want to perform the join.

In [18]:
# Left join of customers onto week1, matching week1's 'Customer ID' with customers' 'ID'.
# Both key columns are kept in the output, so 'ID' repeats the values of 'Customer ID'.
week1.merge(customers, how='left', left_on='Customer ID', right_on='ID', sort=True)

Unnamed: 0,Customer ID,Food ID,ID,First Name,Last Name,Gender,Company,Occupation
0,3,2,3,Roger,Black,Male,Tagfeed,Account Executive
1,10,2,10,Steven,Ryan,Male,Twinder,Community Outreach Specialist
2,20,1,20,Lisa,Rice,Female,Oloo,Programmer IV
3,21,4,21,Albert,Burns,Male,Rhynoodle,Junior Executive
4,21,4,21,Albert,Burns,Male,Rhynoodle,Junior Executive
...,...,...,...,...,...,...,...,...
245,966,5,966,Robert,Ford,Male,Jabbertype,Account Representative IV
246,968,1,968,Teresa,Reynolds,Female,Flashdog,Budget/Accounting Analyst IV
247,985,5,985,Julia,Ortiz,Female,Kwideo,Structural Analysis Engineer
248,991,2,991,Melissa,Wells,Female,Lazzy,Senior Sales Associate


In [19]:
# Left join of customers onto week1, minus the redundant 'ID' column that duplicates 'Customer ID'.

(
    week1
    .merge(customers, how='left', left_on='Customer ID', right_on='ID', sort=True)
    .drop('ID', axis='columns')
)

Unnamed: 0,Customer ID,Food ID,First Name,Last Name,Gender,Company,Occupation
0,3,2,Roger,Black,Male,Tagfeed,Account Executive
1,10,2,Steven,Ryan,Male,Twinder,Community Outreach Specialist
2,20,1,Lisa,Rice,Female,Oloo,Programmer IV
3,21,4,Albert,Burns,Male,Rhynoodle,Junior Executive
4,21,4,Albert,Burns,Male,Rhynoodle,Junior Executive
...,...,...,...,...,...,...,...
245,966,5,Robert,Ford,Male,Jabbertype,Account Representative IV
246,968,1,Teresa,Reynolds,Female,Flashdog,Budget/Accounting Analyst IV
247,985,5,Julia,Ortiz,Female,Kwideo,Structural Analysis Engineer
248,991,2,Melissa,Wells,Female,Lazzy,Senior Sales Associate


### # Creating joins on indexes (row-headers).
#### Use the "left_index" & "right_index" parameters of the df.merge() method to merge on the basis of row-indexes, i.e. when a column (or the row-index) of one DataFrame has values in common with the row-index of the other DataFrame.

In [24]:
# Re-index customers by its 'ID' column; the original `customers` frame is left untouched.

new_customers = customers.set_index('ID')
new_customers

Unnamed: 0_level_0,First Name,Last Name,Gender,Company,Occupation
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Joseph,Perkins,Male,Dynazzy,Community Outreach Specialist
2,Jennifer,Alvarez,Female,DabZ,Senior Quality Engineer
3,Roger,Black,Male,Tagfeed,Account Executive
4,Steven,Evans,Male,Fatz,Registered Nurse
5,Judy,Morrison,Female,Demivee,Legal Assistant
...,...,...,...,...,...
996,Debra,Garcia,Female,Dazzlesphere,Structural Engineer
997,Douglas,Bishop,Male,Livepath,Developer I
998,Frank,Franklin,Male,Brainverse,Nurse Practicioner
999,Jessica,Burns,Female,Babbleblab,Financial Advisor


In [26]:
# Re-index foods by its 'Food ID' column; the original `foods` frame is left untouched.

new_foods = foods.set_index('Food ID')
new_foods

Unnamed: 0_level_0,Food Item,Price
Food ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Sushi,3.99
2,Burrito,9.99
3,Taco,2.99
4,Quesadilla,4.25
5,Pizza,2.49
6,Pasta,13.99
7,Steak,24.99
8,Salad,11.25
9,Donut,0.99
10,Drink,1.75


#### # Here, 'Customer ID' column in week1 is common with 'ID' row-index of new_customers.

In [28]:
# Joining new_customers on week1 DataFrame:
# week1's 'Customer ID' column is matched against new_customers' row-index ('ID').

week1.merge(new_customers, how='left', left_on='Customer ID', right_index=True)

Unnamed: 0,Customer ID,Food ID,First Name,Last Name,Gender,Company,Occupation
0,537,9,Cheryl,Carroll,Female,Zoombeat,Registered Nurse
1,97,4,Amanda,Watkins,Female,Ozu,Account Coordinator
2,658,1,Patrick,Webb,Male,Browsebug,Community Outreach Specialist
3,202,2,Louis,Campbell,Male,Rhynoodle,Account Representative III
4,155,9,Carolyn,Diaz,Female,Gigazoom,Database Administrator III
...,...,...,...,...,...,...,...
245,413,9,Diane,Bailey,Female,Wikibox,Technical Writer
246,926,6,Anne,Wagner,Female,Skyba,Legal Assistant
247,134,3,Diana,Hall,Female,Quinu,Financial Advisor
248,396,6,Juan,Romero,Male,Zoonder,Analyst Programmer


#### # Here, 'Food ID' column in week1 is common with 'Food ID' row-index of new_foods.

In [29]:
# Joining new_foods on week1 DataFrame:
# week1's 'Food ID' column is matched against new_foods' row-index ('Food ID').

week1.merge(new_foods, how='left', left_on='Food ID', right_index=True)

Unnamed: 0,Customer ID,Food ID,Food Item,Price
0,537,9,Donut,0.99
1,97,4,Quesadilla,4.25
2,658,1,Sushi,3.99
3,202,2,Burrito,9.99
4,155,9,Donut,0.99
...,...,...,...,...
245,413,9,Donut,0.99
246,926,6,Pasta,13.99
247,134,3,Taco,2.99
248,396,6,Pasta,13.99


In [31]:
# Joining week1 & week2 DataFrames using their row-indexes on both sides.
# All column-headers overlap, so each column receives its DataFrame's suffix.

week1.merge(week2, how='left', left_index=True, right_index=True, suffixes=['-W1', '-W2'])

Unnamed: 0,Customer ID-W1,Food ID-W1,Customer ID-W2,Food ID-W2
0,537,9,688,10
1,97,4,813,7
2,658,1,495,10
3,202,2,189,5
4,155,9,267,3
...,...,...,...,...
245,413,9,783,10
246,926,6,556,10
247,134,3,547,9
248,396,6,252,9


# # df.join() method

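#### It's a method to join 2 DataFrames side by side (column-wise) by matching their row-indexes; by default it performs a left join, so it is most convenient when both DataFrames share the same index.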

In [34]:
# Load the week-1 satisfaction ratings and preview the first three rows.

satisfaction = pd.read_csv('Datasets/Restaurant - Week 1 Satisfaction.csv')
satisfaction.iloc[:3]

Unnamed: 0,Satisfaction Rating
0,2
1,7
2,3


In [35]:
# Joining satisfaction on week1 by row-index, in two equivalent ways.
week1_merged = week1.merge(satisfaction, how='left', right_index=True, left_index=True)
week1_joined = week1.join(satisfaction)

# Only the last expression of a cell is displayed, so the equivalence of the two
# calls is checked explicitly instead of relying on a result that is thrown away.
pd.testing.assert_frame_equal(week1_merged, week1_joined)

week1_joined

Unnamed: 0,Customer ID,Food ID,Satisfaction Rating
0,537,9,2
1,97,4,7
2,658,1,3
3,202,2,7
4,155,9,10
...,...,...,...
245,413,9,1
246,926,6,2
247,134,3,8
248,396,6,10


# # pd.merge() method.

#### pd.merge() is similar to df.merge(); it is called at the module level and takes both the left and the right DataFrames as arguments.

In [36]:
# Joining foods on week1 by 'Food ID' with the module-level function.
# No "how" is passed, so pandas' default join type is used (inner, per the pandas docs).

pd.merge(week1, foods, on='Food ID')

Unnamed: 0,Customer ID,Food ID,Food Item,Price
0,537,9,Donut,0.99
1,155,9,Donut,0.99
2,961,9,Donut,0.99
3,147,9,Donut,0.99
4,680,9,Donut,0.99
...,...,...,...,...
245,809,10,Drink,1.75
246,584,10,Drink,1.75
247,274,10,Drink,1.75
248,151,10,Drink,1.75
