In [1]:
import pandas as pd

# Concatenation

In [2]:
#creating a dummy dataframe

df1_dummy = {
    "serial_id" : ["1","2","3","4","5"],
    "sale_month" : ["Jan", "Feb", "Mar", "Apr", "May"],
    "sales" : ["12300", "25100", "17800", "20100", "21000"]
}

In [3]:
df1 = pd.DataFrame(df1_dummy, columns = ["serial_id", "sale_month", "sales"])
df1

Unnamed: 0,serial_id,sale_month,sales
0,1,Jan,12300
1,2,Feb,25100
2,3,Mar,17800
3,4,Apr,20100
4,5,May,21000


In [4]:
# next dummy dataframe

df2_dummy = {
    "serial_id" : ["6","7","8","9","10"],
    "sale_month" : ["Jun", "Jul", "Aug", "Sep", "Oct"],
    "sales" : ["25000", "23700", "24600", "24000", "23950"]
}

In [5]:
df2 = pd.DataFrame(df2_dummy, columns = ["serial_id", "sale_month", "sales"])
df2

Unnamed: 0,serial_id,sale_month,sales
0,6,Jun,25000
1,7,Jul,23700
2,8,Aug,24600
3,9,Sep,24000
4,10,Oct,23950


In [6]:
# 3rd dummy data set that specifies yes or no depending on whether the sales threshold has been met

df3_dummy = {
    "sales_threshold" : ["No", "Yes", "No", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes"],
    "bonus_threshold" : ["No", "Yes", "No", "No", "No", "Yes", "No", "Yes", "Yes", "No"]
}

In [7]:
df3 = pd.DataFrame(df3_dummy, columns = ["sales_threshold", "bonus_threshold"])
df3

Unnamed: 0,sales_threshold,bonus_threshold
0,No,No
1,Yes,Yes
2,No,No
3,Yes,No
4,Yes,No
5,Yes,Yes
6,Yes,No
7,Yes,Yes
8,Yes,Yes
9,Yes,No


In [8]:
#Concatenating the first and second DF row-wise

df_row = pd.concat([df1, df2], ignore_index=True) #we ignore the index so we can later concatenate column wise with df3
df_row

Unnamed: 0,serial_id,sale_month,sales
0,1,Jan,12300
1,2,Feb,25100
2,3,Mar,17800
3,4,Apr,20100
4,5,May,21000
5,6,Jun,25000
6,7,Jul,23700
7,8,Aug,24600
8,9,Sep,24000
9,10,Oct,23950


In [9]:
# Concatening the new DF with the 3rd DF Column-wise

df_full = pd.concat([df_row, df3], axis=1) #row-wise is default and is axis=0
df_full

Unnamed: 0,serial_id,sale_month,sales,sales_threshold,bonus_threshold
0,1,Jan,12300,No,No
1,2,Feb,25100,Yes,Yes
2,3,Mar,17800,No,No
3,4,Apr,20100,Yes,No
4,5,May,21000,Yes,No
5,6,Jun,25000,Yes,Yes
6,7,Jul,23700,Yes,No
7,8,Aug,24600,Yes,Yes
8,9,Sep,24000,Yes,Yes
9,10,Oct,23950,Yes,No


# APPEND

In [10]:
df_append = df1.append(df2)
df_append

Unnamed: 0,serial_id,sale_month,sales
0,1,Jan,12300
1,2,Feb,25100
2,3,Mar,17800
3,4,Apr,20100
4,5,May,21000
0,6,Jun,25000
1,7,Jul,23700
2,8,Aug,24600
3,9,Sep,24000
4,10,Oct,23950


In [11]:
df_append_full = df_append.append(df3)
df_append_full

Unnamed: 0,serial_id,sale_month,sales,sales_threshold,bonus_threshold
0,1.0,Jan,12300.0,,
1,2.0,Feb,25100.0,,
2,3.0,Mar,17800.0,,
3,4.0,Apr,20100.0,,
4,5.0,May,21000.0,,
0,6.0,Jun,25000.0,,
1,7.0,Jul,23700.0,,
2,8.0,Aug,24600.0,,
3,9.0,Sep,24000.0,,
4,10.0,Oct,23950.0,,


# MERGING AND JOINING DATASETS

## Process of joinging two dataframes together based on some common columns
Types = inner, outer, left and right
method = pd.merge()

### One-to-one Joins

In [12]:
# create dummy dataframes

ddf1 = pd.DataFrame({"product" : ["Prod_1", "Prod_2", "Prod_3", "Prod_4"],
                    "division" : ["Div_A", "Div_B", "Div_C", "Div_B"]})

ddf2 = pd.DataFrame({"sales" : ["12500", "10800", "5600", "7900"],
                    "product" : ["Prod_3", "Prod_2", "Prod_4", "Prod_1"]})

display(ddf1, ddf2)

Unnamed: 0,product,division
0,Prod_1,Div_A
1,Prod_2,Div_B
2,Prod_3,Div_C
3,Prod_4,Div_B


Unnamed: 0,sales,product
0,12500,Prod_3
1,10800,Prod_2
2,5600,Prod_4
3,7900,Prod_1


In [13]:
#MERGING

ddf3 = pd.merge(ddf1, ddf2) #automatically knows to merge on the shared column
ddf3

Unnamed: 0,product,division,sales
0,Prod_1,Div_A,7900
1,Prod_2,Div_B,10800
2,Prod_3,Div_C,12500
3,Prod_4,Div_B,5600


### Many-to-one Joins

In [14]:
# dummy dataframe 4

ddf4 = pd.DataFrame({"division" : ["Div_A", "Div_B", "Div_C"],
                    "manager" : ["Roger", "Rafael", "Novak"]})
ddf4

Unnamed: 0,division,manager
0,Div_A,Roger
1,Div_B,Rafael
2,Div_C,Novak


In [15]:
#again the merge method knows how to properly join the columns
ddf5 = pd.merge(ddf3, ddf4)
ddf5

Unnamed: 0,product,division,sales,manager
0,Prod_1,Div_A,7900,Roger
1,Prod_2,Div_B,10800,Rafael
2,Prod_4,Div_B,5600,Rafael
3,Prod_3,Div_C,12500,Novak


### Many-to-many Joins

In [16]:
ddf6 = pd.DataFrame({"division" : ["Div_A", "Div_A", "Div_B", "Div_C", "Div_C", "Div_C"],
                    "emp_grade" : ["13", "14+", "12", "11", "10", "9-"]})
ddf6

Unnamed: 0,division,emp_grade
0,Div_A,13
1,Div_A,14+
2,Div_B,12
3,Div_C,11
4,Div_C,10
5,Div_C,9-


In [17]:
#merge dataframes that both have repeats
ddf7 = pd.merge(ddf1, ddf6)
ddf7

Unnamed: 0,product,division,emp_grade
0,Prod_1,Div_A,13
1,Prod_1,Div_A,14+
2,Prod_2,Div_B,12
3,Prod_4,Div_B,12
4,Prod_3,Div_C,11
5,Prod_3,Div_C,10
6,Prod_3,Div_C,9-


# Merge Keys 

### 'On' Parameter - explicitly specify the name of the key column

In [18]:
display(pd.merge(ddf1, ddf2, on = "product"))

Unnamed: 0,product,division,sales
0,Prod_1,Div_A,7900
1,Prod_2,Div_B,10800
2,Prod_3,Div_C,12500
3,Prod_4,Div_B,5600


### 'Left_on' or 'Right_on' parameters - for merging two datasets with different column names but similar data

In [19]:
ddf1_new = pd.DataFrame({"project" : ["Prod_1", "Prod_2", "Prod_3", "Prod_4"],
                        "division" : ["Div_A", "Div_B", "Div_C", "Div_B"]})
display(pd.merge(ddf1_new, ddf2, left_on = "project", right_on = "product"))

Unnamed: 0,project,division,sales,product
0,Prod_1,Div_A,7900,Prod_1
1,Prod_2,Div_B,10800,Prod_2
2,Prod_3,Div_C,12500,Prod_3
3,Prod_4,Div_B,5600,Prod_4


In [20]:
display(pd.merge(ddf1_new, ddf2, left_on = "project", right_on = "product").drop("product", axis=1))

Unnamed: 0,project,division,sales
0,Prod_1,Div_A,7900
1,Prod_2,Div_B,10800
2,Prod_3,Div_C,12500
3,Prod_4,Div_B,5600


### 'Left_index' or 'Right_index' parameters - for merging on an index rather than a column

In [21]:
ddf1_index = ddf1.set_index("product")
ddf2_index = ddf2.set_index("product")

display(ddf1_index, ddf2_index)

Unnamed: 0_level_0,division
product,Unnamed: 1_level_1
Prod_1,Div_A
Prod_2,Div_B
Prod_3,Div_C
Prod_4,Div_B


Unnamed: 0_level_0,sales
product,Unnamed: 1_level_1
Prod_3,12500
Prod_2,10800
Prod_4,5600
Prod_1,7900


In [22]:
display(pd.merge(ddf1_index, ddf2_index, left_index=True, right_index=True))

Unnamed: 0_level_0,division,sales
product,Unnamed: 1_level_1,Unnamed: 2_level_1
Prod_1,Div_A,7900
Prod_2,Div_B,10800
Prod_3,Div_C,12500
Prod_4,Div_B,5600


# The How Parameter - similar to SQL joins

In [23]:
# in order to demonstrate these we will create some dummy dataframes

df_a = pd.DataFrame({"id" : ["1", "2", "3", "4"],
                    "actor_first_name" : ["Robert", "Gwyneth", "Jon", "Paul"],
                    "actor_last_name" : ["Downey Jr.", "Paltrow", "Favreau", "Bettany"],
                    "value" : ["10", "6", "7", "7"]
                    })

df_b = pd.DataFrame({"id" : ["1", "2", "3", "4", "5", "6"],
                    "actor_first_name" : ["Robert", "Chris", "Chris", "Mark", "Scarlett", "Jeremy"],
                    "actor_last_name" : ["Downey Jr.", "Evans", "Hemsworth", "Ruffalo", "Johansson", "Renner"]
                    })
display(df_a, df_b)

Unnamed: 0,id,actor_first_name,actor_last_name,value
0,1,Robert,Downey Jr.,10
1,2,Gwyneth,Paltrow,6
2,3,Jon,Favreau,7
3,4,Paul,Bettany,7


Unnamed: 0,id,actor_first_name,actor_last_name
0,1,Robert,Downey Jr.
1,2,Chris,Evans
2,3,Chris,Hemsworth
3,4,Mark,Ruffalo
4,5,Scarlett,Johansson
5,6,Jeremy,Renner


### Inner Join

In [24]:
display(pd.merge(df_a, df_b, how="inner"))

Unnamed: 0,id,actor_first_name,actor_last_name,value
0,1,Robert,Downey Jr.,10


### Left Join

In [25]:
display(pd.merge(df_a, df_b, how = "left"))

Unnamed: 0,id,actor_first_name,actor_last_name,value
0,1,Robert,Downey Jr.,10
1,2,Gwyneth,Paltrow,6
2,3,Jon,Favreau,7
3,4,Paul,Bettany,7


### Right Join

In [26]:
display(pd.merge(df_a, df_b, how= "right"))

Unnamed: 0,id,actor_first_name,actor_last_name,value
0,1,Robert,Downey Jr.,10.0
1,2,Chris,Evans,
2,3,Chris,Hemsworth,
3,4,Mark,Ruffalo,
4,5,Scarlett,Johansson,
5,6,Jeremy,Renner,


### Outer Join

In [27]:
display(pd.merge(df_a, df_b, how = "outer"))

Unnamed: 0,id,actor_first_name,actor_last_name,value
0,1,Robert,Downey Jr.,10.0
1,2,Gwyneth,Paltrow,6.0
2,3,Jon,Favreau,7.0
3,4,Paul,Bettany,7.0
4,2,Chris,Evans,
5,3,Chris,Hemsworth,
6,4,Mark,Ruffalo,
7,5,Scarlett,Johansson,
8,6,Jeremy,Renner,


In [28]:
display(pd.merge(df_a, df_b, how = "outer", indicator=True))

Unnamed: 0,id,actor_first_name,actor_last_name,value,_merge
0,1,Robert,Downey Jr.,10.0,both
1,2,Gwyneth,Paltrow,6.0,left_only
2,3,Jon,Favreau,7.0,left_only
3,4,Paul,Bettany,7.0,left_only
4,2,Chris,Evans,,right_only
5,3,Chris,Hemsworth,,right_only
6,4,Mark,Ruffalo,,right_only
7,5,Scarlett,Johansson,,right_only
8,6,Jeremy,Renner,,right_only


### Merging columns with inconsistent data

In [29]:
df_s1 = pd.DataFrame({"product" : ["Prod_1", "Prod_2", "Prod_3", "Prod_4"],
                     "division" : ["Div_A", "Div_B", "Div_C", "Div_D"]
                     })
df_s2 = pd.DataFrame({"product" : ["Prod_1", "Prod_2", "Prod_3", "Prod_4"],
                     "division" : ["Div_C", "Div_A", "Div_B", "Div_D"]
                     })
display(pd.merge(df_s1, df_s2, on = "product"))

Unnamed: 0,product,division_x,division_y
0,Prod_1,Div_A,Div_C
1,Prod_2,Div_B,Div_A
2,Prod_3,Div_C,Div_B
3,Prod_4,Div_D,Div_D


In [30]:
#custom suffix

display(pd.merge(df_s1, df_s2, on="product", suffixes=["_LD", "_RD"]))

Unnamed: 0,product,division_LD,division_RD
0,Prod_1,Div_A,Div_C
1,Prod_2,Div_B,Div_A
2,Prod_3,Div_C,Div_B
3,Prod_4,Div_D,Div_D


### Updating Dataframes

In [31]:
df1_update = pd.DataFrame({'c1':['a','a','b','b'],
                          'c2' :['x','y','x','y'], 'val':0})

df2_update = pd.DataFrame({'c1':['a','a','b','b'],
                          'c2' :['x','y','x','y'], 'val':[12,31,14,20]})

display(df1_update, df2_update)

Unnamed: 0,c1,c2,val
0,a,x,0
1,a,y,0
2,b,x,0
3,b,y,0


Unnamed: 0,c1,c2,val
0,a,x,12
1,a,y,31
2,b,x,14
3,b,y,20


In [32]:
#update

df1_update.update(df2_update)
df1_update

Unnamed: 0,c1,c2,val
0,a,x,12
1,a,y,31
2,b,x,14
3,b,y,20


In [33]:
#combine first
df2_update.combine_first(df1_update)
df2_update

Unnamed: 0,c1,c2,val
0,a,x,12
1,a,y,31
2,b,x,14
3,b,y,20


# GroupBy Function

##### allows you to split a dataset into different groups or categories based on some feature or column

##### Split-Apply-Combine: Split the dataset, apply some function to the split set, aggregate the new sets

In [46]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

### split

In [47]:
titanic = pd.read_csv('titanic.csv')

In [48]:
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,Parch,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,0,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,0,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,0,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,8.05,S


##### Let's figure out how many people from each Pclass survived, also the average age and max ticket fare based on the Pclass

In [49]:
# number of unique values
titanic["Pclass"].nunique()

3

In [50]:
# total number of survivors
titanic["Survived"].sum()

342

In [51]:
#create a groupby object using column Pclass
titanic_class = titanic.groupby("Pclass")

titanic_class

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000022719694DF0>

In [52]:
#this displays an abridged version of what our groupby objects look like, 
# ***I'm not sure where 'emb' and 'titanic_df' come from***
for emb, titanic_df in titanic_class:
    print(emb)
    print(titanic_df)

1
     PassengerId  Survived  Pclass  \
1              2         1       1   
3              4         1       1   
6              7         0       1   
11            12         1       1   
23            24         1       1   
..           ...       ...     ...   
871          872         1       1   
872          873         0       1   
879          880         1       1   
887          888         1       1   
889          890         1       1   

                                                  Name     Sex   Age  Parch  \
1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      0   
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      0   
6                              McCarthy, Mr. Timothy J    male  54.0      0   
11                            Bonnell, Miss. Elizabeth  female  58.0      0   
23                        Sloper, Mr. William Thompson    male  28.0      0   
..                                                 ...     ...   ..

In [53]:
# only getting the groups for Pclass = 1

titanic_class.get_group(1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,Parch,Fare,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,0,71.2833,C
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,0,53.1000,S
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,51.8625,S
11,12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,26.5500,S
23,24,1,1,"Sloper, Mr. William Thompson",male,28.0,0,35.5000,S
...,...,...,...,...,...,...,...,...,...
871,872,1,1,"Beckwith, Mrs. Richard Leonard (Sallie Monypeny)",female,47.0,1,52.5542,S
872,873,0,1,"Carlsson, Mr. Frans Olof",male,33.0,0,5.0000,S
879,880,1,1,"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)",female,56.0,1,83.1583,C
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,30.0000,S


#### Aggregate

In [54]:
# total number of people who survived per class
titanic_class.sum()

Unnamed: 0_level_0,PassengerId,Survived,Age,Parch,Fare
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,99705,136,7111.42,77,18177.4125
2,82056,87,5168.83,70,3801.8417
3,215625,119,8924.92,193,6714.6951


In [55]:
# average age of people from different classes
titanic_class.mean()

Unnamed: 0_level_0,PassengerId,Survived,Age,Parch,Fare
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,461.597222,0.62963,38.233441,0.356481,84.154687
2,445.956522,0.472826,29.87763,0.380435,20.662183
3,439.154786,0.242363,25.14062,0.393075,13.67555


In [56]:
# max ticket fare paid
titanic_class.max()

AssertionError: 

In [57]:
# multiple functions at once
titanic_class["Fare"].agg({"Fare" : ['sum', 'max']})

SpecificationError: nested renamer is not supported