# Import Libraries

In [1]:
import pandas as pd

# Load Dataset

In [3]:
# sheet_name=None loads every worksheet at once: the result is a dict that maps
# each sheet name to its own DataFrame (indexed later as df['<sheet name>']).
df = pd.read_excel('./data_structuring_demo.xlsx',sheet_name=None)

# Unpivoting/Melting

## Creating a copy of the dataframe for cleaning

In [4]:
# Pull the wide-format sheet out of the workbook dict and work on a copy, so the
# frame stored in `df` is not affected by changes made to `cleaned_example`.
example_data = df['Unpivot Example']
cleaned_example = example_data.copy()
cleaned_example

Unnamed: 0,Name,<50,50-70,70-90,90-100
0,Amy Linn,1,4,0,0
1,Marc Fletcher,2,3,0,0
2,Naima Berry,0,0,2,3
3,John Carter,1,2,2,0


## Performing the Unpivoting operations using `melt()`

In [5]:
# Unpivot from wide to long: `Name` stays as the identifier column, while every
# other column header (the score bins) moves into `Binned Score` and the cell
# value under it into `Frequency`.
# The melt is taken from the untouched `example_data` rather than from
# `cleaned_example` itself, so re-running this cell always gives the same result
# instead of melting an already-melted frame.
cleaned_example = example_data.melt(id_vars=['Name'],
                                    var_name='Binned Score',
                                    value_name='Frequency')
cleaned_example

Unnamed: 0,Name,Binned Score,Frequency
0,Amy Linn,<50,1
1,Marc Fletcher,<50,2
2,Naima Berry,<50,0
3,John Carter,<50,1
4,Amy Linn,50-70,4
5,Marc Fletcher,50-70,3
6,Naima Berry,50-70,0
7,John Carter,50-70,2
8,Amy Linn,70-90,0
9,Marc Fletcher,70-90,0


# Pivoting

Done using the `pivot()` function

In [6]:
# Long-format revenue records used as the input for the pivot demonstration.
example2_data = df['Pivot Example']
example2_data

Unnamed: 0,Product Classification,Product,Year,Revenue
0,Early Prototype,C,2021,0
1,Early Prototype,A,2021,0
2,Pilot,B,2021,3885
3,Pilot,A,2022,2193
4,Pilot,B,2022,4224
5,Product,A,2023,3918
6,Product,B,2023,5093


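A copy of the data would normally be created before reshaping, but that step is skipped here for simplicity. It is safe in this case because `pivot()` returns a new dataframe and leaves `example2_data` unchanged.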

In [11]:
# Reshape long -> wide: each (Product Classification, Product) pair becomes a
# row, each distinct Year becomes a column, and Revenue fills the cells.
# Pairs that have no record for a given year come out as NaN.
example2_data.pivot(index=['Product Classification','Product'],
                   columns="Year",
                   values="Revenue")

Unnamed: 0_level_0,Year,2021,2022,2023
Product Classification,Product,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Early Prototype,A,0.0,,
Early Prototype,C,0.0,,
Pilot,A,,2193.0,
Pilot,B,3885.0,4224.0,
Product,A,,,3918.0
Product,B,,,5093.0


`pivot()` only reshapes the data: it performs no aggregation and raises an error if the same index/column combination occurs more than once. To aggregate such duplicates, use the `pivot_table()` function instead and set its `aggfunc` parameter to the desired aggregation. It works very similarly to `pivot()` but has more options.

# Transpose

Uses `.T` to apply the transpose operation

In [12]:
# NOTE: the key must match the worksheet name exactly as it is spelled in the
# workbook ('Tranpose Example'), so the spelling is intentionally left as-is.
transpose_ex = df['Tranpose Example']
transpose_ex

Unnamed: 0,ID,1,2,3,4
0,Students,Amy Linn,Marc Fletcher,Naima Barry,John Smith
1,Test Score,95,50,100,73


In [14]:
# Naive transpose: the default integer row labels (0, 1) become the column
# headers, and the `ID` column ends up as an ordinary first row of data.
transpose_ex.T

Unnamed: 0,0,1
ID,Students,Test Score
1,Amy Linn,95
2,Marc Fletcher,50
3,Naima Barry,100
4,John Smith,73


Applying `.T` directly does not turn the `ID` values into column headers: the default integer index (0, 1) becomes the header row instead, and the `ID` values are left as an ordinary row of data. To fix this, set `ID` as the index first and then apply `.T`.

In [16]:
# Promote `ID` to the index first so that, after transposing, its values
# ('Students', 'Test Score') become the column headers.
transpose_df = transpose_ex.set_index('ID').T
transpose_df

ID,Students,Test Score
1,Amy Linn,95
2,Marc Fletcher,50
3,Naima Barry,100
4,John Smith,73


# Merging

Dataframes are merged with the `merge()` function. By default it performs an inner join on the columns that have the same name in both dataframes.

In [17]:
# Left-hand table for the merge demo: movie and viewer(s) for each `ID`.
merging_ex_1 = df['Merge Example 1']
merging_ex_1

Unnamed: 0,ID,Movie,Viewer
0,0,The Wizard of Oz (1939),"Mark,Mary"
1,1,Get Out (2017),"Tariq,Candice"
2,2,The Wizard of Oz (1939),Olga
3,3,Dunkirk (2017),"Candice,Tariq"
4,4,The Jungle Book (2016),Olga
5,5,High Noon (1952),Aaron
6,6,Get Out (2017),Olga
7,7,The Wizard of Oz (1939),Aaron


In [18]:
# Right-hand table for the merge demo: review text and rating for each `ID`.
# NOTE(review): in the displayed data `Rating` mixes numbers with the text
# 'Not Collected', so it likely needs cleaning before any numeric use.
merging_ex_2 = df['Merge Example 2']
merging_ex_2

Unnamed: 0,ID,Review,Rating
0,0,"Great movie, excellent plot!",5
1,1,Could have had better character development.,3
2,2,Ok.,Not Collected
3,3,"I loved it, recommended it to all my friends!",5
4,4,"A great movie, but I felt the plot was rushed.",4
5,5,Will not watch again.,1
6,6,Loved it!,Not Collected
7,7,Timeless!,Not Collected


In [19]:
# Join the two tables on their shared `ID` column.
# The key and join type are spelled out instead of being left to the defaults:
# without `on`, merge() joins on every column name the two frames have in
# common, so a shared column added to either sheet later would silently change
# the join. With the current sheets (only `ID` in common) the result is the same
# as the default call.
merged = pd.merge(merging_ex_1, merging_ex_2, on='ID', how='inner')
merged

Unnamed: 0,ID,Movie,Viewer,Review,Rating
0,0,The Wizard of Oz (1939),"Mark,Mary","Great movie, excellent plot!",5
1,1,Get Out (2017),"Tariq,Candice",Could have had better character development.,3
2,2,The Wizard of Oz (1939),Olga,Ok.,Not Collected
3,3,Dunkirk (2017),"Candice,Tariq","I loved it, recommended it to all my friends!",5
4,4,The Jungle Book (2016),Olga,"A great movie, but I felt the plot was rushed.",4
5,5,High Noon (1952),Aaron,Will not watch again.,1
6,6,Get Out (2017),Olga,Loved it!,Not Collected
7,7,The Wizard of Oz (1939),Aaron,Timeless!,Not Collected


# Appending (Concatenation)

Can be applied to a dataframe by using the `concat()` function.

In [20]:
# First of the two frames to be stacked (columns: Name, Age, Test Score).
append_ex_1 = df['Appending Example 1']
append_ex_1

Unnamed: 0,Name,Age,Test Score
0,Amy Linn,14.0,95
1,Marc Fletcher,15.0,50
2,Naima Berry,,100


In [21]:
# Second frame to be stacked; it has the same columns as the first one.
append_ex_2 = df['Appending Example 2']
append_ex_2

Unnamed: 0,Name,Age,Test Score
0,John Carter,14,
1,Dewey Cobb,14,100.0
2,Amy Linn,14,85.0


In [23]:
# Stack the two frames row-wise. ignore_index=True discards the original row
# labels and renumbers the combined rows from 0.
appended_df = pd.concat([append_ex_1,append_ex_2],ignore_index=True)
appended_df

Unnamed: 0,Name,Age,Test Score
0,Amy Linn,14.0,95.0
1,Marc Fletcher,15.0,50.0
2,Naima Berry,,100.0
3,John Carter,14.0,
4,Dewey Cobb,14.0,100.0
5,Amy Linn,14.0,85.0


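Without the `ignore_index` parameter, each dataframe keeps its original row labels, so the index values repeat (0, 1, 2, 0, 1, 2). Duplicate labels are harder to read, and a label-based lookup such as `.loc[0]` then returns more than one row.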

In [24]:
# Same concat without ignore_index, shown for contrast: each frame keeps its own
# row labels, so 0, 1, 2 appear twice in the result.
pd.concat([append_ex_1,append_ex_2])

Unnamed: 0,Name,Age,Test Score
0,Amy Linn,14.0,95.0
1,Marc Fletcher,15.0,50.0
2,Naima Berry,,100.0
0,John Carter,14.0,
1,Dewey Cobb,14.0,100.0
2,Amy Linn,14.0,85.0


# Group-by and Aggregation

Grouping by and aggregation can be applied to a dataframe by using the `groupby()` function followed by an aggregation function such as `sum()` or `mean()`

In [25]:
# Input for the group-by demo: one `score` per row, labelled with a `date` (month).
groupby_ex = df['Groupby-Agg Example']
groupby_ex

Unnamed: 0,date,score
0,March,9
1,March,1
2,March,3
3,April,5
4,April,6
5,April,4


In [28]:
# Select `score` before aggregating so only that column is summarised, and
# compute both statistics in a single call. sort=False keeps the groups in order
# of first appearance (March, then April) instead of sorting the keys.
groupby_ex.groupby('date',sort=False)['score'].agg(['sum','mean'])

Unnamed: 0_level_0,sum,mean
date,Unnamed: 1_level_1,Unnamed: 2_level_1
March,13,4.333333
April,15,5.0


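When only a single aggregation is needed, it can be written more concisely by calling the aggregation method directly on the grouped object. Note that without `sort=False` the groups are sorted by key, so April is listed before March.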

In [30]:
# Sum applied to the remaining columns of each group (here only `score`).
# Groups come back sorted by key (default sort=True), hence April before March.
groupby_ex.groupby('date').sum()

Unnamed: 0_level_0,score
date,Unnamed: 1_level_1
April,15
March,13


In [31]:
# Mean applied to the remaining columns of each group (here only `score`).
# Groups come back sorted by key (default sort=True), hence April before March.
groupby_ex.groupby('date').mean()

Unnamed: 0_level_0,score
date,Unnamed: 1_level_1
April,5.0
March,4.333333


# Advanced Merging

In [34]:
# Load the second workbook, again as a dict of sheet name -> DataFrame.
# NOTE: this rebinds `df`, so the sheets of the first workbook are no longer
# reachable through `df` once this cell has run.
df = pd.read_excel('advanced_merge_example.xlsx',sheet_name=None)
adv_merge_1 = df['Sheet1']
adv_merge_2 = df['Sheet2']

In [35]:
# Sheet1: audience and rating for each `Movie ID`.
adv_merge_1

Unnamed: 0,Movie Audience,Movie ID,Movie Rating
0,Kids,0,G
1,Adults,1,R
2,Teens,2,PG-13
3,Kids,3,PG
4,Kids,4,PG


In [37]:
# Sheet2: price, title and score for each `Movie ID`.
# NOTE(review): several displayed titles end with a tab character ('\t');
# consider stripping whitespace before comparing or joining on the title.
adv_merge_2

Unnamed: 0,Price (dollars),Movie ID,Movie Title,Score
0,14,0,The Wizard of Oz (1939)\t,5
1,26,1,Get Out (2017),3
2,12,3,Dunkirk (2017)\t,2
3,5,4,The Jungle Book (2016)\t,5
4,15,5,High Noon (1952)\t,4


## Merging the two dataframes based on right join

In [39]:
# Right join: keep every row of adv_merge_2 and attach the adv_merge_1 columns
# whose `Movie ID` matches; IDs missing from adv_merge_1 get NaN in those columns.
adv_merge_right = adv_merge_1.merge(adv_merge_2, how='right', on='Movie ID')
adv_merge_right

Unnamed: 0,Movie Audience,Movie ID,Movie Rating,Price (dollars),Movie Title,Score
0,Kids,0,G,14,The Wizard of Oz (1939)\t,5
1,Adults,1,R,26,Get Out (2017),3
2,Kids,3,PG,12,Dunkirk (2017)\t,2
3,Kids,4,PG,5,The Jungle Book (2016)\t,5
4,,5,,15,High Noon (1952)\t,4


## Merging the two dataframes based on left join

In [41]:
# Left join: keep every row of adv_merge_1 and attach the adv_merge_2 columns
# whose `Movie ID` matches; IDs missing from adv_merge_2 get NaN in those columns.
adv_merge_left = adv_merge_1.merge(adv_merge_2, how='left', on='Movie ID')
adv_merge_left

Unnamed: 0,Movie Audience,Movie ID,Movie Rating,Price (dollars),Movie Title,Score
0,Kids,0,G,14.0,The Wizard of Oz (1939)\t,5.0
1,Adults,1,R,26.0,Get Out (2017),3.0
2,Teens,2,PG-13,,,
3,Kids,3,PG,12.0,Dunkirk (2017)\t,2.0
4,Kids,4,PG,5.0,The Jungle Book (2016)\t,5.0


## Merging the two dataframes based on inner join

In [42]:
# Inner join: keep only the rows whose `Movie ID` is present in both frames.
adv_merge_inner = adv_merge_1.merge(adv_merge_2, how='inner', on='Movie ID')
adv_merge_inner

Unnamed: 0,Movie Audience,Movie ID,Movie Rating,Price (dollars),Movie Title,Score
0,Kids,0,G,14,The Wizard of Oz (1939)\t,5
1,Adults,1,R,26,Get Out (2017),3
2,Kids,3,PG,12,Dunkirk (2017)\t,2
3,Kids,4,PG,5,The Jungle Book (2016)\t,5
