# Advanced Data Transformation
Author: Kim Hazed Delfino


## Imports

In [181]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os, json
import warnings
warnings.filterwarnings('ignore')

## Load the Data

In [182]:
# Folder that holds the two source CSVs. It defaults to the location used when
# the notebook was written; set the SUPERHERO_DATA_DIR environment variable to
# run the notebook against a copy of the files stored somewhere else.
DATA_DIR = os.environ.get(
    "SUPERHERO_DATA_DIR",
    "/Users/kim/Documents/GitHub/repo/Advanced_Data_Transformation",
)
superhero_info = pd.read_csv(os.path.join(DATA_DIR, "superhero_info - superhero_info.csv"))
superhero_powers = pd.read_csv(os.path.join(DATA_DIR, "superhero_powers - superhero_powers.csv"))

## Exploratory Data Analysis 

In [183]:
# Column names, non-null counts and dtypes of the freshly loaded hero-info table
superhero_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 463 entries, 0 to 462
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Hero|Publisher  463 non-null    object
 1   Gender          463 non-null    object
 2   Race            463 non-null    object
 3   Alignment       463 non-null    object
 4   Hair color      463 non-null    object
 5   Eye color       463 non-null    object
 6   Skin color      463 non-null    object
 7   Measurements    463 non-null    object
dtypes: object(8)
memory usage: 29.1+ KB


In [184]:
# Column names, non-null counts and dtypes of the freshly loaded powers table
superhero_powers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 667 entries, 0 to 666
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   hero_names  667 non-null    object
 1   Powers      667 non-null    object
dtypes: object(2)
memory usage: 10.5+ KB


In [185]:
# Preview the first rows of the hero-info table
superhero_info.head()

Unnamed: 0,Hero|Publisher,Gender,Race,Alignment,Hair color,Eye color,Skin color,Measurements
0,A-Bomb|Marvel Comics,Male,Human,good,No Hair,yellow,Unknown,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}"
1,Abe Sapien|Dark Horse Comics,Male,Icthyo Sapien,good,No Hair,blue,blue,"{'Height': '191.0 cm', 'Weight': '65.0 kg'}"
2,Abin Sur|DC Comics,Male,Ungaran,good,No Hair,blue,red,"{'Height': '185.0 cm', 'Weight': '90.0 kg'}"
3,Abomination|Marvel Comics,Male,Human / Radiation,bad,No Hair,green,Unknown,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}"
4,Absorbing Man|Marvel Comics,Male,Human,bad,No Hair,blue,Unknown,"{'Height': '193.0 cm', 'Weight': '122.0 kg'}"


In [186]:
# Preview the first rows of the powers table
superhero_powers.head()

Unnamed: 0,hero_names,Powers
0,3-D Man,"Agility,Super Strength,Stamina,Super Speed"
1,A-Bomb,"Accelerated Healing,Durability,Longevity,Super..."
2,Abe Sapien,"Agility,Accelerated Healing,Cold Resistance,Du..."
3,Abin Sur,Lantern Power Ring
4,Abomination,"Accelerated Healing,Intelligence,Super Strengt..."


## Data Transformation - Superhero_info

In [187]:
# Separate hero name and publisher, which arrive combined as "Hero|Publisher".
# n=1 splits on the first "|" only, so the result always has exactly two
# columns; an unbounded split would produce a third column (and make this
# two-column assignment fail) if any value contained a second pipe.
superhero_info[["Hero_names", "Publisher"]] = superhero_info['Hero|Publisher'].str.split('|', n=1, expand=True)

In [188]:
# Double check: Hero_names and Publisher should now appear as the last two columns
superhero_info.head()

Unnamed: 0,Hero|Publisher,Gender,Race,Alignment,Hair color,Eye color,Skin color,Measurements,Hero_names,Publisher
0,A-Bomb|Marvel Comics,Male,Human,good,No Hair,yellow,Unknown,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}",A-Bomb,Marvel Comics
1,Abe Sapien|Dark Horse Comics,Male,Icthyo Sapien,good,No Hair,blue,blue,"{'Height': '191.0 cm', 'Weight': '65.0 kg'}",Abe Sapien,Dark Horse Comics
2,Abin Sur|DC Comics,Male,Ungaran,good,No Hair,blue,red,"{'Height': '185.0 cm', 'Weight': '90.0 kg'}",Abin Sur,DC Comics
3,Abomination|Marvel Comics,Male,Human / Radiation,bad,No Hair,green,Unknown,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}",Abomination,Marvel Comics
4,Absorbing Man|Marvel Comics,Male,Human,bad,No Hair,blue,Unknown,"{'Height': '193.0 cm', 'Weight': '122.0 kg'}",Absorbing Man,Marvel Comics


In [189]:
# The combined column is redundant now that it has been split; remove it in place
del superhero_info['Hero|Publisher']

In [190]:
# Double check: Hero|Publisher should be gone and the two new columns present
superhero_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 463 entries, 0 to 462
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Gender        463 non-null    object
 1   Race          463 non-null    object
 2   Alignment     463 non-null    object
 3   Hair color    463 non-null    object
 4   Eye color     463 non-null    object
 5   Skin color    463 non-null    object
 6   Measurements  463 non-null    object
 7   Hero_names    463 non-null    object
 8   Publisher     463 non-null    object
dtypes: object(9)
memory usage: 32.7+ KB


In [191]:
# The Measurements strings use single-quoted keys and values, which JSON does
# not accept; swap every ' for " so they can be parsed in the next step.
# Note that this replaces every single quote in the string, including any
# that might occur inside a value.
superhero_info['Measurements'] = superhero_info['Measurements'].str.replace(
    pat="'", repl='"', regex=False
)

In [192]:
# Parse each Measurements JSON string into a Python dict
superhero_info['Measurements'] = superhero_info['Measurements'].map(json.loads)

In [193]:
# Double check: the first row's Measurements value should now be a dict rather than a str
test=superhero_info.loc[0,'Measurements']
print(type(test))
test

<class 'dict'>


{'Height': '203.0 cm', 'Weight': '441.0 kg'}

In [194]:
# Separate height and weight from Measurements: each dict key becomes its own
# column. The frame is built in a single call from the list of dicts, which
# avoids constructing a pandas Series for every row as .apply(pd.Series) does.
# The original index is passed through so h_w lines up row-for-row with
# superhero_info when the two are combined.
h_w = pd.DataFrame(superhero_info['Measurements'].tolist(), index=superhero_info.index)
h_w

Unnamed: 0,Height,Weight
0,203.0 cm,441.0 kg
1,191.0 cm,65.0 kg
2,185.0 cm,90.0 kg
3,203.0 cm,441.0 kg
4,193.0 cm,122.0 kg
...,...,...
458,183.0 cm,83.0 kg
459,165.0 cm,52.0 kg
460,66.0 cm,17.0 kg
461,170.0 cm,57.0 kg


In [195]:
# Attach the Height/Weight columns side by side with the existing columns,
# then discard the dict column they were extracted from
superhero_info = (
    pd.concat([superhero_info, h_w], axis="columns")
    .drop(columns=["Measurements"])
)
superhero_info.head()

Unnamed: 0,Gender,Race,Alignment,Hair color,Eye color,Skin color,Hero_names,Publisher,Height,Weight
0,Male,Human,good,No Hair,yellow,Unknown,A-Bomb,Marvel Comics,203.0 cm,441.0 kg
1,Male,Icthyo Sapien,good,No Hair,blue,blue,Abe Sapien,Dark Horse Comics,191.0 cm,65.0 kg
2,Male,Ungaran,good,No Hair,blue,red,Abin Sur,DC Comics,185.0 cm,90.0 kg
3,Male,Human / Radiation,bad,No Hair,green,Unknown,Abomination,Marvel Comics,203.0 cm,441.0 kg
4,Male,Human,bad,No Hair,blue,Unknown,Absorbing Man,Marvel Comics,193.0 cm,122.0 kg


In [196]:
# Strip the unit suffix from each measurement and store the number as a float
# in a new column whose name carries the unit
for source_col, unit_suffix, target_col in (
    ("Height", " cm", "Height(cm)"),
    ("Weight", " kg", "Weight(kg)"),
):
    superhero_info[target_col] = (
        superhero_info[source_col].str.replace(unit_suffix, "").astype(float)
    )

# The original string columns are no longer needed
superhero_info.drop(columns=["Height", "Weight"], inplace=True)
superhero_info.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 463 entries, 0 to 462
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Gender      463 non-null    object 
 1   Race        463 non-null    object 
 2   Alignment   463 non-null    object 
 3   Hair color  463 non-null    object 
 4   Eye color   463 non-null    object 
 5   Skin color  463 non-null    object 
 6   Hero_names  463 non-null    object 
 7   Publisher   463 non-null    object 
 8   Height(cm)  463 non-null    float64
 9   Weight(kg)  463 non-null    float64
dtypes: float64(2), object(8)
memory usage: 36.3+ KB


In [197]:
# Make Hero_names the leading column: pop() removes it from its current
# position and insert() puts it back at position 0
superhero_info.insert(loc=0, column="Hero_names", value=superhero_info.pop("Hero_names"))
superhero_info.head()


Unnamed: 0,Hero_names,Gender,Race,Alignment,Hair color,Eye color,Skin color,Publisher,Height(cm),Weight(kg)
0,A-Bomb,Male,Human,good,No Hair,yellow,Unknown,Marvel Comics,203.0,441.0
1,Abe Sapien,Male,Icthyo Sapien,good,No Hair,blue,blue,Dark Horse Comics,191.0,65.0
2,Abin Sur,Male,Ungaran,good,No Hair,blue,red,DC Comics,185.0,90.0
3,Abomination,Male,Human / Radiation,bad,No Hair,green,Unknown,Marvel Comics,203.0,441.0
4,Absorbing Man,Male,Human,bad,No Hair,blue,Unknown,Marvel Comics,193.0,122.0


In [198]:
# Double check the final shape of the hero-info table (Hero_names first, numeric height/weight last)
superhero_info.head()

Unnamed: 0,Hero_names,Gender,Race,Alignment,Hair color,Eye color,Skin color,Publisher,Height(cm),Weight(kg)
0,A-Bomb,Male,Human,good,No Hair,yellow,Unknown,Marvel Comics,203.0,441.0
1,Abe Sapien,Male,Icthyo Sapien,good,No Hair,blue,blue,Dark Horse Comics,191.0,65.0
2,Abin Sur,Male,Ungaran,good,No Hair,blue,red,DC Comics,185.0,90.0
3,Abomination,Male,Human / Radiation,bad,No Hair,green,Unknown,Marvel Comics,203.0,441.0
4,Absorbing Man,Male,Human,bad,No Hair,blue,Unknown,Marvel Comics,193.0,122.0


## Data Transformation - Superhero Powers

In [199]:
# Starting point for the powers table: columns, non-null counts and dtypes
superhero_powers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 667 entries, 0 to 666
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   hero_names  667 non-null    object
 1   Powers      667 non-null    object
dtypes: object(2)
memory usage: 10.5+ KB


In [200]:
# Disabled step, kept for reference: pre-filter the powers table to hero names
# that also appear in the info table. The merge on Hero_names later in the
# notebook is an inner join, so powers rows without a match are dropped there.
# # use only hero names that also in our info dataset
# superhero_powers_filter = superhero_powers['hero_names'].isin(superhero_info['Hero_names'])
# superhero_powers_df = superhero_powers[superhero_powers_filter].reset_index(drop=True)

In [201]:
# Disabled together with the filter cell above, the only place superhero_powers_df is created
# # Double check
# superhero_powers_df.info()

In [202]:
# Turn each comma-separated Powers string into a list of individual power names
# (the explode step happens in the next cell)
superhero_powers["Powers_split"] = superhero_powers["Powers"].str.split(pat=",")
superhero_powers.loc[:, "Powers_split"]



0        [Agility, Super Strength, Stamina, Super Speed]
1      [Accelerated Healing, Durability, Longevity, S...
2      [Agility, Accelerated Healing, Cold Resistance...
3                                   [Lantern Power Ring]
4      [Accelerated Healing, Intelligence, Super Stre...
                             ...                        
662               [Flight, Energy Blasts, Size Changing]
663    [Cold Resistance, Durability, Longevity, Super...
664    [Agility, Stealth, Danger Sense, Marksmanship,...
665    [Cryokinesis, Telepathy, Magic, Fire Control, ...
666    [Super Speed, Intangibility, Time Travel, Time...
Name: Powers_split, Length: 667, dtype: object

In [203]:
# One row per (hero, power): each element of Powers_split gets its own row,
# with the other columns and the original index label repeated
exploded = superhero_powers.explode(column="Powers_split")
exploded

Unnamed: 0,hero_names,Powers,Powers_split
0,3-D Man,"Agility,Super Strength,Stamina,Super Speed",Agility
0,3-D Man,"Agility,Super Strength,Stamina,Super Speed",Super Strength
0,3-D Man,"Agility,Super Strength,Stamina,Super Speed",Stamina
0,3-D Man,"Agility,Super Strength,Stamina,Super Speed",Super Speed
1,A-Bomb,"Accelerated Healing,Durability,Longevity,Super...",Accelerated Healing
...,...,...,...
665,Zatanna,"Cryokinesis,Telepathy,Magic,Fire Control,Proba...",Weather Control
666,Zoom,"Super Speed,Intangibility,Time Travel,Time Man...",Super Speed
666,Zoom,"Super Speed,Intangibility,Time Travel,Time Man...",Intangibility
666,Zoom,"Super Speed,Intangibility,Time Travel,Time Man...",Time Travel


In [204]:
# Distinct power names, in order of first appearance; these become the
# one-hot column names
cols_to_make = pd.unique(exploded["Powers_split"].dropna())
cols_to_make

array(['Agility', 'Super Strength', 'Stamina', 'Super Speed',
       'Accelerated Healing', 'Durability', 'Longevity', 'Camouflage',
       'Self-Sustenance', 'Cold Resistance', 'Underwater breathing',
       'Marksmanship', 'Weapons Master', 'Intelligence', 'Telepathy',
       'Immortality', 'Reflexes', 'Enhanced Sight', 'Sub-Mariner',
       'Lantern Power Ring', 'Invulnerability', 'Animation',
       'Super Breath', 'Dimensional Awareness', 'Flight', 'Size Changing',
       'Teleportation', 'Magic', 'Dimensional Travel',
       'Molecular Manipulation', 'Energy Manipulation', 'Power Cosmic',
       'Energy Absorption', 'Elemental Transmogrification',
       'Fire Resistance', 'Natural Armor', 'Heat Resistance',
       'Matter Absorption', 'Regeneration', 'Stealth', 'Power Suit',
       'Energy Blasts', 'Energy Beams', 'Heat Generation', 'Danger Sense',
       'Phasing', 'Force Fields', 'Hypnokinesis', 'Invisibility',
       'Enhanced Senses', 'Jump', 'Shapeshifting', 'Elasticity',
 

In [205]:
# Create one boolean column per power (one-hot encoding).
#
# The flags come from an exact split of the comma-separated Powers string, not
# from a substring search. str.contains(col) treats col as a regular expression
# and matches anywhere in the string, so a power whose name is contained in a
# longer one (e.g. "Magic" inside a name such as "Magic Resistance", if the data
# has one) would be flagged for heroes who only have the longer power.
power_flags = (
    superhero_powers['Powers']
    .str.get_dummies(sep=',')                            # 0/1 per exact power name
    .reindex(columns=list(cols_to_make), fill_value=0)   # keep cols_to_make order
    .astype(bool)
)

# Drop flag columns left over from an earlier run of this cell so it can be
# re-run safely, then attach all new columns in a single concat instead of
# inserting them one at a time.
superhero_powers = pd.concat(
    [superhero_powers.drop(columns=list(cols_to_make), errors='ignore'), power_flags],
    axis=1,
)
superhero_powers.head()

Unnamed: 0,hero_names,Powers,Powers_split,Agility,Super Strength,Stamina,Super Speed,Accelerated Healing,Durability,Longevity,...,Weather Control,Omnipresent,Omniscient,Hair Manipulation,Nova Force,Odin Force,Phoenix Force,Intuitive aptitude,Melting,Changing Armor
0,3-D Man,"Agility,Super Strength,Stamina,Super Speed","[Agility, Super Strength, Stamina, Super Speed]",True,True,True,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,A-Bomb,"Accelerated Healing,Durability,Longevity,Super...","[Accelerated Healing, Durability, Longevity, S...",False,True,True,False,True,True,True,...,False,False,False,False,False,False,False,False,False,False
2,Abe Sapien,"Agility,Accelerated Healing,Cold Resistance,Du...","[Agility, Accelerated Healing, Cold Resistance...",True,True,True,False,True,True,True,...,False,False,False,False,False,False,False,False,False,False
3,Abin Sur,Lantern Power Ring,[Lantern Power Ring],False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,Abomination,"Accelerated Healing,Intelligence,Super Strengt...","[Accelerated Healing, Intelligence, Super Stre...",False,True,True,True,True,False,False,...,False,False,False,False,False,False,False,False,False,False


In [206]:
# Give the hero-name column the same label it has in superhero_info so the
# two tables share a key column
superhero_powers.rename({"hero_names": "Hero_names"}, axis="columns", inplace=True)

In [207]:
# Confirm the powers table now carries the renamed key column plus the boolean power columns
superhero_powers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 667 entries, 0 to 666
Columns: 170 entries, Hero_names to Changing Armor
dtypes: bool(167), object(3)
memory usage: 124.5+ KB


- Now that both DataFrames share a `Hero_names` column, we can merge them on it; an inner merge keeps only the heroes that appear in both tables.

In [208]:
# Combine the two DataFrames on the hero name.
# The key and join type are spelled out rather than left to merge()'s defaults
# (every shared column name, inner join), so the join cannot silently change if
# the two tables ever gain another column with the same name. Only heroes
# present in both tables are kept.
sh_combined_df = superhero_info.merge(superhero_powers, on="Hero_names", how="inner")
sh_combined_df.head()

Unnamed: 0,Hero_names,Gender,Race,Alignment,Hair color,Eye color,Skin color,Publisher,Height(cm),Weight(kg),...,Weather Control,Omnipresent,Omniscient,Hair Manipulation,Nova Force,Odin Force,Phoenix Force,Intuitive aptitude,Melting,Changing Armor
0,A-Bomb,Male,Human,good,No Hair,yellow,Unknown,Marvel Comics,203.0,441.0,...,False,False,False,False,False,False,False,False,False,False
1,Abe Sapien,Male,Icthyo Sapien,good,No Hair,blue,blue,Dark Horse Comics,191.0,65.0,...,False,False,False,False,False,False,False,False,False,False
2,Abin Sur,Male,Ungaran,good,No Hair,blue,red,DC Comics,185.0,90.0,...,False,False,False,False,False,False,False,False,False,False
3,Abomination,Male,Human / Radiation,bad,No Hair,green,Unknown,Marvel Comics,203.0,441.0,...,False,False,False,False,False,False,False,False,False,False
4,Absorbing Man,Male,Human,bad,No Hair,blue,Unknown,Marvel Comics,193.0,122.0,...,False,False,False,False,False,False,False,False,False,False


## II. Use your combined DataFrame to answer the following questions.

- Compare the average weight of superheroes who have Super Speed to those who do not.
- What is the average height of heroes for each publisher?

In [209]:
# Boolean mask: True for rows whose "Super Speed" flag is set
speedy_filter = sh_combined_df["Super Speed"].eq(True)


In [210]:
# Partition the combined table with the mask: rows where it is True, and the rest
speedsters = sh_combined_df.loc[speedy_filter]
slowpokes = sh_combined_df.loc[~speedy_filter]

In [211]:
# Mean weight of each group, rounded to the nearest whole kilogram for display
speedster_avg_kg = round(speedsters["Weight(kg)"].mean())
slowpoke_avg_kg = round(slowpokes["Weight(kg)"].mean())

print(f"Avg weight for heroes with super speed is {speedster_avg_kg}kg")
print(f"Avg weight for slowpokes heros is {slowpoke_avg_kg}kg")


Avg weight for heroes with super speed is 129kg
Avg weight for slowpokes heros is 102kg


In [212]:
# Mean Height(cm) within each Publisher group
sh_combined_df.groupby("Publisher")["Height(cm)"].agg("mean")

Publisher
DC Comics            181.923913
Dark Horse Comics    176.909091
George Lucas         159.600000
Image Comics         211.000000
Marvel Comics        191.546128
Shueisha             171.500000
Star Trek            181.500000
Team Epic TV         180.750000
Unknown              178.000000
Name: Height(cm), dtype: float64