In [1]:
# Standard Imports
import numpy as np
import pandas as pd

# Importing the JSON Module
import json

import warnings
warnings.filterwarnings('ignore')


In [2]:
info_df = pd.read_csv('./data/superhero_info-superhero_info.csv') 
info_df.head()


Unnamed: 0,Hero|Publisher,Gender,Race,Alignment,Hair color,Eye color,Skin color,Measurements
0,A-Bomb|Marvel Comics,Male,Human,good,No Hair,yellow,Unknown,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}"
1,Abe Sapien|Dark Horse Comics,Male,Icthyo Sapien,good,No Hair,blue,blue,"{'Height': '191.0 cm', 'Weight': '65.0 kg'}"
2,Abin Sur|DC Comics,Male,Ungaran,good,No Hair,blue,red,"{'Height': '185.0 cm', 'Weight': '90.0 kg'}"
3,Abomination|Marvel Comics,Male,Human / Radiation,bad,No Hair,green,Unknown,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}"
4,Absorbing Man|Marvel Comics,Male,Human,bad,No Hair,blue,Unknown,"{'Height': '193.0 cm', 'Weight': '122.0 kg'}"


In [3]:
powers_df = pd.read_csv('./Data/superhero_powers-superhero_powers.csv')
powers_df.head()


Unnamed: 0,hero_names,Powers
0,3-D Man,"Agility,Super Strength,Stamina,Super Speed"
1,A-Bomb,"Accelerated Healing,Durability,Longevity,Super..."
2,Abe Sapien,"Agility,Accelerated Healing,Cold Resistance,Du..."
3,Abin Sur,Lantern Power Ring
4,Abomination,"Accelerated Healing,Intelligence,Super Strengt..."


In [4]:
# splitting the hero|publisher column into two
info_df[['Hero', 'Publisher']] = info_df['Hero|Publisher'].str.split('|', expand=True)
info_df.head()

Unnamed: 0,Hero|Publisher,Gender,Race,Alignment,Hair color,Eye color,Skin color,Measurements,Hero,Publisher
0,A-Bomb|Marvel Comics,Male,Human,good,No Hair,yellow,Unknown,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}",A-Bomb,Marvel Comics
1,Abe Sapien|Dark Horse Comics,Male,Icthyo Sapien,good,No Hair,blue,blue,"{'Height': '191.0 cm', 'Weight': '65.0 kg'}",Abe Sapien,Dark Horse Comics
2,Abin Sur|DC Comics,Male,Ungaran,good,No Hair,blue,red,"{'Height': '185.0 cm', 'Weight': '90.0 kg'}",Abin Sur,DC Comics
3,Abomination|Marvel Comics,Male,Human / Radiation,bad,No Hair,green,Unknown,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}",Abomination,Marvel Comics
4,Absorbing Man|Marvel Comics,Male,Human,bad,No Hair,blue,Unknown,"{'Height': '193.0 cm', 'Weight': '122.0 kg'}",Absorbing Man,Marvel Comics


In [5]:
# removing the original hero|publisher column
info_df = info_df.drop(columns=['Hero|Publisher'])
info_df.head(2)


Unnamed: 0,Gender,Race,Alignment,Hair color,Eye color,Skin color,Measurements,Hero,Publisher
0,Male,Human,good,No Hair,yellow,Unknown,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}",A-Bomb,Marvel Comics
1,Male,Icthyo Sapien,good,No Hair,blue,blue,"{'Height': '191.0 cm', 'Weight': '65.0 kg'}",Abe Sapien,Dark Horse Comics


In [6]:
# Fixing the measurements column

# use .str.replace to replace all single quotes
info_df['Measurements'] = info_df['Measurements'].str.replace("'",'"')
## Apply the json.loads to the full column
info_df['Measurements'] = info_df['Measurements'].apply(json.loads)
info_df['Measurements'].head()


0    {'Height': '203.0 cm', 'Weight': '441.0 kg'}
1     {'Height': '191.0 cm', 'Weight': '65.0 kg'}
2     {'Height': '185.0 cm', 'Weight': '90.0 kg'}
3    {'Height': '203.0 cm', 'Weight': '441.0 kg'}
4    {'Height': '193.0 cm', 'Weight': '122.0 kg'}
Name: Measurements, dtype: object

In [7]:
measure = info_df['Measurements'].apply(pd.Series)
measure


Unnamed: 0,Height,Weight
0,203.0 cm,441.0 kg
1,191.0 cm,65.0 kg
2,185.0 cm,90.0 kg
3,203.0 cm,441.0 kg
4,193.0 cm,122.0 kg
...,...,...
458,183.0 cm,83.0 kg
459,165.0 cm,52.0 kg
460,66.0 cm,17.0 kg
461,170.0 cm,57.0 kg


In [8]:
# Converting weight and height to numeric datatype
measure['Height'] = measure.loc[:, 'Height'].apply(lambda x: float(x.split()[0]))
measure['Weight'] = measure.loc[:, 'Weight'].apply(lambda x: float(x.split()[0]))

measure.head()

Unnamed: 0,Height,Weight
0,203.0,441.0
1,191.0,65.0
2,185.0,90.0
3,203.0,441.0
4,193.0,122.0


In [9]:
measure.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 463 entries, 0 to 462
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Height  463 non-null    float64
 1   Weight  463 non-null    float64
dtypes: float64(2)
memory usage: 7.4 KB


In [10]:
# concat long_lat with original dataframe
info_df = pd.concat((info_df, measure), axis = 1)
info_df.head(2)


Unnamed: 0,Gender,Race,Alignment,Hair color,Eye color,Skin color,Measurements,Hero,Publisher,Height,Weight
0,Male,Human,good,No Hair,yellow,Unknown,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}",A-Bomb,Marvel Comics,203.0,441.0
1,Male,Icthyo Sapien,good,No Hair,blue,blue,"{'Height': '191.0 cm', 'Weight': '65.0 kg'}",Abe Sapien,Dark Horse Comics,191.0,65.0


In [11]:
info_df = info_df.drop(columns=['Measurements'])
info_df.head()

Unnamed: 0,Gender,Race,Alignment,Hair color,Eye color,Skin color,Hero,Publisher,Height,Weight
0,Male,Human,good,No Hair,yellow,Unknown,A-Bomb,Marvel Comics,203.0,441.0
1,Male,Icthyo Sapien,good,No Hair,blue,blue,Abe Sapien,Dark Horse Comics,191.0,65.0
2,Male,Ungaran,good,No Hair,blue,red,Abin Sur,DC Comics,185.0,90.0
3,Male,Human / Radiation,bad,No Hair,green,Unknown,Abomination,Marvel Comics,203.0,441.0
4,Male,Human,bad,No Hair,blue,Unknown,Absorbing Man,Marvel Comics,193.0,122.0


In [12]:
info_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 463 entries, 0 to 462
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Gender      463 non-null    object 
 1   Race        463 non-null    object 
 2   Alignment   463 non-null    object 
 3   Hair color  463 non-null    object 
 4   Eye color   463 non-null    object 
 5   Skin color  463 non-null    object 
 6   Hero        463 non-null    object 
 7   Publisher   463 non-null    object 
 8   Height      463 non-null    float64
 9   Weight      463 non-null    float64
dtypes: float64(2), object(8)
memory usage: 36.3+ KB


In [13]:
powers_df.head()

Unnamed: 0,hero_names,Powers
0,3-D Man,"Agility,Super Strength,Stamina,Super Speed"
1,A-Bomb,"Accelerated Healing,Durability,Longevity,Super..."
2,Abe Sapien,"Agility,Accelerated Healing,Cold Resistance,Du..."
3,Abin Sur,Lantern Power Ring
4,Abomination,"Accelerated Healing,Intelligence,Super Strengt..."


In [14]:
# first splitting the powers values.
powers_df['powers_split'] = powers_df['Powers'].apply(lambda x: x.split(','))
powers_df.head()

Unnamed: 0,hero_names,Powers,powers_split
0,3-D Man,"Agility,Super Strength,Stamina,Super Speed","[Agility, Super Strength, Stamina, Super Speed]"
1,A-Bomb,"Accelerated Healing,Durability,Longevity,Super...","[Accelerated Healing, Durability, Longevity, S..."
2,Abe Sapien,"Agility,Accelerated Healing,Cold Resistance,Du...","[Agility, Accelerated Healing, Cold Resistance..."
3,Abin Sur,Lantern Power Ring,[Lantern Power Ring]
4,Abomination,"Accelerated Healing,Intelligence,Super Strengt...","[Accelerated Healing, Intelligence, Super Stre..."


In [15]:
# loading split powers as json object
powers_df = powers_df.explode('powers_split', ignore_index=True)
powers_df.head()


Unnamed: 0,hero_names,Powers,powers_split
0,3-D Man,"Agility,Super Strength,Stamina,Super Speed",Agility
1,3-D Man,"Agility,Super Strength,Stamina,Super Speed",Super Strength
2,3-D Man,"Agility,Super Strength,Stamina,Super Speed",Stamina
3,3-D Man,"Agility,Super Strength,Stamina,Super Speed",Super Speed
4,A-Bomb,"Accelerated Healing,Durability,Longevity,Super...",Accelerated Healing


In [16]:
# saving the unique values from the exploded column
cols_to_make = powers_df['powers_split'].dropna().unique()
cols_to_make


array(['Agility', 'Super Strength', 'Stamina', 'Super Speed',
       'Accelerated Healing', 'Durability', 'Longevity', 'Camouflage',
       'Self-Sustenance', 'Cold Resistance', 'Underwater breathing',
       'Marksmanship', 'Weapons Master', 'Intelligence', 'Telepathy',
       'Immortality', 'Reflexes', 'Enhanced Sight', 'Sub-Mariner',
       'Lantern Power Ring', 'Invulnerability', 'Animation',
       'Super Breath', 'Dimensional Awareness', 'Flight', 'Size Changing',
       'Teleportation', 'Magic', 'Dimensional Travel',
       'Molecular Manipulation', 'Energy Manipulation', 'Power Cosmic',
       'Energy Absorption', 'Elemental Transmogrification',
       'Fire Resistance', 'Natural Armor', 'Heat Resistance',
       'Matter Absorption', 'Regeneration', 'Stealth', 'Power Suit',
       'Energy Blasts', 'Energy Beams', 'Heat Generation', 'Danger Sense',
       'Phasing', 'Force Fields', 'Hypnokinesis', 'Invisibility',
       'Enhanced Senses', 'Jump', 'Shapeshifting', 'Elasticity',
 

In [17]:
# Using for loop to create new columns
for col in cols_to_make:
    powers_df[col] = powers_df['powers_split'].str.contains(col)

powers_df.head()


Unnamed: 0,hero_names,Powers,powers_split,Agility,Super Strength,Stamina,Super Speed,Accelerated Healing,Durability,Longevity,...,Weather Control,Omnipresent,Omniscient,Hair Manipulation,Nova Force,Odin Force,Phoenix Force,Intuitive aptitude,Melting,Changing Armor
0,3-D Man,"Agility,Super Strength,Stamina,Super Speed",Agility,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,3-D Man,"Agility,Super Strength,Stamina,Super Speed",Super Strength,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,3-D Man,"Agility,Super Strength,Stamina,Super Speed",Stamina,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,3-D Man,"Agility,Super Strength,Stamina,Super Speed",Super Speed,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,A-Bomb,"Accelerated Healing,Durability,Longevity,Super...",Accelerated Healing,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False


In [18]:
# drop unnecessary columns
powers_df = powers_df.drop(columns=['Powers','powers_split'])
powers_df.head()

Unnamed: 0,hero_names,Agility,Super Strength,Stamina,Super Speed,Accelerated Healing,Durability,Longevity,Camouflage,Self-Sustenance,...,Weather Control,Omnipresent,Omniscient,Hair Manipulation,Nova Force,Odin Force,Phoenix Force,Intuitive aptitude,Melting,Changing Armor
0,3-D Man,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,3-D Man,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,3-D Man,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,3-D Man,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,A-Bomb,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [19]:
# merging the dataframes.
merged_df = pd.merge(info_df, powers_df, left_on='Hero', right_on='hero_names')
merged_df.head()

Unnamed: 0,Gender,Race,Alignment,Hair color,Eye color,Skin color,Hero,Publisher,Height,Weight,...,Weather Control,Omnipresent,Omniscient,Hair Manipulation,Nova Force,Odin Force,Phoenix Force,Intuitive aptitude,Melting,Changing Armor
0,Male,Human,good,No Hair,yellow,Unknown,A-Bomb,Marvel Comics,203.0,441.0,...,False,False,False,False,False,False,False,False,False,False
1,Male,Human,good,No Hair,yellow,Unknown,A-Bomb,Marvel Comics,203.0,441.0,...,False,False,False,False,False,False,False,False,False,False
2,Male,Human,good,No Hair,yellow,Unknown,A-Bomb,Marvel Comics,203.0,441.0,...,False,False,False,False,False,False,False,False,False,False
3,Male,Human,good,No Hair,yellow,Unknown,A-Bomb,Marvel Comics,203.0,441.0,...,False,False,False,False,False,False,False,False,False,False
4,Male,Human,good,No Hair,yellow,Unknown,A-Bomb,Marvel Comics,203.0,441.0,...,False,False,False,False,False,False,False,False,False,False


> 1. Compare the average weight of super powers who have Super Speed to those who do not.

In [20]:
merged_df.groupby('Super Speed').mean(numeric_only=True)['Weight']

Super Speed
False    126.364574
True     129.404040
Name: Weight, dtype: float64

> 2. What is the average height of heroes for each publisher?


In [21]:
merged_df.groupby('Publisher').mean(numeric_only=True)['Height']

Publisher
DC Comics            187.582386
Dark Horse Comics    181.486667
George Lucas         154.692308
Image Comics         211.000000
Marvel Comics        201.015327
Shueisha             171.012658
Star Trek            182.500000
Team Epic TV         181.692308
Unknown              178.000000
Name: Height, dtype: float64