In [1]:
# Standard Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Additional Imports
# os - for saving and loading files
# json - to work with json files
# math - to round up results
# time - to add a short pause to not overwhelm the server
import os, json, math, time

# to make yelpapi calls
from yelpapi import YelpAPI

# progress bar from tqdm_notebook
from tqdm.notebook import tqdm_notebook

In [2]:
# Load the two CSV files
superhero_powers = pd.read_csv(r'C:\Users\lidiv\core\Applying Advanced Transformations (Core)\AdvancedTransformations\superhero_powers - superhero_powers.csv')
superhero_info = pd.read_csv(r'C:\Users\lidiv\core\Applying Advanced Transformations (Core)\AdvancedTransformations\superhero_info - superhero_info.csv')

In [3]:
superhero_info.head()

Unnamed: 0,Hero|Publisher,Gender,Race,Alignment,Hair color,Eye color,Skin color,Measurements
0,A-Bomb|Marvel Comics,Male,Human,good,No Hair,yellow,Unknown,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}"
1,Abe Sapien|Dark Horse Comics,Male,Icthyo Sapien,good,No Hair,blue,blue,"{'Height': '191.0 cm', 'Weight': '65.0 kg'}"
2,Abin Sur|DC Comics,Male,Ungaran,good,No Hair,blue,red,"{'Height': '185.0 cm', 'Weight': '90.0 kg'}"
3,Abomination|Marvel Comics,Male,Human / Radiation,bad,No Hair,green,Unknown,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}"
4,Absorbing Man|Marvel Comics,Male,Human,bad,No Hair,blue,Unknown,"{'Height': '193.0 cm', 'Weight': '122.0 kg'}"


In [4]:
print("Columns in superhero_info DataFrame:")
print(superhero_info.columns)

print("\nColumns in superhero_powers DataFrame:")
print(superhero_powers.columns)


Columns in superhero_info DataFrame:
Index(['Hero|Publisher', 'Gender', 'Race', 'Alignment', 'Hair color',
       'Eye color', 'Skin color', 'Measurements'],
      dtype='object')

Columns in superhero_powers DataFrame:
Index(['hero_names', 'Powers'], dtype='object')


In [5]:
# Add a new column 'Hero' in superhero_info by splitting 'Hero|Publisher'
superhero_info['hero'] = superhero_info['Hero|Publisher'].str.split('|').str[0]

In [6]:
# Add a new column 'Publisher' in superhero_info by splitting 'Hero|Publisher'
superhero_info['Publisher'] = superhero_info['Hero|Publisher'].str.split('|').str[1]

In [7]:
superhero_info.head()

Unnamed: 0,Hero|Publisher,Gender,Race,Alignment,Hair color,Eye color,Skin color,Measurements,hero,Publisher
0,A-Bomb|Marvel Comics,Male,Human,good,No Hair,yellow,Unknown,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}",A-Bomb,Marvel Comics
1,Abe Sapien|Dark Horse Comics,Male,Icthyo Sapien,good,No Hair,blue,blue,"{'Height': '191.0 cm', 'Weight': '65.0 kg'}",Abe Sapien,Dark Horse Comics
2,Abin Sur|DC Comics,Male,Ungaran,good,No Hair,blue,red,"{'Height': '185.0 cm', 'Weight': '90.0 kg'}",Abin Sur,DC Comics
3,Abomination|Marvel Comics,Male,Human / Radiation,bad,No Hair,green,Unknown,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}",Abomination,Marvel Comics
4,Absorbing Man|Marvel Comics,Male,Human,bad,No Hair,blue,Unknown,"{'Height': '193.0 cm', 'Weight': '122.0 kg'}",Absorbing Man,Marvel Comics


In [8]:
# Merge the two DataFrames using the 'hero' column
superheroes = pd.merge(superhero_info, superhero_powers, left_on='hero', right_on='hero_names', how='inner')


In [9]:
# Drop the redundant columns 'Hero|Publisher' and 'hero_names'
superheroes.drop(columns=['Hero|Publisher', 'hero_names'], inplace=True)


In [10]:
superheroes.head()

Unnamed: 0,Gender,Race,Alignment,Hair color,Eye color,Skin color,Measurements,hero,Publisher,Powers
0,Male,Human,good,No Hair,yellow,Unknown,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}",A-Bomb,Marvel Comics,"Accelerated Healing,Durability,Longevity,Super..."
1,Male,Icthyo Sapien,good,No Hair,blue,blue,"{'Height': '191.0 cm', 'Weight': '65.0 kg'}",Abe Sapien,Dark Horse Comics,"Agility,Accelerated Healing,Cold Resistance,Du..."
2,Male,Ungaran,good,No Hair,blue,red,"{'Height': '185.0 cm', 'Weight': '90.0 kg'}",Abin Sur,DC Comics,Lantern Power Ring
3,Male,Human / Radiation,bad,No Hair,green,Unknown,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}",Abomination,Marvel Comics,"Accelerated Healing,Intelligence,Super Strengt..."
4,Male,Human,bad,No Hair,blue,Unknown,"{'Height': '193.0 cm', 'Weight': '122.0 kg'}",Absorbing Man,Marvel Comics,"Cold Resistance,Durability,Energy Absorption,S..."


In [11]:
## slice out a single test coordinate
test_mea = superheroes.loc[1, 'Measurements']
test_mea

"{'Height': '191.0 cm', 'Weight': '65.0 kg'}"

In [16]:

type(test_mea)

str

In [17]:
## Use json.loads on the test coordinate
json.loads(test_mea)

{'Height': '191.0 cm', 'Weight': '65.0 kg'}

In [18]:
## replace single ' with double " 
test_mea = test_mea.replace("'", '"')
test_mea

'{"Height": "191.0 cm", "Weight": "65.0 kg"}'

In [19]:
## Use json.loads on the test coordinate, again
json.loads(test_mea)

{'Height': '191.0 cm', 'Weight': '65.0 kg'}

In [20]:
# viewing type after using json.loads
type(json.loads(test_mea))

dict

In [21]:
## replace ' with " (entire column)
superheroes['Measurements'] = superheroes['Measurements'].str.replace("'", '"')
## apply json.loads
superheroes['Measurements'] = superheroes['Measurements'].apply(json.loads)

In [22]:
## use .apply pd.Series to convert a dict to columns
superheroes['Measurements'].apply(pd.Series)

Unnamed: 0,Height,Weight
0,203.0 cm,441.0 kg
1,191.0 cm,65.0 kg
2,185.0 cm,90.0 kg
3,203.0 cm,441.0 kg
4,193.0 cm,122.0 kg
...,...,...
458,183.0 cm,83.0 kg
459,165.0 cm,52.0 kg
460,66.0 cm,17.0 kg
461,170.0 cm,57.0 kg


In [23]:
## Concatenate the 2 new columns and drop the original.
superheroes = pd.concat([superheroes, superheroes['Measurements'].apply(pd.Series)], axis = 1)
#drop the 'measurement'colunm
superheroes = superheroes.drop(columns = 'Measurements')
superheroes.head(2)

Unnamed: 0,Gender,Race,Alignment,Hair color,Eye color,Skin color,hero,Publisher,Powers,Height,Weight
0,Male,Human,good,No Hair,yellow,Unknown,A-Bomb,Marvel Comics,"Accelerated Healing,Durability,Longevity,Super...",203.0 cm,441.0 kg
1,Male,Icthyo Sapien,good,No Hair,blue,blue,Abe Sapien,Dark Horse Comics,"Agility,Accelerated Healing,Cold Resistance,Du...",191.0 cm,65.0 kg


In [24]:
superheroes.dtypes['Weight']

dtype('O')

In [25]:
superheroes.dtypes['Height']

dtype('O')

In [26]:
superheroes['Weight'] = superheroes['Weight'].replace('kg', '')


In [27]:
superheroes['Height'] = superheroes['Height'].replace('cm', '')


In [28]:
# Remove non-numeric characters (e.g., 'kg') and convert to float
superheroes['Weight'] = superheroes['Weight'].str.replace(r'[^\d.]', '', regex=True).astype(float)


In [29]:
# Remove non-numeric characters (e.g., 'kg') and convert to float
superheroes['Height'] = superheroes['Height'].str.replace(r'[^\d.]', '', regex=True).astype(float)



In [30]:
superheroes.dtypes['Weight']

dtype('float64')

In [31]:
superheroes.dtypes['Height']

dtype('float64')

In [32]:
superheroes.head(2)

Unnamed: 0,Gender,Race,Alignment,Hair color,Eye color,Skin color,hero,Publisher,Powers,Height,Weight
0,Male,Human,good,No Hair,yellow,Unknown,A-Bomb,Marvel Comics,"Accelerated Healing,Durability,Longevity,Super...",203.0,441.0
1,Male,Icthyo Sapien,good,No Hair,blue,blue,Abe Sapien,Dark Horse Comics,"Agility,Accelerated Healing,Cold Resistance,Du...",191.0,65.0


In [33]:
superheroes['Powers'].value_counts()

Durability,Super Strength                                                                                                                                                                                                                                                                       4
Agility,Stealth,Marksmanship,Weapons Master,Stamina                                                                                                                                                                                                                                             4
Intelligence                                                                                                                                                                                                                                                                                    4
Agility,Accelerated Healing,Durability,Stealth,Danger Sense,Marksmanship,Animal Attributes,Super Strength,Stamina,Super Speed,Anim

In [34]:
# Converting transactions column into a one-hot-encoded column
exploded = superheroes.explode('Powers')
exploded[['hero', 'Powers']].head()

Unnamed: 0,hero,Powers
0,A-Bomb,"Accelerated Healing,Durability,Longevity,Super..."
1,Abe Sapien,"Agility,Accelerated Healing,Cold Resistance,Du..."
2,Abin Sur,Lantern Power Ring
3,Abomination,"Accelerated Healing,Intelligence,Super Strengt..."
4,Absorbing Man,"Cold Resistance,Durability,Energy Absorption,S..."


In [35]:
# Step 1: Replace NaN values with an empty string
superheroes['Powers'].fillna('', inplace=True)

In [36]:
# Step 1: Split the 'Powers' column into lists of powers
superheroes['Powers'] = superheroes['Powers'].str.split(',')

In [37]:
#Create a set of unique powers
all_powers = set()
for powers_list in superheroes['Powers']:
    all_powers.update(powers_list)

In [38]:
#  Create a DataFrame with one-hot-encoded columns
encoded_powers = pd.DataFrame()
for power in all_powers:
    encoded_powers[power] = superheroes['Powers'].apply(lambda x: 1 if power in x else 0)

  encoded_powers[power] = superheroes['Powers'].apply(lambda x: 1 if power in x else 0)
  encoded_powers[power] = superheroes['Powers'].apply(lambda x: 1 if power in x else 0)
  encoded_powers[power] = superheroes['Powers'].apply(lambda x: 1 if power in x else 0)
  encoded_powers[power] = superheroes['Powers'].apply(lambda x: 1 if power in x else 0)
  encoded_powers[power] = superheroes['Powers'].apply(lambda x: 1 if power in x else 0)
  encoded_powers[power] = superheroes['Powers'].apply(lambda x: 1 if power in x else 0)
  encoded_powers[power] = superheroes['Powers'].apply(lambda x: 1 if power in x else 0)
  encoded_powers[power] = superheroes['Powers'].apply(lambda x: 1 if power in x else 0)
  encoded_powers[power] = superheroes['Powers'].apply(lambda x: 1 if power in x else 0)
  encoded_powers[power] = superheroes['Powers'].apply(lambda x: 1 if power in x else 0)
  encoded_powers[power] = superheroes['Powers'].apply(lambda x: 1 if power in x else 0)
  encoded_powers[power] = superh

In [39]:
#  Concatenate the one-hot-encoded DataFrame with the original DataFrame
superheroes = pd.concat([superheroes, encoded_powers], axis=1)

In [40]:
#  Drop the original 'Powers' column
superheroes.drop(columns=['Powers'], inplace=True)

In [41]:
superheroes.head(2)

Unnamed: 0,Gender,Race,Alignment,Hair color,Eye color,Skin color,hero,Publisher,Height,Weight,...,Super Speed,Substance Secretion,Magnetism,Telekinesis,Vision - Infrared,Biokinesis,Sub-Mariner,Natural Weapons,Enhanced Memory,Energy Constructs
0,Male,Human,good,No Hair,yellow,Unknown,A-Bomb,Marvel Comics,203.0,441.0,...,0,0,0,0,0,0,0,0,0,0
1,Male,Icthyo Sapien,good,No Hair,blue,blue,Abe Sapien,Dark Horse Comics,191.0,65.0,...,0,0,0,0,0,0,1,0,0,0


In [46]:
# Filter superheroes with and without Super Speed
superheroes_with_super_speed = superheroes[superheroes['Super Speed'] == 1]
superheroes_without_super_speed = superheroes[superheroes['Super Speed'] == 0]

In [50]:
# Calculate the average weight for heroes with and without Super Speed
avg_weight_with_super_speed = superheroes[superheroes['hero'].isin(superheroes_with_super_speed['hero'])]['Weight'].mean()
avg_weight_without_super_speed = superheroes[superheroes['hero'].isin(superheroes_without_super_speed['hero'])]['Weight'].mean()

In [51]:
avg_weight_with_super_speed

129.40404040404042

In [52]:
avg_weight_without_super_speed

101.77358490566037

the hero with super speed have weigh bigger than hero without superspeed 

In [54]:
# Calculate the average height of heroes for each publisher
avg_height_by_publisher = superheroes.groupby('Publisher')['Height'].mean()

In [55]:
print("Average height of heroes by publisher:")
print(avg_height_by_publisher)

Average height of heroes by publisher:
Publisher
DC Comics            181.923913
Dark Horse Comics    176.909091
George Lucas         159.600000
Image Comics         211.000000
Marvel Comics        191.546128
Shueisha             171.500000
Star Trek            181.500000
Team Epic TV         180.750000
Unknown              178.000000
Name: Height, dtype: float64
