# **Applying Advanced Transformations**

Joe Lardie

March 2023

## **Part one:Clean the files and combine them into one final DataFrame.**

# **Imports**

In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns
import os, json, math, time
from tqdm.notebook import tqdm_notebook

## **Loading Data**

In [2]:
shi = pd.read_csv('https://docs.google.com/spreadsheets/d/e/2PACX-1vS1ZstYLwFgwhZnqDsPjtnlHYhJp_cmW55J8JD5mym0seRsaem3px7QBtuFF0LiI7z1PLCkVKAkdO7J/pub?output=csv')

In [3]:
shi.head()

Unnamed: 0,Hero|Publisher,Gender,Race,Alignment,Hair color,Eye color,Skin color,Measurements
0,A-Bomb|Marvel Comics,Male,Human,good,No Hair,yellow,Unknown,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}"
1,Abe Sapien|Dark Horse Comics,Male,Icthyo Sapien,good,No Hair,blue,blue,"{'Height': '191.0 cm', 'Weight': '65.0 kg'}"
2,Abin Sur|DC Comics,Male,Ungaran,good,No Hair,blue,red,"{'Height': '185.0 cm', 'Weight': '90.0 kg'}"
3,Abomination|Marvel Comics,Male,Human / Radiation,bad,No Hair,green,Unknown,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}"
4,Absorbing Man|Marvel Comics,Male,Human,bad,No Hair,blue,Unknown,"{'Height': '193.0 cm', 'Weight': '122.0 kg'}"


In [4]:
shp = pd.read_csv('https://docs.google.com/spreadsheets/d/e/2PACX-1vSzdWOBaXOoz52vPmCFV5idNlDBohLY1Lsbc1IfZIZQ7cV_aNB2wYBfhF49uE1TaO1B5MQCGWiNrFfd/pub?output=csv')

In [5]:
shp.head()

Unnamed: 0,hero_names,Powers
0,3-D Man,"Agility,Super Strength,Stamina,Super Speed"
1,A-Bomb,"Accelerated Healing,Durability,Longevity,Super..."
2,Abe Sapien,"Agility,Accelerated Healing,Cold Resistance,Du..."
3,Abin Sur,Lantern Power Ring
4,Abomination,"Accelerated Healing,Intelligence,Super Strengt..."


## **Cleaning and Combing Files into one dataframe**

In [6]:
shi.info()
shp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 463 entries, 0 to 462
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Hero|Publisher  463 non-null    object
 1   Gender          463 non-null    object
 2   Race            463 non-null    object
 3   Alignment       463 non-null    object
 4   Hair color      463 non-null    object
 5   Eye color       463 non-null    object
 6   Skin color      463 non-null    object
 7   Measurements    463 non-null    object
dtypes: object(8)
memory usage: 29.1+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 667 entries, 0 to 666
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   hero_names  667 non-null    object
 1   Powers      667 non-null    object
dtypes: object(2)
memory usage: 10.5+ KB


In [7]:
shi['Hero|Publisher'].head()

0            A-Bomb|Marvel Comics
1    Abe Sapien|Dark Horse Comics
2              Abin Sur|DC Comics
3       Abomination|Marvel Comics
4     Absorbing Man|Marvel Comics
Name: Hero|Publisher, dtype: object

In [8]:
shi['Hero|Publisher'].str.split(' ',expand=True)

Unnamed: 0,0,1,2,3
0,A-Bomb|Marvel,Comics,,
1,Abe,Sapien|Dark,Horse,Comics
2,Abin,Sur|DC,Comics,
3,Abomination|Marvel,Comics,,
4,Absorbing,Man|Marvel,Comics,
...,...,...,...,...
458,Yellowjacket|Marvel,Comics,,
459,Yellowjacket,II|Marvel,Comics,
460,Yoda|George,Lucas,,
461,Zatanna|DC,Comics,,


In [9]:
shp['hero_names'].head()

0        3-D Man
1         A-Bomb
2     Abe Sapien
3       Abin Sur
4    Abomination
Name: hero_names, dtype: object

In [10]:
shp['hero_names'].str.split(' ',expand=True)

Unnamed: 0,0,1,2
0,3-D,Man,
1,A-Bomb,,
2,Abe,Sapien,
3,Abin,Sur,
4,Abomination,,
...,...,...,...
662,Yellowjacket,II,
663,Ymir,,
664,Yoda,,
665,Zatanna,,


In [11]:
hero_df= pd.concat([shi, shp])

In [12]:
print(hero_df)

                   Hero|Publisher Gender               Race Alignment  \
0            A-Bomb|Marvel Comics   Male              Human      good   
1    Abe Sapien|Dark Horse Comics   Male      Icthyo Sapien      good   
2              Abin Sur|DC Comics   Male            Ungaran      good   
3       Abomination|Marvel Comics   Male  Human / Radiation       bad   
4     Absorbing Man|Marvel Comics   Male              Human       bad   
..                            ...    ...                ...       ...   
662                           NaN    NaN                NaN       NaN   
663                           NaN    NaN                NaN       NaN   
664                           NaN    NaN                NaN       NaN   
665                           NaN    NaN                NaN       NaN   
666                           NaN    NaN                NaN       NaN   

    Hair color Eye color Skin color  \
0      No Hair    yellow    Unknown   
1      No Hair      blue       blue   
2     

In [13]:
hero_df.head()

Unnamed: 0,Hero|Publisher,Gender,Race,Alignment,Hair color,Eye color,Skin color,Measurements,hero_names,Powers
0,A-Bomb|Marvel Comics,Male,Human,good,No Hair,yellow,Unknown,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}",,
1,Abe Sapien|Dark Horse Comics,Male,Icthyo Sapien,good,No Hair,blue,blue,"{'Height': '191.0 cm', 'Weight': '65.0 kg'}",,
2,Abin Sur|DC Comics,Male,Ungaran,good,No Hair,blue,red,"{'Height': '185.0 cm', 'Weight': '90.0 kg'}",,
3,Abomination|Marvel Comics,Male,Human / Radiation,bad,No Hair,green,Unknown,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}",,
4,Absorbing Man|Marvel Comics,Male,Human,bad,No Hair,blue,Unknown,"{'Height': '193.0 cm', 'Weight': '122.0 kg'}",,


In [14]:
hero_df['Measurements'].apply(pd.Series)

Unnamed: 0,0
0,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}"
1,"{'Height': '191.0 cm', 'Weight': '65.0 kg'}"
2,"{'Height': '185.0 cm', 'Weight': '90.0 kg'}"
3,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}"
4,"{'Height': '193.0 cm', 'Weight': '122.0 kg'}"
...,...
662,
663,
664,
665,


In [15]:
# Slice out a single test Measurement
test_ms = hero_df.loc[1,'Measurements']
test_ms

1    {'Height': '191.0 cm', 'Weight': '65.0 kg'}
1                                            NaN
Name: Measurements, dtype: object

In [16]:
test_ms.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_ms.dropna(inplace=True)


In [17]:
#json.loads(test_ms)

In [18]:
# Check the signle record
test_ms

1    {'Height': '191.0 cm', 'Weight': '65.0 kg'}
Name: Measurements, dtype: object

In [19]:
# replace single ' with "
test_ms = test_ms.replace("''", '"')
test_ms

1    {'Height': '191.0 cm', 'Weight': '65.0 kg'}
Name: Measurements, dtype: object

In [20]:
## replace ' with " (entire column)
hero_df['Measurements'] = hero_df['Measurements'].str.replace("'",'"')

In [21]:
## drop NaN values
hero_df.dropna(subset=['Measurements'], inplace=True)

In [22]:
## apply json.loads
hero_df['Measurements'] = hero_df['Measurements'].apply(json.loads)

In [23]:
## slice out a single test coordinate
test_ms = hero_df.loc[5,'Measurements']
type(test_ms)

dict

In [24]:
## use .apply pd.Series to convert a dict to columns
hero_df['Measurements'].apply(pd.Series)

Unnamed: 0,Height,Weight
0,203.0 cm,441.0 kg
1,191.0 cm,65.0 kg
2,185.0 cm,90.0 kg
3,203.0 cm,441.0 kg
4,193.0 cm,122.0 kg
...,...,...
458,183.0 cm,83.0 kg
459,165.0 cm,52.0 kg
460,66.0 cm,17.0 kg
461,170.0 cm,57.0 kg


In [25]:
## Concatenate the 2 new columns and drop the original.
hero_df = pd.concat([hero_df,hero_df['Measurements'].apply(pd.Series)],axis =1)
hero_df =hero_df.drop(columns= 'Measurements')
hero_df.head(2)

Unnamed: 0,Hero|Publisher,Gender,Race,Alignment,Hair color,Eye color,Skin color,hero_names,Powers,Height,Weight
0,A-Bomb|Marvel Comics,Male,Human,good,No Hair,yellow,Unknown,,,203.0 cm,441.0 kg
1,Abe Sapien|Dark Horse Comics,Male,Icthyo Sapien,good,No Hair,blue,blue,,,191.0 cm,65.0 kg


### **OHE abilities into columns**

In [26]:
powers = set()
for powers_list in hero_df['Powers'].str.split(','):
    if isinstance(powers_list, float):
        continue
    for powers in powers_list:
        powers.add(powers.strip())
        
for powers in powers:
    hero_df[powers] = hero_df['Powers'].apply(lambda x: int(powers in x.split(',')))

In [32]:
value_counts = hero_df['Powers'].value_counts(dropna=False)

In [33]:
print(hero_df['Powers'])

0      NaN
1      NaN
2      NaN
3      NaN
4      NaN
      ... 
458    NaN
459    NaN
460    NaN
461    NaN
462    NaN
Name: Powers, Length: 463, dtype: object


## **Part two:Use your combined DataFrame to answer the following questions.**

### **1Compare the average weight of super powers who have Super Speed to those who do not.**

In [37]:
# check for existence of 'Super Speed' in the columns
if 'Super Speed' in hero_df.columns:
    # calculate average weight of heroes with 'Super Speed'
    super_speed_avg_weight = np.mean(hero_df[hero_df['Super Speed'] == True]['Weight'])
    print(f"Average weight of heroes with 'Super Speed': {super_speed_avg_weight}")

    # calculate average weight of heroes without 'Super Speed'
    no_super_speed_avg_weight = np.mean(hero_df[hero_df['Super Speed'] == False]['Weight'])
    print(f"Average weight of heroes without 'Super Speed': {no_super_speed_avg_weight}")
else:
    print("'Super Speed' not found in columns")

'Super Speed' not found in columns


### **2What is the average height of heroes for each publisher?**

In [34]:
def convert_height_to_cm(height):
    if type(height) != str:
        return None
    try:
        height_cm = int(height.split()[0]) * 30.48 + int(height.split()[2]) * 2.54
        return height_cm
    except:
        return None

In [35]:
hero_df['Height'] = hero_df['Height'].str.replace(' cm', '').astype(float)

In [36]:
hero_df.groupby('Hero|Publisher')['Height'].mean()

Hero|Publisher
A-Bomb|Marvel Comics             203.0
Abe Sapien|Dark Horse Comics     191.0
Abin Sur|DC Comics               185.0
Abomination|Marvel Comics        203.0
Absorbing Man|Marvel Comics      193.0
                                 ...  
Yellowjacket II|Marvel Comics    165.0
Yellowjacket|Marvel Comics       183.0
Yoda|George Lucas                 66.0
Zatanna|DC Comics                170.0
Zoom|DC Comics                   185.0
Name: Height, Length: 457, dtype: float64