# Applying Advanced Transformations

In [19]:
## Standard Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Importing the OS and JSON Modules
import os,json

In [20]:
# Read the hero-info and hero-powers CSV exports into two DataFrames
names, power = map(
    pd.read_csv,
    (
        'Data/superhero_info - superhero_info.csv',
        'Data/superhero_powers - superhero_powers.csv',
    ),
)

## Clean the files and combine them into one final DataFrame.

In [21]:
# Display basic info for the names dataframe: column names, non-null counts
# and dtypes, to see what needs cleaning
names.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 463 entries, 0 to 462
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Hero|Publisher  463 non-null    object
 1   Gender          463 non-null    object
 2   Race            463 non-null    object
 3   Alignment       463 non-null    object
 4   Hair color      463 non-null    object
 5   Eye color       463 non-null    object
 6   Skin color      463 non-null    object
 7   Measurements    463 non-null    object
dtypes: object(8)
memory usage: 29.1+ KB


In [22]:
# Display basic info for the power dataframe: column names, non-null counts
# and dtypes
power.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 667 entries, 0 to 666
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   hero_names  667 non-null    object
 1   Powers      667 non-null    object
dtypes: object(2)
memory usage: 10.5+ KB


In [23]:
# Normalise the column labels of both DataFrames to lowercase
names.columns = names.columns.map(str.lower)
power.columns = power.columns.map(str.lower)

In [24]:
# Look at the first five values of the combined hero/publisher column
# to see which delimiter separates the two pieces
names['hero|publisher'].head()

0            A-Bomb|Marvel Comics
1    Abe Sapien|Dark Horse Comics
2              Abin Sur|DC Comics
3       Abomination|Marvel Comics
4     Absorbing Man|Marvel Comics
Name: hero|publisher, dtype: object

In [28]:
# Separate these pieces of information -- first attempt, splitting on spaces.
# The result shows this breaks multi-word names apart instead of separating
# the hero from the publisher.
names['hero|publisher'].str.split(pat=' ', expand=True)

Unnamed: 0,0,1,2,3
0,A-Bomb|Marvel,Comics,,
1,Abe,Sapien|Dark,Horse,Comics
2,Abin,Sur|DC,Comics,
3,Abomination|Marvel,Comics,,
4,Absorbing,Man|Marvel,Comics,
...,...,...,...,...
458,Yellowjacket|Marvel,Comics,,
459,Yellowjacket,II|Marvel,Comics,
460,Yoda|George,Lucas,,
461,Zatanna|DC,Comics,,


In [30]:
# Split on the first '|' only (n=1) so the result always has at most two
# columns; an unbounded split would produce extra columns, and the two-column
# assignment below would fail, if any value contained a second '|'
names[['hero','publisher']] = names['hero|publisher'].str.split('|', n=1, expand=True)

# Drop the original combined column now that it has been separated
names = names.drop(columns=['hero|publisher'])

# Display the new dataframe
names.head(2)

Unnamed: 0,gender,race,alignment,hair color,eye color,skin color,measurements,hero,publisher
0,Male,Human,good,No Hair,yellow,Unknown,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}",A-Bomb,Marvel Comics
1,Male,Icthyo Sapien,good,No Hair,blue,blue,"{'Height': '191.0 cm', 'Weight': '65.0 kg'}",Abe Sapien,Dark Horse Comics


In [31]:
# Grab the first measurements entry, print its Python type, and show its value
m = names["measurements"].loc[0]
print(type(m))
m

<class 'str'>


"{'Height': '203.0 cm', 'Weight': '441.0 kg'}"

In [32]:
# Try the quote swap on the single sample string: every single quote
# becomes a double quote, which is what JSON requires
m = '"'.join(m.split("'"))
m

'{"Height": "203.0 cm", "Weight": "441.0 kg"}'

In [33]:
# Swap every single quote for a double quote so each string is valid JSON,
# then parse each string into a dictionary with json.loads
names['measurements'] = (
    names['measurements']
    .str.replace("'", '"')
    .apply(json.loads)
)
names['measurements'].head()

0    {'Height': '203.0 cm', 'Weight': '441.0 kg'}
1     {'Height': '191.0 cm', 'Weight': '65.0 kg'}
2     {'Height': '185.0 cm', 'Weight': '90.0 kg'}
3    {'Height': '203.0 cm', 'Weight': '441.0 kg'}
4    {'Height': '193.0 cm', 'Weight': '122.0 kg'}
Name: measurements, dtype: object

In [37]:
# Convert each key in the measurements dictionaries to a new column.
# Building the frame once from the list of dicts avoids creating a separate
# pd.Series for every row, which is what .apply(pd.Series) does; passing the
# original index keeps the rows aligned with `names` for the later concat.
hw = pd.DataFrame(names['measurements'].tolist(), index=names.index)
hw

Unnamed: 0,Height,Weight
0,203.0 cm,441.0 kg
1,191.0 cm,65.0 kg
2,185.0 cm,90.0 kg
3,203.0 cm,441.0 kg
4,193.0 cm,122.0 kg
...,...,...
458,183.0 cm,83.0 kg
459,165.0 cm,52.0 kg
460,66.0 cm,17.0 kg
461,170.0 cm,57.0 kg


In [38]:
# Attach the Height/Weight columns to the right-hand side of the names frame
names = pd.concat([names, hw], axis='columns')
names.head(2)

Unnamed: 0,gender,race,alignment,hair color,eye color,skin color,measurements,hero,publisher,Height,Weight
0,Male,Human,good,No Hair,yellow,Unknown,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}",A-Bomb,Marvel Comics,203.0 cm,441.0 kg
1,Male,Icthyo Sapien,good,No Hair,blue,blue,"{'Height': '191.0 cm', 'Weight': '65.0 kg'}",Abe Sapien,Dark Horse Comics,191.0 cm,65.0 kg


In [42]:
# Drop the measurements column; its contents now live in Height and Weight
names = names.drop('measurements', axis='columns')

In [43]:
# Display the updated info to confirm the new hero, publisher, Height and
# Weight columns are present (all still object dtype at this point)
names.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 463 entries, 0 to 462
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   gender      463 non-null    object
 1   race        463 non-null    object
 2   alignment   463 non-null    object
 3   hair color  463 non-null    object
 4   eye color   463 non-null    object
 5   skin color  463 non-null    object
 6   hero        463 non-null    object
 7   publisher   463 non-null    object
 8   Height      463 non-null    object
 9   Weight      463 non-null    object
dtypes: object(10)
memory usage: 36.3+ KB


In [46]:
# Display the distinct Height values and their frequencies; every value is a
# string ending in " cm", and some are non-integers such as 62.5
names['Height'].value_counts()

183.0 cm    53
188.0 cm    44
180.0 cm    37
185.0 cm    34
178.0 cm    33
175.0 cm    32
168.0 cm    27
165.0 cm    26
170.0 cm    19
193.0 cm    18
191.0 cm    18
198.0 cm    18
173.0 cm    16
196.0 cm    10
201.0 cm    10
163.0 cm     8
203.0 cm     5
157.0 cm     5
213.0 cm     5
211.0 cm     4
244.0 cm     4
218.0 cm     3
155.0 cm     3
229.0 cm     3
137.0 cm     2
366.0 cm     2
206.0 cm     2
305.0 cm     2
122.0 cm     2
226.0 cm     2
279.0 cm     1
234.0 cm     1
15.2 cm      1
160.0 cm     1
64.0 cm      1
259.0 cm     1
287.0 cm     1
71.0 cm      1
701.0 cm     1
62.5 cm      1
876.0 cm     1
142.0 cm     1
975.0 cm     1
267.0 cm     1
257.0 cm     1
66.0 cm      1
Name: Height, dtype: int64

In [47]:
# Display the distinct Weight values and their frequencies; every value is a
# string ending in " kg"
names['Weight'].value_counts()

81.0 kg     22
79.0 kg     21
54.0 kg     20
90.0 kg     19
86.0 kg     15
            ..
268.0 kg     1
16.0 kg      1
167.0 kg     1
45.0 kg      1
17.0 kg      1
Name: Weight, Length: 128, dtype: int64

In [48]:
# Create a new height column without the unit: remove the literal "cm"
# (regex=False keeps the match literal regardless of the pandas version's
# default) and strip the space that separated the number from the unit
names['height (cm)'] = names['Height'].str.replace("cm", '', regex=False).str.strip()

In [49]:
# Create a new weight column without the unit: remove the literal "kg"
# (regex=False keeps the match literal regardless of the pandas version's
# default) and strip the space that separated the number from the unit
names['weight (kg)'] = names['Weight'].str.replace("kg", '', regex=False).str.strip()

In [54]:
# Remove the original string-valued Height and Weight columns
names = names.drop(['Height', 'Weight'], axis='columns')

In [63]:
# Convert the height column from strings to floats (not integers: the data
# contains fractional heights such as 62.5)
names = names.astype({'height (cm)': float})

In [64]:
# Convert the weight column from strings to floats
names = names.astype({'weight (kg)': float})

#### Note: after looking back at the LP, I realized I could have used the loop code listed in step 2 of "Advanced Transformations" instead.

In [65]:
names.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 463 entries, 0 to 462
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   gender       463 non-null    object 
 1   race         463 non-null    object 
 2   alignment    463 non-null    object 
 3   hair color   463 non-null    object 
 4   eye color    463 non-null    object 
 5   skin color   463 non-null    object 
 6   hero         463 non-null    object 
 7   publisher    463 non-null    object 
 8   height (cm)  463 non-null    float64
 9   weight (kg)  463 non-null    float64
dtypes: float64(2), object(8)
memory usage: 36.3+ KB


In [66]:
names.head(5)

Unnamed: 0,gender,race,alignment,hair color,eye color,skin color,hero,publisher,height (cm),weight (kg)
0,Male,Human,good,No Hair,yellow,Unknown,A-Bomb,Marvel Comics,203.0,441.0
1,Male,Icthyo Sapien,good,No Hair,blue,blue,Abe Sapien,Dark Horse Comics,191.0,65.0
2,Male,Ungaran,good,No Hair,blue,red,Abin Sur,DC Comics,185.0,90.0
3,Male,Human / Radiation,bad,No Hair,green,Unknown,Abomination,Marvel Comics,203.0,441.0
4,Male,Human,bad,No Hair,blue,Unknown,Absorbing Man,Marvel Comics,193.0,122.0
