# Marvel Comic Characters

## Variable	Definition

page_id = identifier for characters page with the wikia

name 

urlslug = url that takes you to character within the wikia

id = identity status of a character

align = if the cahracter is good, bad, or neutral

eye 

hair

sex

gsm = if the character is a gender or sexual minority

appearances

first_appearance 

year

In [1]:
import numpy as np
import pandas as pd
pd.set_option('max_columns', None)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('ggplot')
plt.style.use("tableau-colorblind10")
import datetime
from scipy import stats
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import train_test_split, KFold
from wordcloud import WordCloud
from collections import Counter
from nltk.corpus import stopwords
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import StandardScaler
import nltk
nltk.download('stopwords')
stop = set(stopwords.words('english'))
import os
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import json
import ast
from urllib.request import urlopen
from PIL import Image

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/natashabedford/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:

#import new data, use time to see how long it takes to download data
%time df=pd.read_csv('../Marvel_Fun/fivethirtyeight-comic-characters-dataset/marvel-wikia-data.csv')
df.head()

CPU times: user 53.4 ms, sys: 12.4 ms, total: 65.8 ms
Wall time: 82.5 ms


Unnamed: 0,page_id,name,urlslug,ID,ALIGN,EYE,HAIR,SEX,GSM,ALIVE,APPEARANCES,FIRST APPEARANCE,Year
0,1678,Spider-Man (Peter Parker),\/Spider-Man_(Peter_Parker),Secret Identity,Good Characters,Hazel Eyes,Brown Hair,Male Characters,,Living Characters,4043.0,Aug-62,1962.0
1,7139,Captain America (Steven Rogers),\/Captain_America_(Steven_Rogers),Public Identity,Good Characters,Blue Eyes,White Hair,Male Characters,,Living Characters,3360.0,Mar-41,1941.0
2,64786,"Wolverine (James \""Logan\"" Howlett)",\/Wolverine_(James_%22Logan%22_Howlett),Public Identity,Neutral Characters,Blue Eyes,Black Hair,Male Characters,,Living Characters,3061.0,Oct-74,1974.0
3,1868,"Iron Man (Anthony \""Tony\"" Stark)",\/Iron_Man_(Anthony_%22Tony%22_Stark),Public Identity,Good Characters,Blue Eyes,Black Hair,Male Characters,,Living Characters,2961.0,Mar-63,1963.0
4,2460,Thor (Thor Odinson),\/Thor_(Thor_Odinson),No Dual Identity,Good Characters,Blue Eyes,Blond Hair,Male Characters,,Living Characters,2258.0,Nov-50,1950.0


In [3]:
#change the naming convention of the features
df.columns=[i.replace(' ', '_').lower() for i in df.columns]

In [4]:
#use a function to visualize the data

def data_brief(df):
    d_type = df.dtypes
    columns = pd.DataFrame(df.columns, d_type, columns = ['Features:']) #columns
    
    print(f'\nDataTypes:', columns)
    print(f'\nNum of Features:', len(df.columns))
    print(f'\nNum of Rows:', df.shape)
    print(f'\nIndex:', df.index.name)
    
    return

data_brief(df)    


DataTypes:                 Features:
int64             page_id
object               name
object            urlslug
object                 id
object              align
object                eye
object               hair
object                sex
object                gsm
object              alive
float64       appearances
object   first_appearance
float64              year

Num of Features: 13

Num of Rows: (16376, 13)

Index: None


Most of the data is categorical.  I'll use this information when making the statistical function to further inspect the data.

### Inspect basic statistics of the data

I prefer to create a function to analyze the data.  Usually, this function would be filled with heavy statistical information, but this data is mostly categorical. 

In [5]:
def df_analysis(df):
    eda_df={} #create an open df
    eda_df['count'] = df.count()
    eda_df['null ct'] = df.isnull().sum()
    eda_df['null %'] = df.isnull().mean()
    
    return pd.DataFrame(eda_df)

df_analysis(df)

Unnamed: 0,count,null ct,null %
page_id,16376,0,0.0
name,16376,0,0.0
urlslug,16376,0,0.0
id,12606,3770,0.230215
align,13564,2812,0.171715
eye,6609,9767,0.596422
hair,12112,4264,0.260381
sex,15522,854,0.052149
gsm,90,16286,0.994504
alive,16373,3,0.000183


Going to drop some columns that I think are pointless in where I'm going with my analysis.

In [6]:
df.drop(['page_id', 'urlslug', 'gsm'], axis=1)
df.sample(8)

Unnamed: 0,page_id,name,urlslug,id,align,eye,hair,sex,gsm,alive,appearances,first_appearance,year
2873,39501,Bowman (Earth-616),\/Bowman_(Earth-616),Secret Identity,Bad Characters,,,Male Characters,,Living Characters,12.0,Jun-05,2005.0
8580,470926,Mayer (Earth-616),\/Mayer_(Earth-616),Public Identity,Good Characters,,,Male Characters,,Living Characters,2.0,Dec-63,1963.0
2451,195265,Miles Warren (Clone) (Earth-616),\/Miles_Warren_(Clone)_(Earth-616),Secret Identity,Bad Characters,Yellow Eyes,No Hair,Male Characters,,Deceased Characters,14.0,Dec-78,1978.0
7587,2387,Garbha-Hsien (Earth-616),\/Garbha-Hsien_(Earth-616),Secret Identity,Good Characters,,White Hair,Male Characters,,Deceased Characters,3.0,May-92,1992.0
6979,282523,Barnabus Mullen (Earth-616),\/Barnabus_Mullen_(Earth-616),No Dual Identity,Good Characters,Blue Eyes,Bald,Male Characters,,Living Characters,3.0,Jan-40,1940.0
5782,535604,Daniel Alves (Earth-616),\/Daniel_Alves_(Earth-616),No Dual Identity,Good Characters,,,Male Characters,,Deceased Characters,5.0,Oct-11,2011.0
2942,70072,Mona Simpson (Earth-616),\/Mona_Simpson_(Earth-616),,,,Brown Hair,Female Characters,,Deceased Characters,11.0,Aug-72,1972.0
11389,518755,Julius Aaroni (Earth-616),\/Julius_Aaroni_(Earth-616),Public Identity,Bad Characters,,Black Hair,Male Characters,,Living Characters,1.0,Jun-46,1946.0


I can see that some of the columns are categorical and can be converted into numeric valaues
1. align 
2. sex 
3. alive

Going to remove all columns containing NULL in the 'align' column and convert the columns to numbers.

1.  evil
2.  good/neutral

# Exploratory Data Analysis

My favorite part :)

In [7]:
#create a function to analyze relationships

def tops(df, index, column):
    x = pd.DataFrame(df.groupby(index)[column].max())
    
    return x.sort_values(column, ascending=False)

In [8]:
#sexes of the characters
df.sex.value_counts()

Male Characters           11638
Female Characters          3837
Agender Characters           45
Genderfluid Characters        2
Name: sex, dtype: int64

In [9]:
df.eye.value_counts()

Blue Eyes          1962
Brown Eyes         1924
Green Eyes          613
Black Eyes          555
Red Eyes            508
White Eyes          400
Yellow Eyes         256
Grey Eyes            95
Hazel Eyes           76
Variable Eyes        49
Purple Eyes          31
Orange Eyes          25
One Eye              21
Pink Eyes            21
Gold Eyes            14
Silver Eyes          12
Violet Eyes          11
Amber Eyes           10
No Eyes               7
Multiple Eyes         7
Yellow Eyeballs       6
Black Eyeballs        3
Magenta Eyes          2
Compound Eyes         1
Name: eye, dtype: int64

In [11]:
characters_align = list(df['align'].value_counts().index)
characters_sex = list(df['sex'].value_counts().index)

female = []
males = []
agenders = []
gender_fluids = []

for each in characters_align:
    x = df[df['align'] == each]
    print(x['sex'].value_counts())
    
    women = len(x[x['sex'] == "Female Characters"])
    female.append(women)
    
    male = len(x[x['sex'] == "Male Characters"])
    males.append(male)
    
    agender = len(x[x['sex'] == "Agender Characters"])
    agenders.append(agender)
    
    gender_fluid = len(x[x['sex'] == "Genderfluid Characters"])
    gender_fluids.append(gender_fluid)
    
dict_list = {"Male": male, "Female": female, "Agender": agender, "Gender Fluid": gender_fluid}

df= pd.DataFrame(dict_list, index = characters_align)
df

Male Characters       5338
Female Characters      976
Agender Characters      20
Name: sex, dtype: int64
Male Characters           2966
Female Characters         1537
Agender Characters          10
Genderfluid Characters       1
Name: sex, dtype: int64
Male Characters           1440
Female Characters          640
Agender Characters          13
Genderfluid Characters       1
Name: sex, dtype: int64


Unnamed: 0,Male,Female,Agender,Gender Fluid
Bad Characters,1440,976,13,1
Good Characters,1440,1537,13,1
Neutral Characters,1440,640,13,1


In [None]:
genders = pd.DataFrame(df, '')

In [None]:
#encoding
#replace neutral and good characters to 0, bad characters to 1
#data[(data['Neutral Characters','Good Characters'] == 0) & data['Bad Characters']==1].head(3)

obj_df = data.select_dtypes(include=['object']).copy()
obj_df.head(3)