# Organising

In [1]:
import numpy as np
import pandas as pd

## The `pandas.DataFrame` object

In [2]:
# One dict per state; every dict carries the same three keys, which become
# the columns of the DataFrame built below.
data = [
    dict(state=name, area=area, population=population)
    for name, area, population in (
        ('California', 423967, 38332521),
        ('Florida', 170312, 19552860),
        ('Illinois', 149995, 12882135),
        ('New York', 141297, 19651127),
        ('Texas', 695662, 26448193),
    )
]

states = pd.DataFrame(data)
states

Unnamed: 0,area,population,state
0,423967,38332521,California
1,170312,19552860,Florida
2,149995,12882135,Illinois
3,141297,19651127,New York
4,695662,26448193,Texas


*Notes: [There are many ways to construct DataFrames](https://github.com/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/03.01-Introducing-Pandas-Objects.ipynb) (see "Constructing DataFrame objects"), and you can also [read data from files](https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html). In most cases rows should be either independent samples (also known as [tidy format](http://vita.had.co.nz/papers/tidy-data.pdf)) or timestamps.*

### Loading from file

[Comic characters dataset from fivethirtyeight](https://github.com/fivethirtyeight/data/tree/master/comic-characters).

In [3]:
df = pd.read_csv('data/dc-wikia-data.csv')

In [4]:
df

Unnamed: 0,page_id,name,urlslug,ID,ALIGN,EYE,HAIR,SEX,GSM,ALIVE,APPEARANCES,FIRST APPEARANCE,YEAR
0,1422,Batman (Bruce Wayne),\/wiki\/Batman_(Bruce_Wayne),Secret Identity,Good Characters,Blue Eyes,Black Hair,Male Characters,,Living Characters,3093.0,"1939, May",1939.0
1,23387,Superman (Clark Kent),\/wiki\/Superman_(Clark_Kent),Secret Identity,Good Characters,Blue Eyes,Black Hair,Male Characters,,Living Characters,2496.0,"1986, October",1986.0
2,1458,Green Lantern (Hal Jordan),\/wiki\/Green_Lantern_(Hal_Jordan),Secret Identity,Good Characters,Brown Eyes,Brown Hair,Male Characters,,Living Characters,1565.0,"1959, October",1959.0
3,1659,James Gordon (New Earth),\/wiki\/James_Gordon_(New_Earth),Public Identity,Good Characters,Brown Eyes,White Hair,Male Characters,,Living Characters,1316.0,"1987, February",1987.0
4,1576,Richard Grayson (New Earth),\/wiki\/Richard_Grayson_(New_Earth),Secret Identity,Good Characters,Blue Eyes,Black Hair,Male Characters,,Living Characters,1237.0,"1940, April",1940.0
5,1448,Wonder Woman (Diana Prince),\/wiki\/Wonder_Woman_(Diana_Prince),Public Identity,Good Characters,Blue Eyes,Black Hair,Female Characters,,Living Characters,1231.0,"1941, December",1941.0
6,1486,Aquaman (Arthur Curry),\/wiki\/Aquaman_(Arthur_Curry),Public Identity,Good Characters,Blue Eyes,Blond Hair,Male Characters,,Living Characters,1121.0,"1941, November",1941.0
7,1451,Timothy Drake (New Earth),\/wiki\/Timothy_Drake_(New_Earth),Secret Identity,Good Characters,Blue Eyes,Black Hair,Male Characters,,Living Characters,1095.0,"1989, August",1989.0
8,71760,Dinah Laurel Lance (New Earth),\/wiki\/Dinah_Laurel_Lance_(New_Earth),Public Identity,Good Characters,Blue Eyes,Blond Hair,Female Characters,,Living Characters,1075.0,"1969, November",1969.0
9,1380,Flash (Barry Allen),\/wiki\/Flash_(Barry_Allen),Secret Identity,Good Characters,Blue Eyes,Blond Hair,Male Characters,,Living Characters,1028.0,"1956, October",1956.0


*Notes: [you can read and write files in many formats](https://pandas.pydata.org/pandas-docs/stable/io.html). `read_csv` (and other variants) can also read directly from a url.*

## Inspecting DataFrames

In [7]:
df.head()

Unnamed: 0,page_id,name,urlslug,ID,ALIGN,EYE,HAIR,SEX,GSM,ALIVE,APPEARANCES,FIRST APPEARANCE,YEAR
0,1422,Batman (Bruce Wayne),\/wiki\/Batman_(Bruce_Wayne),Secret Identity,Good Characters,Blue Eyes,Black Hair,Male Characters,,Living Characters,3093.0,"1939, May",1939.0
1,23387,Superman (Clark Kent),\/wiki\/Superman_(Clark_Kent),Secret Identity,Good Characters,Blue Eyes,Black Hair,Male Characters,,Living Characters,2496.0,"1986, October",1986.0
2,1458,Green Lantern (Hal Jordan),\/wiki\/Green_Lantern_(Hal_Jordan),Secret Identity,Good Characters,Brown Eyes,Brown Hair,Male Characters,,Living Characters,1565.0,"1959, October",1959.0
3,1659,James Gordon (New Earth),\/wiki\/James_Gordon_(New_Earth),Public Identity,Good Characters,Brown Eyes,White Hair,Male Characters,,Living Characters,1316.0,"1987, February",1987.0
4,1576,Richard Grayson (New Earth),\/wiki\/Richard_Grayson_(New_Earth),Secret Identity,Good Characters,Blue Eyes,Black Hair,Male Characters,,Living Characters,1237.0,"1940, April",1940.0


In [8]:
df.tail()

Unnamed: 0,page_id,name,urlslug,ID,ALIGN,EYE,HAIR,SEX,GSM,ALIVE,APPEARANCES,FIRST APPEARANCE,YEAR
6891,66302,Nadine West (New Earth),\/wiki\/Nadine_West_(New_Earth),Public Identity,Good Characters,,,Female Characters,,Living Characters,,,
6892,283475,Warren Harding (New Earth),\/wiki\/Warren_Harding_(New_Earth),Public Identity,Good Characters,,,Male Characters,,Living Characters,,,
6893,283478,William Harrison (New Earth),\/wiki\/William_Harrison_(New_Earth),Public Identity,Good Characters,,,Male Characters,,Living Characters,,,
6894,283471,William McKinley (New Earth),\/wiki\/William_McKinley_(New_Earth),Public Identity,Good Characters,,,Male Characters,,Living Characters,,,
6895,150660,Mookie (New Earth),\/wiki\/Mookie_(New_Earth),Public Identity,Bad Characters,Blue Eyes,Blond Hair,Male Characters,,Living Characters,,,


In [9]:
df.dtypes

page_id               int64
name                 object
urlslug              object
ID                   object
ALIGN                object
EYE                  object
HAIR                 object
SEX                  object
GSM                  object
ALIVE                object
APPEARANCES         float64
FIRST APPEARANCE     object
YEAR                float64
dtype: object

In [10]:
df.describe()

Unnamed: 0,page_id,APPEARANCES,YEAR
count,6896.0,6541.0,6827.0
mean,147441.209252,23.625134,1989.766662
std,108388.631149,87.378509,16.824194
min,1380.0,1.0,1935.0
25%,44105.5,2.0,1983.0
50%,141267.0,6.0,1992.0
75%,213203.0,15.0,2003.0
max,404010.0,3093.0,2013.0


In [11]:
# len() of a DataFrame is its number of rows; .shape is the (rows, columns) tuple.
print('len:', len(df))
print('shape:', df.shape)

len: 6896
shape: (6896, 13)


## Selecting

In [12]:
df['name']

0                  Batman (Bruce Wayne)
1                 Superman (Clark Kent)
2            Green Lantern (Hal Jordan)
3              James Gordon (New Earth)
4           Richard Grayson (New Earth)
5           Wonder Woman (Diana Prince)
6                Aquaman (Arthur Curry)
7             Timothy Drake (New Earth)
8        Dinah Laurel Lance (New Earth)
9                   Flash (Barry Allen)
10                           GenderTest
11               Alan Scott (New Earth)
12           Barbara Gordon (New Earth)
13            Jason Garrick (New Earth)
14                Lois Lane (New Earth)
15        Alfred Pennyworth (New Earth)
16              Carter Hall (New Earth)
17              Kyle Rayner (New Earth)
18           Raymond Palmer (New Earth)
19         Alexander Luthor (New Earth)
20               Roy Harper (New Earth)
21               Kara Zor-L (Earth-Two)
22                Ted Grant (New Earth)
23           Garfield Logan (New Earth)
24              Guy Gardner (New Earth)


In [13]:
# Attribute access gives the same column as df['name'], but be careful: it does
# not work for columns whose name clashes with a DataFrame method/attribute or
# is not a valid Python identifier (e.g. 'FIRST APPEARANCE').
df.name

0                  Batman (Bruce Wayne)
1                 Superman (Clark Kent)
2            Green Lantern (Hal Jordan)
3              James Gordon (New Earth)
4           Richard Grayson (New Earth)
5           Wonder Woman (Diana Prince)
6                Aquaman (Arthur Curry)
7             Timothy Drake (New Earth)
8        Dinah Laurel Lance (New Earth)
9                   Flash (Barry Allen)
10                           GenderTest
11               Alan Scott (New Earth)
12           Barbara Gordon (New Earth)
13            Jason Garrick (New Earth)
14                Lois Lane (New Earth)
15        Alfred Pennyworth (New Earth)
16              Carter Hall (New Earth)
17              Kyle Rayner (New Earth)
18           Raymond Palmer (New Earth)
19         Alexander Luthor (New Earth)
20               Roy Harper (New Earth)
21               Kara Zor-L (Earth-Two)
22                Ted Grant (New Earth)
23           Garfield Logan (New Earth)
24              Guy Gardner (New Earth)


In [17]:
df['ALIGN'].value_counts()  # Great for making sense of categorical columns

Bad Characters        2895
Good Characters       2832
Neutral Characters     565
Reformed Criminals       3
Name: ALIGN, dtype: int64

*Note: columns are `pandas.Series` objects.*

In [14]:
df[['name', 'YEAR']]

Unnamed: 0,name,YEAR
0,Batman (Bruce Wayne),1939.0
1,Superman (Clark Kent),1986.0
2,Green Lantern (Hal Jordan),1959.0
3,James Gordon (New Earth),1987.0
4,Richard Grayson (New Earth),1940.0
5,Wonder Woman (Diana Prince),1941.0
6,Aquaman (Arthur Curry),1941.0
7,Timothy Drake (New Earth),1989.0
8,Dinah Laurel Lance (New Earth),1969.0
9,Flash (Barry Allen),1956.0


Boolean indexing

In [19]:
df['ALIGN'].value_counts()

Bad Characters        2895
Good Characters       2832
Neutral Characters     565
Reformed Criminals       3
Name: ALIGN, dtype: int64

In [24]:
df['ALIGN'] == 'Bad Characters'

0       False
1       False
2       False
3       False
4       False
5       False
6       False
7       False
8       False
9       False
10      False
11      False
12      False
13      False
14      False
15      False
16      False
17      False
18      False
19       True
20      False
21      False
22      False
23      False
24      False
25      False
26      False
27      False
28      False
29      False
        ...  
6866     True
6867    False
6868    False
6869    False
6870    False
6871     True
6872     True
6873    False
6874     True
6875    False
6876    False
6877    False
6878    False
6879     True
6880    False
6881     True
6882    False
6883    False
6884     True
6885    False
6886    False
6887    False
6888    False
6889    False
6890    False
6891    False
6892    False
6893    False
6894    False
6895     True
Name: ALIGN, Length: 6896, dtype: bool

In [25]:
df[df['ALIGN'] == 'Bad Characters']

Unnamed: 0,page_id,name,urlslug,ID,ALIGN,EYE,HAIR,SEX,GSM,ALIVE,APPEARANCES,FIRST APPEARANCE,YEAR
19,1480,Alexander Luthor (New Earth),\/wiki\/Alexander_Luthor_(New_Earth),Public Identity,Bad Characters,Green Eyes,,Male Characters,,Living Characters,677.0,"1986, October",1986.0
30,1455,Joker (New Earth),\/wiki\/Joker_(New_Earth),Secret Identity,Bad Characters,Green Eyes,Green Hair,Male Characters,,Living Characters,517.0,"1940, June",1940.0
44,1514,Uxas (New Earth),\/wiki\/Uxas_(New_Earth),Public Identity,Bad Characters,Red Eyes,,Male Characters,,Deceased Characters,388.0,"1970, December",1970.0
66,1471,Harvey Dent (New Earth),\/wiki\/Harvey_Dent_(New_Earth),Public Identity,Bad Characters,Blue Eyes,Brown Hair,Male Characters,,Living Characters,306.0,"1942, August",1942.0
107,1675,Jonathan Crane (New Earth),\/wiki\/Jonathan_Crane_(New_Earth),Public Identity,Bad Characters,Blue Eyes,Brown Hair,Male Characters,,Living Characters,222.0,"1941, September",1941.0
135,37701,Waylon Jones (New Earth),\/wiki\/Waylon_Jones_(New_Earth),Public Identity,Bad Characters,Red Eyes,,Male Characters,,Living Characters,180.0,"1983, March",1983.0
137,1518,Gorilla Grodd (New Earth),\/wiki\/Gorilla_Grodd_(New_Earth),Public Identity,Bad Characters,Grey Eyes,Black Hair,Male Characters,,Living Characters,179.0,"1959, May",1959.0
138,4778,Talia al Ghul (New Earth),\/wiki\/Talia_al_Ghul_(New_Earth),Secret Identity,Bad Characters,Brown Eyes,Brown Hair,Female Characters,,Deceased Characters,177.0,"1971, May",1971.0
148,1583,Cyrus Gold (New Earth),\/wiki\/Cyrus_Gold_(New_Earth),Secret Identity,Bad Characters,Black Eyes,White Hair,Male Characters,,Deceased Characters,174.0,"1944, October",1944.0
155,1482,Jean Loring (New Earth),\/wiki\/Jean_Loring_(New_Earth),Public Identity,Bad Characters,Green Eyes,Black Hair,Female Characters,,Deceased Characters,168.0,"1961, October",1961.0


In [27]:
df[df['YEAR'] > 2012]

Unnamed: 0,page_id,name,urlslug,ID,ALIGN,EYE,HAIR,SEX,GSM,ALIVE,APPEARANCES,FIRST APPEARANCE,YEAR
5539,85254,Springheeled Jack (Prime Earth),\/wiki\/Springheeled_Jack_(Prime_Earth),Secret Identity,Bad Characters,,,Male Characters,,Living Characters,1.0,"2013, October",2013.0


In [None]:
df[df['SEX'].isin(['Male Characters', 'Female Characters'])]

In [31]:
# Rows must satisfy both conditions; each comparison is parenthesised because
# `&` binds tighter than `>` and `==`.
df.loc[
    (df['YEAR'] > 2011)
    & (df['ALIGN'] == 'Bad Characters')
]

Unnamed: 0,page_id,name,urlslug,ID,ALIGN,EYE,HAIR,SEX,GSM,ALIVE,APPEARANCES,FIRST APPEARANCE,YEAR
4324,379078,Ragnar (Green Lantern Animated Series),\/wiki\/Ragnar_(Green_Lantern_Animated_Series),Public Identity,Bad Characters,,,Male Characters,,Living Characters,3.0,"2012, March",2012.0
5539,85254,Springheeled Jack (Prime Earth),\/wiki\/Springheeled_Jack_(Prime_Earth),Secret Identity,Bad Characters,,,Male Characters,,Living Characters,1.0,"2013, October",2013.0
5540,309868,Napalm (Prime Earth),\/wiki\/Napalm_(Prime_Earth),Secret Identity,Bad Characters,,,Male Characters,,Deceased Characters,1.0,"2012, June",2012.0
6541,306472,Matteo Bischoff (New Earth),\/wiki\/Matteo_Bischoff_(New_Earth),Secret Identity,Bad Characters,,Grey Hair,Male Characters,,Living Characters,,"2012, May",2012.0


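*Note: this is the only place you should really use bitwise `&` and `|`. Wrap each comparison in parentheses, because `&` and `|` bind more tightly than `>` or `==`.*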

## DataFrame operations

In [32]:
states

Unnamed: 0,area,population,state
0,423967,38332521,California
1,170312,19552860,Florida
2,149995,12882135,Illinois
3,141297,19651127,New York
4,695662,26448193,Texas


In [33]:
states['population'] / 1000000

0    38.332521
1    19.552860
2    12.882135
3    19.651127
4    26.448193
Name: population, dtype: float64

In [34]:
states['population'] / states['area']

0     90.413926
1    114.806121
2     85.883763
3    139.076746
4     38.018740
dtype: float64

For common functions beyond simple math operators (e.g. log, sin, etc.) we use [numpy ufuncs](https://docs.scipy.org/doc/numpy-1.13.0/reference/ufuncs.html#available-ufuncs).

In [35]:
np.log(states['area'])

0    12.957411
1    12.045387
2    11.918357
3    11.858619
4    13.452619
Name: area, dtype: float64

Applying a custom function row by row is the slowest option, but it is handy.

In [37]:
def is_over_populated(row, max_population=30000000, max_density=100):
    """Tell whether a row exceeds the population or the density threshold.

    Parameters
    ----------
    row : mapping-like
        Any object indexable by the keys ``'population'`` and ``'area'``,
        such as the row handed over by ``DataFrame.apply(..., axis='columns')``.
    max_population : number, optional
        A population strictly above this value marks the row as over
        populated. Defaults to 30000000.
    max_density : number, optional
        A ``population / area`` ratio strictly above this value marks the row
        as over populated. Defaults to 100.

    Returns
    -------
    bool
        True if either threshold is exceeded, False otherwise.
    """
    # Checked first: rows that already qualify never reach the division below.
    if row['population'] > max_population:
        return True
    density = row['population'] / row['area']
    # bool() keeps the return type a plain Python bool even for numpy scalars.
    return bool(density > max_density)

In [39]:
states.apply(is_over_populated, axis='columns')

0     True
1     True
2    False
3     True
4    False
dtype: bool

In [40]:
states

Unnamed: 0,area,population,state
0,423967,38332521,California
1,170312,19552860,Florida
2,149995,12882135,Illinois
3,141297,19651127,New York
4,695662,26448193,Texas


In [43]:
states['density'] = states['population'] / states['area']
states

Unnamed: 0,area,population,state,density,debt
0,423967,38332521,California,90.413926,16.5
1,170312,19552860,Florida,114.806121,16.5
2,149995,12882135,Illinois,85.883763,16.5
3,141297,19651127,New York,139.076746,16.5
4,695662,26448193,Texas,38.01874,16.5


In [46]:
# Rescale the population column to millions. The column is overwritten, so
# running this cell a second time divides by 1000000 again.
states['population'] = states['population'] / 1000000

In [51]:
# `drop` returns a new DataFrame without the given column. The result is only
# displayed here, not assigned back, so `states` keeps its `population`
# column for the cells that follow. (The label must match an existing column
# exactly; a misspelled one raises KeyError.)
states.drop('population', axis='columns')

In [52]:
states

Unnamed: 0,area,population,state,density,debt
0,423967,38.332521,California,90.413926,16.5
1,170312,19.55286,Florida,114.806121,16.5
2,149995,12.882135,Illinois,85.883763,16.5
3,141297,19.651127,New York,139.076746,16.5
4,695662,26.448193,Texas,38.01874,16.5


In [42]:
states['debt'] = 16.5
states

Unnamed: 0,area,population,state,density,debt
0,423967,38332521,California,90.413926,16.5
1,170312,19552860,Florida,114.806121,16.5
2,149995,12882135,Illinois,85.883763,16.5
3,141297,19651127,New York,139.076746,16.5
4,695662,26448193,Texas,38.01874,16.5


In [53]:
states.sort_values('density')

Unnamed: 0,area,population,state,density,debt
4,695662,26.448193,Texas,38.01874,16.5
2,149995,12.882135,Illinois,85.883763,16.5
0,423967,38.332521,California,90.413926,16.5
1,170312,19.55286,Florida,114.806121,16.5
3,141297,19.651127,New York,139.076746,16.5


In [54]:
states.rename(columns={'population': 'pop'})

Unnamed: 0,area,pop,state,density,debt
0,423967,38.332521,California,90.413926,16.5
1,170312,19.55286,Florida,114.806121,16.5
2,149995,12.882135,Illinois,85.883763,16.5
3,141297,19.651127,New York,139.076746,16.5
4,695662,26.448193,Texas,38.01874,16.5


In [55]:
states.rename(columns=str.upper)

Unnamed: 0,AREA,POPULATION,STATE,DENSITY,DEBT
0,423967,38.332521,California,90.413926,16.5
1,170312,19.55286,Florida,114.806121,16.5
2,149995,12.882135,Illinois,85.883763,16.5
3,141297,19.651127,New York,139.076746,16.5
4,695662,26.448193,Texas,38.01874,16.5


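*Note: methods such as `drop`, `sort_values` and `rename` return new DataFrames; they don't change the original in place. Assigning to a column (e.g. `states['density'] = ...`) does modify the DataFrame.*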

## *Exercise: Let's clean our comics dataset!*

In [56]:
def first_word(text):
    """Return the first whitespace-separated word of ``text``.

    Values that are not strings (e.g. the NaN used for missing entries) and
    strings that contain no word at all (empty or whitespace-only) are
    returned unchanged.
    """
    if not isinstance(text, str):  # Because we can't split NaN
        return text
    words = text.split()
    # ''.split() is an empty list, so guard the index instead of raising IndexError.
    return words[0] if words else text

In [59]:
df['SEX'].value_counts()

Male Characters           4783
Female Characters         1967
Genderless Characters       20
Transgender Characters       1
Name: SEX, dtype: int64

In [58]:
df['SEX'].apply(first_word)

0         Male
1         Male
2         Male
3         Male
4         Male
5       Female
6         Male
7         Male
8       Female
9         Male
10      Female
11        Male
12      Female
13        Male
14      Female
15        Male
16        Male
17        Male
18        Male
19        Male
20        Male
21      Female
22        Male
23        Male
24        Male
25        Male
26        Male
27        Male
28        Male
29        Male
         ...  
6866      Male
6867      Male
6868      Male
6869      Male
6870      Male
6871    Female
6872      Male
6873    Female
6874      Male
6875      Male
6876      Male
6877      Male
6878    Female
6879      Male
6880      Male
6881    Female
6882    Female
6883      Male
6884      Male
6885    Female
6886      Male
6887      Male
6888      Male
6889      Male
6890      Male
6891    Female
6892      Male
6893      Male
6894      Male
6895      Male
Name: SEX, Length: 6896, dtype: object

1. Apply the `first_word` function to the columns: `['ID', 'ALIGN', 'EYE', 'HAIR', 'SEX', 'GSM', 'ALIVE']`. Set the result into the same column.
1. Rename the columns to lower case letters. Hint: use the `str.lower` function.
1. Check your result using `df.head()`.
1. *If you have time*: change the `alive` column value to be boolean.
1. Save it to a new file with `to_csv`.

In [90]:
df = pd.read_csv('data/dc-wikia-data.csv')

In [91]:
columns = ['ID', 'ALIGN', 'EYE', 'HAIR', 'SEX', 'GSM', 'ALIVE']

In [92]:
# Replace every listed column by the first word of each of its values.
for column in columns:
    df[column] = df[column].map(first_word)

In [98]:
df = df.rename(columns=str.lower)

In [95]:
# Element-wise comparison: True only for 'Living'; every other value,
# including missing ones, becomes False.
df['alive'] = df['alive'].eq('Living')

In [97]:
df.to_csv('output.csv', index=False)

## Handling missing data

In [None]:
df = pd.read_csv('data/dc-wikia-data-clean.csv')

Checks

In [109]:
df[df['year'].isnull()]

Unnamed: 0,page_id,name,urlslug,id,align,eye,hair,sex,gsm,alive,appearances,first appearance,year
386,1891,Jakeem Williams (New Earth),\/wiki\/Jakeem_Williams_(New_Earth),Secret,,Brown,,Male,,True,79.0,,
1400,64303,Hadley Jaggar (New Earth),\/wiki\/Hadley_Jaggar_(New_Earth),Secret,Good,Blue,Blond,Male,,False,19.0,,
1401,13097,Nergal (New Earth),\/wiki\/Nergal_(New_Earth),,Bad,Yellow,,Male,,True,19.0,,
1832,65286,Gregory Wolfe (New Earth),\/wiki\/Gregory_Wolfe_(New_Earth),Public,Neutral,Brown,Black,Male,,True,14.0,,
1937,146333,Clarence Charles Batson V (New Earth),\/wiki\/Clarence_Charles_Batson_V_(New_Earth),Public,Good,,Black,Male,,False,13.0,,
1938,113413,Chad Graham (New Earth),\/wiki\/Chad_Graham_(New_Earth),Secret,Bad,,Blond,Male,,False,13.0,,
2065,344513,Jupiter (New Earth),\/wiki\/Jupiter_(New_Earth),,Good,,White,Male,,True,12.0,,
2066,344983,Pegasus (New Earth),\/wiki\/Pegasus_(New_Earth),,Good,,Black,Female,,True,12.0,,
2067,286906,Asteroth (New Earth),\/wiki\/Asteroth_(New_Earth),Secret,Bad,Yellow,Black,Male,,True,12.0,,
2230,155569,Red Panzer IV (New Earth),\/wiki\/Red_Panzer_IV_(New_Earth),Secret,Bad,,,Male,,True,11.0,,


In [110]:
df['year'].notnull()

0        True
1        True
2        True
3        True
4        True
5        True
6        True
7        True
8        True
9        True
10       True
11       True
12       True
13       True
14       True
15       True
16       True
17       True
18       True
19       True
20       True
21       True
22       True
23       True
24       True
25       True
26       True
27       True
28       True
29       True
        ...  
6866     True
6867     True
6868     True
6869     True
6870     True
6871     True
6872     True
6873     True
6874     True
6875     True
6876     True
6877     True
6878     True
6879     True
6880     True
6881     True
6882     True
6883     True
6884     True
6885     True
6886     True
6887    False
6888    False
6889    False
6890    False
6891    False
6892    False
6893    False
6894    False
6895    False
Name: year, Length: 6896, dtype: bool

Filtering out

In [113]:
df.dropna(subset=['gsm', 'year'])

Unnamed: 0,page_id,name,urlslug,id,align,eye,hair,sex,gsm,alive,appearances,first appearance,year
48,1704,John Constantine (New Earth),\/wiki\/John_Constantine_(New_Earth),Public,Good,Blue,Blond,Male,Bisexual,True,371.0,"1984, June",1984.0
65,8856,Renee Montoya (New Earth),\/wiki\/Renee_Montoya_(New_Earth),Secret,Good,Brown,Black,Female,Homosexual,True,308.0,"1992, March",1992.0
119,1862,Todd Rice (New Earth),\/wiki\/Todd_Rice_(New_Earth),Public,Good,Brown,Brown,Male,Homosexual,True,208.0,"1983, September",1983.0
134,1658,Margaret Sawyer (New Earth),\/wiki\/Margaret_Sawyer_(New_Earth),Public,,Blue,,Female,Homosexual,True,180.0,"1987, April",1987.0
173,1597,Hartley Rathaway (New Earth),\/wiki\/Hartley_Rathaway_(New_Earth),Secret,Good,Blue,Red,Male,Homosexual,True,160.0,"1959, May",1959.0
253,10139,Holly Robinson (New Earth),\/wiki\/Holly_Robinson_(New_Earth),Secret,Good,Blue,Red,Female,Homosexual,True,114.0,"1987, February",1987.0
292,1614,Grace Choi (New Earth),\/wiki\/Grace_Choi_(New_Earth),Secret,Good,Brown,Red,Female,Bisexual,True,100.0,"2003, August",2003.0
301,16693,Tef\u00e9 Holland (New Earth),\/wiki\/Tef%C3%A9_Holland_(New_Earth),,Bad,Blue,White,Female,Bisexual,True,97.0,"1989, December",1989.0
315,1618,Anissa Pierce (New Earth),\/wiki\/Anissa_Pierce_(New_Earth),Secret,Good,Brown,Black,Female,Homosexual,True,92.0,"2003, August",2003.0
339,1689,Mikaal Tomas (New Earth),\/wiki\/Mikaal_Tomas_(New_Earth),Secret,Good,Blue,Purple,Male,Homosexual,True,87.0,"1976, March",1976.0


Filling empty values

In [114]:
df['year'].fillna(2000)

0       1939.0
1       1986.0
2       1959.0
3       1987.0
4       1940.0
5       1941.0
6       1941.0
7       1989.0
8       1969.0
9       1956.0
10      1956.0
11      1940.0
12      1967.0
13      1940.0
14      1938.0
15      1943.0
16      1940.0
17      1994.0
18      1961.0
19      1986.0
20      1941.0
21      1976.0
22      1942.0
23      1965.0
24      1968.0
25      1980.0
26      1993.0
27      1960.0
28      1986.0
29      1971.0
         ...  
6866    1967.0
6867    1967.0
6868    1967.0
6869    1967.0
6870    1967.0
6871    1966.0
6872    1966.0
6873    1965.0
6874    1963.0
6875    1962.0
6876    1960.0
6877    1955.0
6878    1948.0
6879    1946.0
6880    1946.0
6881    1944.0
6882    1941.0
6883    1941.0
6884    1940.0
6885    1940.0
6886    1936.0
6887    2000.0
6888    2000.0
6889    2000.0
6890    2000.0
6891    2000.0
6892    2000.0
6893    2000.0
6894    2000.0
6895    2000.0
Name: year, Length: 6896, dtype: float64

## *Exercises*

1. What is the hair color of the first character that is of a gender or sexual minority?
1. When was the last neutral (align) gender or sexual minority character introduced?
1. What is the percentage of good GSM characters? What is the percentage of good characters in general?

In [123]:
# Keep the rows that have a `gsm` value, order them by year and take the
# first one.
df[df['gsm'].notnull()].sort_values('year').iloc[:1]

Unnamed: 0,page_id,name,urlslug,id,align,eye,hair,sex,gsm,alive,appearances,first appearance,year
886,19048,Mortimer Drake (New Earth),\/wiki\/Mortimer_Drake_(New_Earth),Secret,Bad,Brown,Brown,Male,Bisexual,True,32.0,"1943, November",1943.0


In [128]:
(
    df[(df['align'] == 'Neutral') & (df['gsm'].notnull())]
    # sort_values puts missing years last by default, so a character without
    # a year would be the one returned by tail(1); move them to the front.
    .sort_values('year', na_position='first')
    .tail(1)
)

Unnamed: 0,page_id,name,urlslug,id,align,eye,hair,sex,gsm,alive,appearances,first appearance,year
2247,128152,Liana Kerzner (New Earth),\/wiki\/Liana_Kerzner_(New_Earth),Public,Neutral,Blue,Red,Female,Homosexual,True,10.0,"2008, November",2008.0
