# Organising

In [1]:
import numpy as np
import pandas as pd

## The `pandas.DataFrame` object

In [2]:
# One record per state; the keyword names become the DataFrame's column labels.
data = [
    dict(state='California', area=423967, population=38332521),
    dict(state='Florida', area=170312, population=19552860),
    dict(state='Illinois', area=149995, population=12882135),
    dict(state='New York', area=141297, population=19651127),
    dict(state='Texas', area=695662, population=26448193),
]

# A list of records produces one row per record.
states = pd.DataFrame(data)
states

Unnamed: 0,state,area,population
0,California,423967,38332521
1,Florida,170312,19552860
2,Illinois,149995,12882135
3,New York,141297,19651127
4,Texas,695662,26448193


*Notes: [There are many ways to construct DataFrames](https://github.com/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/03.01-Introducing-Pandas-Objects.ipynb) (see "Constructing DataFrame objects"), and they can also be created by [reading data from files](https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html). In most cases each row should be either an independent sample (also known as [tidy format](http://vita.had.co.nz/papers/tidy-data.pdf)) or a timestamp.*

### Loading from file

[Comic characters dataset from fivethirtyeight](https://github.com/fivethirtyeight/data/tree/master/comic-characters).

In [3]:
df = pd.read_csv('data/dc-wikia-data.csv')

In [4]:
df

Unnamed: 0,page_id,name,urlslug,ID,ALIGN,EYE,HAIR,SEX,GSM,ALIVE,APPEARANCES,FIRST APPEARANCE,YEAR
0,1422,Batman (Bruce Wayne),\/wiki\/Batman_(Bruce_Wayne),Secret Identity,Good Characters,Blue Eyes,Black Hair,Male Characters,,Living Characters,3093.0,"1939, May",1939.0
1,23387,Superman (Clark Kent),\/wiki\/Superman_(Clark_Kent),Secret Identity,Good Characters,Blue Eyes,Black Hair,Male Characters,,Living Characters,2496.0,"1986, October",1986.0
2,1458,Green Lantern (Hal Jordan),\/wiki\/Green_Lantern_(Hal_Jordan),Secret Identity,Good Characters,Brown Eyes,Brown Hair,Male Characters,,Living Characters,1565.0,"1959, October",1959.0
3,1659,James Gordon (New Earth),\/wiki\/James_Gordon_(New_Earth),Public Identity,Good Characters,Brown Eyes,White Hair,Male Characters,,Living Characters,1316.0,"1987, February",1987.0
4,1576,Richard Grayson (New Earth),\/wiki\/Richard_Grayson_(New_Earth),Secret Identity,Good Characters,Blue Eyes,Black Hair,Male Characters,,Living Characters,1237.0,"1940, April",1940.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6891,66302,Nadine West (New Earth),\/wiki\/Nadine_West_(New_Earth),Public Identity,Good Characters,,,Female Characters,,Living Characters,,,
6892,283475,Warren Harding (New Earth),\/wiki\/Warren_Harding_(New_Earth),Public Identity,Good Characters,,,Male Characters,,Living Characters,,,
6893,283478,William Harrison (New Earth),\/wiki\/William_Harrison_(New_Earth),Public Identity,Good Characters,,,Male Characters,,Living Characters,,,
6894,283471,William McKinley (New Earth),\/wiki\/William_McKinley_(New_Earth),Public Identity,Good Characters,,,Male Characters,,Living Characters,,,


*Notes: [You can read and write files in many formats](https://pandas.pydata.org/pandas-docs/stable/io.html). `read_csv` (and its sibling readers) can also read directly from a URL.*

## Inspecting DataFrames

In [5]:
df.head(2)

Unnamed: 0,page_id,name,urlslug,ID,ALIGN,EYE,HAIR,SEX,GSM,ALIVE,APPEARANCES,FIRST APPEARANCE,YEAR
0,1422,Batman (Bruce Wayne),\/wiki\/Batman_(Bruce_Wayne),Secret Identity,Good Characters,Blue Eyes,Black Hair,Male Characters,,Living Characters,3093.0,"1939, May",1939.0
1,23387,Superman (Clark Kent),\/wiki\/Superman_(Clark_Kent),Secret Identity,Good Characters,Blue Eyes,Black Hair,Male Characters,,Living Characters,2496.0,"1986, October",1986.0


In [6]:
df.tail()

Unnamed: 0,page_id,name,urlslug,ID,ALIGN,EYE,HAIR,SEX,GSM,ALIVE,APPEARANCES,FIRST APPEARANCE,YEAR
6891,66302,Nadine West (New Earth),\/wiki\/Nadine_West_(New_Earth),Public Identity,Good Characters,,,Female Characters,,Living Characters,,,
6892,283475,Warren Harding (New Earth),\/wiki\/Warren_Harding_(New_Earth),Public Identity,Good Characters,,,Male Characters,,Living Characters,,,
6893,283478,William Harrison (New Earth),\/wiki\/William_Harrison_(New_Earth),Public Identity,Good Characters,,,Male Characters,,Living Characters,,,
6894,283471,William McKinley (New Earth),\/wiki\/William_McKinley_(New_Earth),Public Identity,Good Characters,,,Male Characters,,Living Characters,,,
6895,150660,Mookie (New Earth),\/wiki\/Mookie_(New_Earth),Public Identity,Bad Characters,Blue Eyes,Blond Hair,Male Characters,,Living Characters,,,


In [7]:
df.dtypes

page_id               int64
name                 object
urlslug              object
ID                   object
ALIGN                object
EYE                  object
HAIR                 object
SEX                  object
GSM                  object
ALIVE                object
APPEARANCES         float64
FIRST APPEARANCE     object
YEAR                float64
dtype: object

In [8]:
df.describe()

Unnamed: 0,page_id,APPEARANCES,YEAR
count,6896.0,6541.0,6827.0
mean,147441.209252,23.625134,1989.766662
std,108388.631149,87.378509,16.824194
min,1380.0,1.0,1935.0
25%,44105.5,2.0,1983.0
50%,141267.0,6.0,1992.0
75%,213203.0,15.0,2003.0
max,404010.0,3093.0,2013.0


In [9]:
len(df)

6896

In [10]:
df.shape

(6896, 13)

In [11]:
df.columns

Index(['page_id', 'name', 'urlslug', 'ID', 'ALIGN', 'EYE', 'HAIR', 'SEX',
       'GSM', 'ALIVE', 'APPEARANCES', 'FIRST APPEARANCE', 'YEAR'],
      dtype='object')

## Selecting

In [12]:
df['name']

0               Batman (Bruce Wayne)
1              Superman (Clark Kent)
2         Green Lantern (Hal Jordan)
3           James Gordon (New Earth)
4        Richard Grayson (New Earth)
                    ...             
6891         Nadine West (New Earth)
6892      Warren Harding (New Earth)
6893    William Harrison (New Earth)
6894    William McKinley (New Earth)
6895              Mookie (New Earth)
Name: name, Length: 6896, dtype: object

In [13]:
# Attribute access returns the same column as df['name'], but it only works
# when the column label is a valid Python identifier that does not collide with
# an existing DataFrame attribute or method (e.g. a column called 'count').
# Only the last expression in the cell is displayed.
df.name  # This is the same, but be careful with methods
df['name']

0               Batman (Bruce Wayne)
1              Superman (Clark Kent)
2         Green Lantern (Hal Jordan)
3           James Gordon (New Earth)
4        Richard Grayson (New Earth)
                    ...             
6891         Nadine West (New Earth)
6892      Warren Harding (New Earth)
6893    William Harrison (New Earth)
6894    William McKinley (New Earth)
6895              Mookie (New Earth)
Name: name, Length: 6896, dtype: object

In [14]:
df['ALIGN'].value_counts()  # Great for making sense of categorical columns

Bad Characters        2895
Good Characters       2832
Neutral Characters     565
Reformed Criminals       3
Name: ALIGN, dtype: int64

*Note: columns are `pandas.Series` objects.*

In [15]:
df[['name', 'YEAR']]

Unnamed: 0,name,YEAR
0,Batman (Bruce Wayne),1939.0
1,Superman (Clark Kent),1986.0
2,Green Lantern (Hal Jordan),1959.0
3,James Gordon (New Earth),1987.0
4,Richard Grayson (New Earth),1940.0
...,...,...
6891,Nadine West (New Earth),
6892,Warren Harding (New Earth),
6893,William Harrison (New Earth),
6894,William McKinley (New Earth),


Boolean indexing

In [16]:
df['ALIGN'] == 'Bad Characters'

0       False
1       False
2       False
3       False
4       False
        ...  
6891    False
6892    False
6893    False
6894    False
6895     True
Name: ALIGN, Length: 6896, dtype: bool

In [17]:
df[df['ALIGN'] == 'Bad Characters']

Unnamed: 0,page_id,name,urlslug,ID,ALIGN,EYE,HAIR,SEX,GSM,ALIVE,APPEARANCES,FIRST APPEARANCE,YEAR
19,1480,Alexander Luthor (New Earth),\/wiki\/Alexander_Luthor_(New_Earth),Public Identity,Bad Characters,Green Eyes,,Male Characters,,Living Characters,677.0,"1986, October",1986.0
30,1455,Joker (New Earth),\/wiki\/Joker_(New_Earth),Secret Identity,Bad Characters,Green Eyes,Green Hair,Male Characters,,Living Characters,517.0,"1940, June",1940.0
44,1514,Uxas (New Earth),\/wiki\/Uxas_(New_Earth),Public Identity,Bad Characters,Red Eyes,,Male Characters,,Deceased Characters,388.0,"1970, December",1970.0
66,1471,Harvey Dent (New Earth),\/wiki\/Harvey_Dent_(New_Earth),Public Identity,Bad Characters,Blue Eyes,Brown Hair,Male Characters,,Living Characters,306.0,"1942, August",1942.0
107,1675,Jonathan Crane (New Earth),\/wiki\/Jonathan_Crane_(New_Earth),Public Identity,Bad Characters,Blue Eyes,Brown Hair,Male Characters,,Living Characters,222.0,"1941, September",1941.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6874,128098,Bartholomew Magan (New Earth),\/wiki\/Bartholomew_Magan_(New_Earth),,Bad Characters,,,Male Characters,,Living Characters,,"1963, September",1963.0
6879,238641,Dmane (Earth-Two),\/wiki\/Dmane_(Earth-Two),,Bad Characters,Blue Eyes,,Male Characters,,Living Characters,,"1946, April",1946.0
6881,1624,Doris Zuel (New Earth),\/wiki\/Doris_Zuel_(New_Earth),Secret Identity,Bad Characters,Green Eyes,Red Hair,Female Characters,,Living Characters,,"1944, June",1944.0
6884,1473,Basil Karlo (New Earth),\/wiki\/Basil_Karlo_(New_Earth),Secret Identity,Bad Characters,Black Eyes,Black Hair,Male Characters,,Living Characters,,"1940, June",1940.0


In [18]:
df[df['YEAR'] > 2012]

Unnamed: 0,page_id,name,urlslug,ID,ALIGN,EYE,HAIR,SEX,GSM,ALIVE,APPEARANCES,FIRST APPEARANCE,YEAR
5539,85254,Springheeled Jack (Prime Earth),\/wiki\/Springheeled_Jack_(Prime_Earth),Secret Identity,Bad Characters,,,Male Characters,,Living Characters,1.0,"2013, October",2013.0


In [19]:
df[df['SEX'].isin(['Male Characters', 'Female Characters'])]

Unnamed: 0,page_id,name,urlslug,ID,ALIGN,EYE,HAIR,SEX,GSM,ALIVE,APPEARANCES,FIRST APPEARANCE,YEAR
0,1422,Batman (Bruce Wayne),\/wiki\/Batman_(Bruce_Wayne),Secret Identity,Good Characters,Blue Eyes,Black Hair,Male Characters,,Living Characters,3093.0,"1939, May",1939.0
1,23387,Superman (Clark Kent),\/wiki\/Superman_(Clark_Kent),Secret Identity,Good Characters,Blue Eyes,Black Hair,Male Characters,,Living Characters,2496.0,"1986, October",1986.0
2,1458,Green Lantern (Hal Jordan),\/wiki\/Green_Lantern_(Hal_Jordan),Secret Identity,Good Characters,Brown Eyes,Brown Hair,Male Characters,,Living Characters,1565.0,"1959, October",1959.0
3,1659,James Gordon (New Earth),\/wiki\/James_Gordon_(New_Earth),Public Identity,Good Characters,Brown Eyes,White Hair,Male Characters,,Living Characters,1316.0,"1987, February",1987.0
4,1576,Richard Grayson (New Earth),\/wiki\/Richard_Grayson_(New_Earth),Secret Identity,Good Characters,Blue Eyes,Black Hair,Male Characters,,Living Characters,1237.0,"1940, April",1940.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6891,66302,Nadine West (New Earth),\/wiki\/Nadine_West_(New_Earth),Public Identity,Good Characters,,,Female Characters,,Living Characters,,,
6892,283475,Warren Harding (New Earth),\/wiki\/Warren_Harding_(New_Earth),Public Identity,Good Characters,,,Male Characters,,Living Characters,,,
6893,283478,William Harrison (New Earth),\/wiki\/William_Harrison_(New_Earth),Public Identity,Good Characters,,,Male Characters,,Living Characters,,,
6894,283471,William McKinley (New Earth),\/wiki\/William_McKinley_(New_Earth),Public Identity,Good Characters,,,Male Characters,,Living Characters,,,


In [20]:
df.tail(10)['ALIGN'].isin(['Bad Characters', 'Good Characters'])

6886    False
6887     True
6888     True
6889     True
6890     True
6891     True
6892     True
6893     True
6894     True
6895     True
Name: ALIGN, dtype: bool

In [21]:
# Combine two boolean masks element-wise. Each comparison is wrapped in
# parentheses because `&` has higher precedence than `>` and `==`.
df[(df['YEAR'] > 2011) & (df['ALIGN'] == 'Bad Characters')]

Unnamed: 0,page_id,name,urlslug,ID,ALIGN,EYE,HAIR,SEX,GSM,ALIVE,APPEARANCES,FIRST APPEARANCE,YEAR
4324,379078,Ragnar (Green Lantern Animated Series),\/wiki\/Ragnar_(Green_Lantern_Animated_Series),Public Identity,Bad Characters,,,Male Characters,,Living Characters,3.0,"2012, March",2012.0
5539,85254,Springheeled Jack (Prime Earth),\/wiki\/Springheeled_Jack_(Prime_Earth),Secret Identity,Bad Characters,,,Male Characters,,Living Characters,1.0,"2013, October",2013.0
5540,309868,Napalm (Prime Earth),\/wiki\/Napalm_(Prime_Earth),Secret Identity,Bad Characters,,,Male Characters,,Deceased Characters,1.0,"2012, June",2012.0
6541,306472,Matteo Bischoff (New Earth),\/wiki\/Matteo_Bischoff_(New_Earth),Secret Identity,Bad Characters,,Grey Hair,Male Characters,,Living Characters,,"2012, May",2012.0


*Note: this is the only place you should really use the bitwise `&` and `|` operators.*

## DataFrame operations

In [22]:
states

Unnamed: 0,state,area,population
0,California,423967,38332521
1,Florida,170312,19552860
2,Illinois,149995,12882135
3,New York,141297,19651127
4,Texas,695662,26448193


In [23]:
states['population'] / 1000000

0    38.332521
1    19.552860
2    12.882135
3    19.651127
4    26.448193
Name: population, dtype: float64

In [24]:
states['population'] / states['area']

0     90.413926
1    114.806121
2     85.883763
3    139.076746
4     38.018740
dtype: float64

For common functions beyond simple math operators (e.g. log, sin, etc.) we use [numpy ufuncs](https://docs.scipy.org/doc/numpy-1.13.0/reference/ufuncs.html#available-ufuncs).

In [25]:
np.log(states['area'])

0    12.957411
1    12.045387
2    11.918357
3    11.858619
4    13.452619
Name: area, dtype: float64

Applying a custom function is the slowest option, but it is handy.

In [26]:
def is_over_populated(row, max_population=30000000, max_density=100):
    """Tell whether a state counts as over-populated.

    A row is over-populated when its population exceeds ``max_population``
    or when its density (population divided by area) exceeds ``max_density``.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the ``'population'`` and ``'area'`` entries.
    max_population : number, optional
        Absolute population threshold (default 30000000).
    max_density : number, optional
        Density threshold, in population per unit of area (default 100).

    Returns
    -------
    bool
    """
    # The absolute threshold is checked first, so the density is only computed
    # when it is actually needed.
    if row['population'] > max_population:
        return True
    density = row['population'] / row['area']
    # bool() keeps the return type a plain Python bool even for numpy scalars.
    return bool(density > max_density)

In [27]:
# axis='columns' calls the function once per row, passing that row as a Series.
states.apply(is_over_populated, axis='columns')

0     True
1     True
2    False
3     True
4    False
dtype: bool

In [28]:
# Assigning to a new column label adds that column to `states` itself.
states['density'] = states['population'] / states['area']
states

Unnamed: 0,state,area,population,density
0,California,423967,38332521,90.413926
1,Florida,170312,19552860,114.806121
2,Illinois,149995,12882135,85.883763
3,New York,141297,19651127,139.076746
4,Texas,695662,26448193,38.01874


In [29]:
# A scalar on the right-hand side is repeated for every row of the new column.
states['debt'] = 16.5
states

Unnamed: 0,state,area,population,density,debt
0,California,423967,38332521,90.413926,16.5
1,Florida,170312,19552860,114.806121,16.5
2,Illinois,149995,12882135,85.883763,16.5
3,New York,141297,19651127,139.076746,16.5
4,Texas,695662,26448193,38.01874,16.5


In [30]:
states.sort_values('density', ascending=False)

Unnamed: 0,state,area,population,density,debt
3,New York,141297,19651127,139.076746,16.5
1,Florida,170312,19552860,114.806121,16.5
0,California,423967,38332521,90.413926,16.5
2,Illinois,149995,12882135,85.883763,16.5
4,Texas,695662,26448193,38.01874,16.5


In [31]:
states.rename(columns={'population': 'pop'})

Unnamed: 0,state,area,pop,density,debt
0,California,423967,38332521,90.413926,16.5
1,Florida,170312,19552860,114.806121,16.5
2,Illinois,149995,12882135,85.883763,16.5
3,New York,141297,19651127,139.076746,16.5
4,Texas,695662,26448193,38.01874,16.5


In [32]:
states = states.rename(columns=str.upper)
states

Unnamed: 0,STATE,AREA,POPULATION,DENSITY,DEBT
0,California,423967,38332521,90.413926,16.5
1,Florida,170312,19552860,114.806121,16.5
2,Illinois,149995,12882135,85.883763,16.5
3,New York,141297,19651127,139.076746,16.5
4,Texas,695662,26448193,38.01874,16.5


In [33]:
states

Unnamed: 0,STATE,AREA,POPULATION,DENSITY,DEBT
0,California,423967,38332521,90.413926,16.5
1,Florida,170312,19552860,114.806121,16.5
2,Illinois,149995,12882135,85.883763,16.5
3,New York,141297,19651127,139.076746,16.5
4,Texas,695662,26448193,38.01874,16.5


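*Note: methods such as `sort_values` and `rename` return new DataFrames; they don't change the original in place. Assigning to a column (e.g. `states['debt'] = 16.5`), on the other hand, does modify the DataFrame.*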

## *Exercise: Let's clean our comics dataset!*

In [34]:
def first_word(text):
    """Return the first whitespace-separated word of ``text``.

    Values that are not strings (such as the NaN used for missing data) are
    returned unchanged. Strings that contain no words (empty or
    whitespace-only) are also returned unchanged instead of raising an
    IndexError.
    """
    if isinstance(text, str):  # Because we can't split NaN
        words = text.split()
        if words:  # ''.split() is an empty list, so there is no words[0]
            return words[0]
    return text

In [35]:
df['GSM'].value_counts()

Homosexual Characters    54
Bisexual Characters      10
Name: GSM, dtype: int64

In [36]:
df['SEX'].apply(first_word)

0         Male
1         Male
2         Male
3         Male
4         Male
         ...  
6891    Female
6892      Male
6893      Male
6894      Male
6895      Male
Name: SEX, Length: 6896, dtype: object

1. Apply the `first_word` function to the columns: `['ID', 'ALIGN', 'EYE', 'HAIR', 'SEX', 'GSM', 'ALIVE']`. Set the result into the same column.
1. Rename the columns to lower case letters. Hint: use the `str.lower` function.
1. *If you have time*: change the `alive` column value to be boolean.
1. Save it to a new file with `to_csv`.

In [37]:
# Start again from the raw file so that this cell can be re-run on its own.
df = pd.read_csv('data/dc-wikia-data.csv')

# Apply first_word
# Keeps only the leading word of each label, e.g. 'Male Characters' -> 'Male'.
for column in ['ID', 'ALIGN', 'EYE', 'HAIR', 'SEX', 'GSM', 'ALIVE']:
    df[column] = df[column].apply(first_word)

# Rename columns to lower case letters
df = df.rename(columns=str.lower)

# Change the alive column to boolean
# Note: a missing value never equals 'Living', so it becomes False here.
df['alive'] = (df['alive'] == 'Living')

# Save to a new csv file
# index=False keeps the row index out of the file; otherwise it is written as
# an unnamed first column that comes back as 'Unnamed: 0' when the file is read.
df.to_csv('clean.csv', index=False)

## Handling missing data

In [43]:
df = pd.read_csv('data/dc-wikia-data-clean.csv')

Checks

In [44]:
df['year'].isnull()

0       False
1       False
2       False
3       False
4       False
        ...  
6891     True
6892     True
6893     True
6894     True
6895     True
Name: year, Length: 6896, dtype: bool

In [45]:
df['gsm'].notnull()

0       False
1       False
2       False
3       False
4       False
        ...  
6891    False
6892    False
6893    False
6894    False
6895    False
Name: gsm, Length: 6896, dtype: bool

Filtering out

In [46]:
# Keep only the rows whose 'gsm' value is present; df itself is left unchanged.
df.dropna(subset=['gsm'])

Unnamed: 0,page_id,name,urlslug,id,align,eye,hair,sex,gsm,alive,appearances,first appearance,year
48,1704,John Constantine (New Earth),\/wiki\/John_Constantine_(New_Earth),Public,Good,Blue,Blond,Male,Bisexual,True,371.0,"1984, June",1984.0
65,8856,Renee Montoya (New Earth),\/wiki\/Renee_Montoya_(New_Earth),Secret,Good,Brown,Black,Female,Homosexual,True,308.0,"1992, March",1992.0
119,1862,Todd Rice (New Earth),\/wiki\/Todd_Rice_(New_Earth),Public,Good,Brown,Brown,Male,Homosexual,True,208.0,"1983, September",1983.0
173,1597,Hartley Rathaway (New Earth),\/wiki\/Hartley_Rathaway_(New_Earth),Secret,Good,Blue,Red,Male,Homosexual,True,160.0,"1959, May",1959.0
253,10139,Holly Robinson (New Earth),\/wiki\/Holly_Robinson_(New_Earth),Secret,Good,Blue,Red,Female,Homosexual,True,114.0,"1987, February",1987.0
292,1614,Grace Choi (New Earth),\/wiki\/Grace_Choi_(New_Earth),Secret,Good,Brown,Red,Female,Bisexual,True,100.0,"2003, August",2003.0
315,1618,Anissa Pierce (New Earth),\/wiki\/Anissa_Pierce_(New_Earth),Secret,Good,Brown,Black,Female,Homosexual,True,92.0,"2003, August",2003.0
339,1689,Mikaal Tomas (New Earth),\/wiki\/Mikaal_Tomas_(New_Earth),Secret,Good,Blue,Purple,Male,Homosexual,True,87.0,"1976, March",1976.0
348,56388,Mallah (New Earth),\/wiki\/Mallah_(New_Earth),Secret,Bad,Brown,Brown,Male,Homosexual,False,85.0,"1964, March",1964.0
409,2655,Hugh Dawkins (New Earth),\/wiki\/Hugh_Dawkins_(New_Earth),Secret,Good,White,Blond,Male,Homosexual,True,74.0,"1986, November",1986.0


Filling empty values

In [42]:
# Returns a new Series with missing years replaced by 2000; df is not modified.
df['year'].fillna(2000)

0       1939.0
1       1986.0
2       1959.0
3       1987.0
4       1940.0
         ...  
6891    2000.0
6892    2000.0
6893    2000.0
6894    2000.0
6895    2000.0
Name: year, Length: 6896, dtype: float64

## *Exercises*

1. What is the hair color of the first character that is of a gender or sexual minority?
1. When was the last neutral gender or sexual minority character introduced?
1. What is the percentage of good characters of gender or sexual minority? What is the percentage of good characters in general?