In [1]:
from bs4 import BeautifulSoup
import pandas as pd
from sklearn import linear_model
import sklearn as sk
import sklearn.tree as tree
import statsmodels.api as sm
import pydotplus
import matplotlib
import matplotlib.pyplot as plt
import requests
import re
import numpy as np

%matplotlib inline

In [2]:
# Episode lookup table; later cells join against its episode_name column to
# obtain episode and season numbers.
df_episodes = pd.read_csv('../data/episodes.csv')

# Wiki page that lists every TV-series character; the per-character page URLs
# are scraped from it.
character_list_url = 'http://walkingdead.wikia.com/wiki/TV_Series_Characters'
episode_list_url = ''  # placeholder; not used by any cell shown in this notebook
base_url = 'http://walkingdead.wikia.com'

# Fail fast: the timeout keeps the notebook from hanging on a dead connection,
# and raise_for_status() keeps an HTTP error page from being parsed as if it
# were the character list (which would silently produce zero characters).
r = requests.get(character_list_url, timeout=30)
r.raise_for_status()
s = BeautifulSoup(r.text, 'lxml')

# Infobox fields extracted for every character, in output-column order.
attribute_list = ['gender', 'hair', 'age', 'ethnicity', 'status', 'series lifespan']
# Character pages that are incomplete or deleted; excluded from the scrape.
drop_list = ['/wiki/Tim_(TV_Series)', '/wiki/Big_Tony_(TV_Series)']
# Characters whose status is forced to 'alive' and for whom predictions are made.
character_list = ['Glenn Rhee', 'Maggie Greene', 'Daryl Dixon', 'Rick Grimes', 'Abraham Ford',
                  'Carl Grimes', 'Michonne', 'Rosita Espinosa', 'Sasha Williams', 'Aaron', 'Eugene Porter']

In [3]:
# Every character thumbnail on the list page is a link to that character's page.
chars = s.findAll(class_='image image-thumbnail link-internal')

# Keep each link target except the generic "Characters" index page.
characters_urls = [
    thumb.get('href')
    for thumb in chars
    if thumb.get('href').replace('/wiki/', '') != 'Characters'
]

# Discard the pages known to be incomplete or deleted.
characters_urls = [url for url in characters_urls if url not in drop_list]

In [4]:
# Single-page trial: fetch only the first character page and reformat the
# text of its infobox.
i = 0
new_row = []

r = requests.get(base_url + characters_urls[i])
s = BeautifulSoup(r.text, 'lxml')

# The infobox is the first element whose class matches 'infobox.*'.
infobox = s.find(class_=re.compile('infobox.*'))
if infobox is not None:
    text = infobox.get_text()

    # Rewrite every "label<newline>" as "<newline>label: " so each attribute
    # label is followed by its value on the same line.
    for a in attribute_list:
        text = text.replace('{}\n'.format(a), '\n{}: '.format(a))

In [5]:
# Scrape one row per character:
#   [name, <one value per attribute_list entry>, start_episode, end_episode]
res_list = []

# Position of the raw 'series lifespan' text inside a row: slot 0 holds the
# name and the attributes follow in attribute_list order.
lifespan_idx = 1 + attribute_list.index('series lifespan')

# goes through the characters
for character_url in characters_urls:
    print(character_url)

    new_row = []

    # goes to that character's page; the timeout keeps a single stalled
    # request from hanging the whole scrape
    r = requests.get(base_url + character_url, timeout=30)
    s = BeautifulSoup(r.text, 'lxml')

    # grabs the info box from the page; a page without one has nothing to
    # extract, so it is reported and skipped instead of raising AttributeError
    infobox = s.find(class_=re.compile('infobox.*'))
    if infobox is None:
        print('  skipped: no infobox found')
        continue
    text = infobox.get_text()
    text_lower = text.lower()

    # gets the name (second line of the infobox text, original casing kept)
    new_row.append(text.split('\n')[1].strip())

    # gets each of the attributes: the value is the line that follows the
    # lower-cased attribute label; 'NA' marks an attribute that is absent
    for a in attribute_list:
        result = re.search(re.escape(a.lower()) + '\n(.*)\n', text_lower)
        if result is None:
            new_row.append('NA')
        else:
            new_row.append(result.group(1).strip())

    # gets the first and last episode they were in; a lifespan without
    # '" to ' is used as both the first and the last episode
    lifespan = new_row[lifespan_idx]
    if '" to ' in lifespan:
        series_lifespan = lifespan.split('" to ')
        new_row.append(series_lifespan[0].replace('"', ''))
        new_row.append(series_lifespan[1].replace('"', '').strip())
    else:
        only_episode = lifespan.replace('"', '')
        new_row.append(only_episode)
        new_row.append(only_episode)

    # appends the data to the results list
    res_list.append(new_row)

/wiki/Rick_Grimes_(TV_Series)
/wiki/Carl_Grimes_(TV_Series)
/wiki/Judith_Grimes_(TV_Series)
/wiki/Lori_Grimes_(TV_Series)
/wiki/Lambert_Kendal_(TV_Series)
/wiki/Shane_Walsh_(TV_Series)
/wiki/Leon_Basset_(TV_Series)
/wiki/Morgan_Jones_(TV_Series)
/wiki/Jenny_Jones_(TV_Series)
/wiki/Duane_Jones_(TV_Series)
/wiki/Paula_(TV_Series)
/wiki/Glenn_Rhee_(TV_Series)
/wiki/Theodore_Douglas_(TV_Series)
/wiki/Dale_Horvath_(TV_Series)
/wiki/Jacqui_(TV_Series)
/wiki/Jim_(TV_Series)
/wiki/Carol_Peletier_(TV_Series)
/wiki/Sophia_Peletier_(TV_Series)
/wiki/Ed_Peletier_(TV_Series)
/wiki/Daryl_Dixon_(TV_Series)
/wiki/Merle_Dixon_(TV_Series)
/wiki/Andrea_(TV_Series)
/wiki/Amy_(TV_Series)
/wiki/Morales_(TV_Series)
/wiki/Miranda_Morales_(TV_Series)
/wiki/Louis_Morales_(TV_Series)
/wiki/Eliza_Morales_(TV_Series)
/wiki/Guillermo_(TV_Series)
/wiki/Jorge_(TV_Series)
/wiki/Mr._Gilbert_(TV_Series)
/wiki/Abuela_(TV_Series)
/wiki/Felipe_(TV_Series)
/wiki/Miguel_(TV_Series)
/wiki/Vi_(TV_Series)
/wiki/Edwin_Jenner_(TV

In [6]:
# One row per scraped character: name, the infobox attributes, then the
# first and last episode parsed from the lifespan text.
df = pd.DataFrame(res_list, columns=['name'] + attribute_list + ['start_episode', 'end_episode'])

# Corrects the status of the characters we want predictions for: the wiki
# lists some of them as alive and others as unknown, so all are set to alive.
# (.loc replaces the removed .ix indexer; the boolean-mask semantics are the same.)
df.loc[df['name'].isin(character_list), 'status'] = 'alive'

In [7]:
# remove characters whose status is unknown
print(df.shape)
df = df.loc[df['status'] != 'unknown', :]
print(df.shape)

# remove characters without a usable age; .copy() makes the result an
# independent frame so the assignments below do not write into a slice
print(df.shape)
df = df.loc[~df['age'].isin(['unknown', 'unspecified', 'NA']), :].copy()
print(df.shape)

# some custom changes that are hard to correct automatically: exact raw age
# strings mapped to the value to use instead
age_overrides = {
    '12 (season 2)  13 (season 3)[1]  14 (season 4)': '10',
    'one week ("the suicide king")8 months (season 5)9 months/11 months (season 6)': '0',
    '16 (season 2)17 (season 3)18 (season 5)': '10',
    '36 (season 1)37 (season 3)': '30',
    '7-10': '10',
    '8-10': '10',
    'possibly 5-10': '10',
    '22  (season 2)23 (season 3)24 (season 4)': '20',
}
for raw_age, fixed_age in age_overrides.items():
    df.loc[df['age'] == raw_age, 'age'] = fixed_age

# keeps just the digits; regex=True is explicit because the pattern is a
# character class and newer pandas treats the pattern as a literal by default
df['age'] = df['age'].str.replace('[^0-9]', '', regex=True)

# simplifies multiple ages to just a single age: a four-digit string is
# reduced to its last two digits
four_digits = df['age'].str.len() == 4
df.loc[four_digits, 'age'] = df.loc[four_digits, 'age'].str[2:]

# converts the ages to numbers and rounds them to the nearest ten
# (round(-1) rounds to the nearest multiple of ten; it does not round down)
df = df.loc[df['age'] != '', :].copy()
df['age'] = df['age'].astype(int)
df['age'] = df['age'].round(-1)
print(df['age'].unique())

df.head()

(268, 9)
(242, 9)
(242, 9)
(234, 9)
[40 10  0 30 20 60 50 70 80]


Unnamed: 0,name,gender,hair,age,ethnicity,status,series lifespan,start_episode,end_episode
0,Rick Grimes,male,dark brown (greying),40,caucasian-american,alive,"""days gone bye"" to present",days gone bye,present
1,Carl Grimes,male,brown,10,caucasian-american,alive,"""days gone bye"" to present",days gone bye,present
2,Judith Grimes,female,honey blonde,0,caucasian-american,alive,"""killer within"" to present",killer within,present
3,Lori Grimes,female,brown,30,caucasian-american,dead,"""days gone bye"" to ""killer within""",days gone bye,killer within
5,Shane Walsh,male,black,30,caucasian-american,dead,"""days gone bye"" to ""better angels""",days gone bye,better angels


In [8]:
# calculates their series age (number of episodes between first and last appearance)
df['series_age'] = 0

# episode name -> episode number lookup used by both merges below
episode_numbers = df_episodes.loc[:, ['episode_number', 'episode_name']]

# merges in the start episode number
df_full = df.merge(episode_numbers, left_on='start_episode', right_on='episode_name', how='left')
df_full = df_full.drop('episode_name', axis=1)

# merges in the end episode number; both frames now carry 'episode_number',
# so pandas suffixes them: _x is the start episode, _y is the end episode
df_full = df_full.merge(episode_numbers, left_on='end_episode', right_on='episode_name', how='left')
df_full = df_full.drop('episode_name', axis=1)

# takes care of characters that are still alive: an end episode with no match
# is replaced by the latest episode number
# NOTE(review): this also fills rows whose end_episode text simply failed to
# match an episode name, not only those still in the show - confirm that is intended
df_full.loc[df_full['episode_number_y'].isnull(), 'episode_number_y'] = df_episodes['episode_number'].max()

# calculates the ages, we add a 1 so that if they lived 1 episode then they have an age of 1
df_full['series_age'] = df_full['episode_number_y'] - df_full['episode_number_x'] + 1

# merges in the season they started in (this merge's episode_name column is kept)
df_full = df_full.merge(df_episodes.loc[:, ['season_number', 'episode_name']],
                        left_on='start_episode', right_on='episode_name', how='left')
df_full = df_full.rename(columns={'season_number': 'season_birth'})

# drops the helper episode-number columns
df_full = df_full.drop(['episode_number_x', 'episode_number_y'], axis=1)

# drops the na's because we don't have clear data on how long they lived
print(df_full.shape)
df_full = df_full.loc[df_full['series_age'].notnull(), :]
print(df_full.shape)

(227, 12)
(157, 12)


In [9]:
# simplifies the ethnicity: raw wiki label -> coarse group used by the model;
# labels not listed here are left unchanged
ethnicity_groups = {
    'african-american': 'black',
    'hispanic-american': 'hispanic',
    'indian-american': 'indian',
    'asian-american': 'asian',
    'korean-american': 'asian',
    'caucasian-american': 'white',
    'irish-american': 'white',
    'caucasian': 'white',
    'italian-american': 'white',
    'caucasian-irish': 'white',
}
for raw_label, group in ethnicity_groups.items():
    df_full.loc[df_full['ethnicity'] == raw_label, 'ethnicity'] = group

# drops the rows where no ethnicity was found ('NA' is the scraper's missing marker)
df_full = df_full.loc[df_full['ethnicity'] != 'NA', :]
print(df_full['ethnicity'].unique())
df_full.head()

['white' 'black' 'asian' 'hispanic' 'indian']


Unnamed: 0,name,gender,hair,age,ethnicity,status,series lifespan,start_episode,end_episode,series_age,season_birth,episode_name
0,Rick Grimes,male,dark brown (greying),40,white,alive,"""days gone bye"" to present",days gone bye,present,83.0,1.0,days gone bye
1,Carl Grimes,male,brown,10,white,alive,"""days gone bye"" to present",days gone bye,present,83.0,1.0,days gone bye
2,Judith Grimes,female,honey blonde,0,white,alive,"""killer within"" to present",killer within,present,61.0,3.0,killer within
3,Lori Grimes,female,brown,30,white,dead,"""days gone bye"" to ""killer within""",days gone bye,killer within,23.0,1.0,days gone bye
4,Shane Walsh,male,black,30,white,dead,"""days gone bye"" to ""better angels""",days gone bye,better angels,18.0,1.0,days gone bye


# Modeling

In [10]:
# feature columns fed to the model
x_variables = ['male', 'female', 'age', 'asian', 'black', 'hispanic', 'white', 'season_birth']

# keeps only the columns needed for modeling, with a fresh 0..n-1 index
df_model = df_full.loc[:, ['name', 'gender', 'age', 'ethnicity', 'series_age', 'season_birth', 'status']].reset_index(drop=True)

# some manual one hot encoding
## gender: 1 when the row has that gender, otherwise 0
df_model['male'] = (df_model['gender'] == 'male').astype('int64')
df_model['female'] = (df_model['gender'] == 'female').astype('int64')

## ethnicity: one 0/1 indicator column per distinct value, named after the value
for e in df_model['ethnicity'].unique():
    df_model[e] = (df_model['ethnicity'] == e).astype('int64')

# season_birth is deliberately left as a single numeric column (not one hot encoded)

In [11]:
# splits the data into the characters we want to predict on and the training rows
# (characters whose status is 'dead' or 'undead').
# The former `drop('status', 1)` calls were removed: their result was never
# assigned, so they had no effect, and the positional axis argument raises on
# current pandas. The status column can stay because the features are selected
# explicitly through x_variables.
df_model_characters = df_model.loc[df_model['name'].isin(character_list), :]
df_model = df_model.loc[df_model['status'].isin(['dead', 'undead']), :]

# splits the training data into x (features) and y (target, kept as a one-column frame)
df_x = df_model.loc[:, x_variables]
df_y = df_model.loc[:, ['series_age']]

In [12]:
# features and actual series age of the characters we predict on
x_characters = df_model_characters.loc[:, x_variables]
y_characters = df_model_characters.loc[:, ['series_age']]

# data frame for storing predictions: starts with each character's name and
# actual series age, keeping the original row index
df_predictions = df_model_characters.loc[:, ['name']].copy()
df_predictions['series_age'] = y_characters['series_age']

In [13]:
# tree: regression tree predicting series_age from the x_variables features.
# random_state is fixed so that re-running the notebook gives the same tree
# and therefore the same predictions.
tree_model = tree.DecisionTreeRegressor(min_samples_leaf=3, random_state=0)
tree_model.fit(df_x, df_y)

# predicted series age for each character, and its difference from the
# actual value (prediction minus actual)
y_hat = tree_model.predict(x_characters)
df_predictions['tree_y_hat'] = y_hat
df_predictions['tree_y_hat_diff'] = df_predictions['tree_y_hat'] - df_predictions['series_age']

# shows at most one row per character in character_list
df_predictions.head(len(character_list))

Unnamed: 0,name,series_age,tree_y_hat,tree_y_hat_diff
0,Rick Grimes,83.0,3.25,-79.75
1,Carl Grimes,83.0,19.333333,-63.666667
7,Glenn Rhee,83.0,7.0,-76.0
15,Daryl Dixon,81.0,3.25,-77.75
20,Maggie Greene,76.0,5.0,-71.0
27,Michonne,65.0,13.5,-51.5
49,Sasha Williams,57.0,13.5,-43.5
70,Eugene Porter,39.0,3.666667,-35.333333
71,Rosita Espinosa,39.0,7.2,-31.8
72,Abraham Ford,39.0,3.714286,-35.285714


In [14]:
# saves a pdf of the tree
# io.StringIO replaces sklearn.externals.six.StringIO, which is no longer
# shipped with scikit-learn
from io import StringIO

# writes the fitted tree as Graphviz dot source into an in-memory buffer
dot_data = StringIO()
tree.export_graphviz(tree_model,
                     feature_names=list(df_x.columns),  # plain list of column names
                     filled=True,
                     rounded=True,
                     out_file=dot_data)

# renders the dot source to a pdf file
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_pdf('../viz/tree_model.pdf')

True