## Pandas and NumPy

Awesome libraries that help us work with tabular data

### Loading Data into a DataFrame

In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv('data/DoctorWhoEpisodes.csv')

df.head()

Unnamed: 0,episode_id,episodenbr,title,weekday,broadcasthour,duration,views,share,AI,chart,...,crew,summary,date,air_date,doctorid,number,rating,votes,description,season
0,1.01,697,Rose,Sat,7:00pm,00:44:14,10.81m,44.8%,76.0,7,...,"[{""role"":""Writer"",""name"":""Russell T Davies""},{...",,2005-03-26,"26 Mar, 2005",9,1,7.6,6504,When ordinary shop-worker Rose Tyler meets a m...,1
1,1.02,698,The End of the World,Sat,6:59pm,00:44:45,7.97m,37.8%,76.0,19,...,"[{""role"":""Writer"",""name"":""Russell T Davies""},{...",,2005-04-02,"2 Apr, 2005",9,2,7.6,5684,The Doctor takes Rose to the year 5 billion to...,1
2,1.03,699,The Unquiet Dead,Sat,7:00pm,00:44:50,8.86m,37.8%,80.0,15,...,"[{""role"":""Writer"",""name"":""Mark Gatiss""},{""role...",,2005-04-09,"9 Apr, 2005",9,3,7.6,5326,The Doctor has great expectations for his late...,1
3,1.04,700,Aliens of London,Sat,7:00pm,00:45:05,7.63m,35.7%,82.0,18,...,"[{""role"":""Writer"",""name"":""Russell T Davies""},{...",,2005-04-16,"16 Apr, 2005",9,4,7.0,5116,The Doctor returns Rose to her own time - well...,1
4,1.05,701,World War Three,Sat,7:01pm,00:40:40,7.98m,40.2%,81.0,20,...,"[{""role"":""Writer"",""name"":""Russell T Davies""},{...",,2005-04-23,"23 Apr, 2005",9,5,7.1,4943,The Slitheen have infiltrated Parliament and h...,1


In [2]:
df.columns

Index(['episode_id', 'episodenbr', 'title', 'weekday', 'broadcasthour',
       'duration', 'views', 'share', 'AI', 'chart', 'cast', 'crew', 'summary',
       'date', 'air_date', 'doctorid', 'number', 'rating', 'votes',
       'description', 'season'],
      dtype='object')

In [3]:
df.tail(1)

Unnamed: 0,episode_id,episodenbr,title,weekday,broadcasthour,duration,views,share,AI,chart,...,crew,summary,date,air_date,doctorid,number,rating,votes,description,season
145,11.11,851,Resolution,Tue,7:00pm,01:00:00,7.13m,26.6%,80.0,14,...,"[{""role"":""Writer"",""name"":""Chris Chibnall""},{""r...",,2019-01-01,1 Jan 2019,13,11,6.0,2690,"As the New Year begins, a terrifying evil is s...",11


In [6]:
df.sample(2)

Unnamed: 0,episode_id,episodenbr,title,weekday,broadcasthour,duration,views,share,AI,chart,...,crew,summary,date,air_date,doctorid,number,rating,votes,description,season
117,9.09,822,Sleep No More,Sat,8:14pm,00:45:07,5.61m,22.6%,78.0,28,...,"[{""role"":""Writer"",""name"":""Mark Gatiss""},{""role...",This terrifying story is assembled from footag...,2015-11-14,15 Nov 2015,12,9,6.0,3866,Vision recovered from the wreckage of Le Verri...,9
110,9.02,815,The Witch's Familiar,Sat,7:45pm,00:47:50,5.71m,22.9%,83.0,24,...,"[{""role"":""Writer"",""name"":""Steven Moffat""},{""ro...",Trapped and alone on the terrifying planet Ska...,2015-09-26,2015-09-26,12,2,8.5,4109,"Trapped and alone in a terrifying Dalek city, ...",9


### Descriptive Statistics

In [7]:
df.describe()

Unnamed: 0,episode_id,episodenbr,AI,chart,doctorid,number,rating,votes,season
count,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0
mean,5.93589,773.616438,84.5,14.39726,10.945205,6.60274,7.911644,4909.821918,5.869863
std,3.111132,45.569236,2.797166,6.666933,1.10035,3.885392,0.943984,1487.570969,3.112298
min,1.01,697.0,76.0,1.0,9.0,0.0,5.2,2494.0,1.0
25%,3.0925,733.25,83.0,9.25,10.0,3.0,7.3,4127.0,3.0
50%,6.045,774.5,85.0,14.0,11.0,7.0,7.9,4750.0,6.0
75%,8.7875,813.5,86.0,19.0,12.0,10.0,8.7,5405.75,8.75
max,11.11,851.0,91.0,30.0,13.0,14.0,9.8,16398.0,11.0


In [8]:
df.quantile(0.42)

episode_id       5.069
episodenbr     762.900
AI              84.000
chart           13.000
doctorid        11.000
number           5.900
rating           7.700
votes         4635.500
season           5.000
Name: 0.42, dtype: float64

In [10]:
df['doctorid'].unique()

array([ 9, 10, 11, 12, 13], dtype=int64)

### Renaming Columns

In [12]:
df = df.rename(columns={'AI': 'appreciation'})
df.columns

Index(['episode_id', 'episodenbr', 'title', 'weekday', 'broadcasthour',
       'duration', 'views', 'share', 'appreciation', 'chart', 'cast', 'crew',
       'summary', 'date', 'air_date', 'doctorid', 'number', 'rating', 'votes',
       'description', 'season'],
      dtype='object')

### Dropping Columns

In [13]:
df.columns

Index(['episode_id', 'episodenbr', 'title', 'weekday', 'broadcasthour',
       'duration', 'views', 'share', 'appreciation', 'chart', 'cast', 'crew',
       'summary', 'date', 'air_date', 'doctorid', 'number', 'rating', 'votes',
       'description', 'season'],
      dtype='object')

In [14]:
df = df.drop(columns=['cast', 'crew', 'air_date', 'number', 'duration', 'broadcasthour'])
df.head(1)

Unnamed: 0,episode_id,episodenbr,title,weekday,views,share,appreciation,chart,summary,date,doctorid,rating,votes,description,season
0,1.01,697,Rose,Sat,10.81m,44.8%,76.0,7,,2005-03-26,9,7.6,6504,When ordinary shop-worker Rose Tyler meets a m...,1


### Cleaning Data

In [15]:
# Remove % from Share
df['share'] = df['share'].str.replace('%','')

# Remove m from viewership
df['views'] = df['views'].str.replace('m', '')

df.head(1)

Unnamed: 0,episode_id,episodenbr,title,weekday,views,share,appreciation,chart,summary,date,doctorid,rating,votes,description,season
0,1.01,697,Rose,Sat,10.81,44.8,76.0,7,,2005-03-26,9,7.6,6504,When ordinary shop-worker Rose Tyler meets a m...,1


In [16]:
df['summary'] = df['summary'].fillna('No Summary Available')
df.head()

Unnamed: 0,episode_id,episodenbr,title,weekday,views,share,appreciation,chart,summary,date,doctorid,rating,votes,description,season
0,1.01,697,Rose,Sat,10.81,44.8,76.0,7,No Summary Available,2005-03-26,9,7.6,6504,When ordinary shop-worker Rose Tyler meets a m...,1
1,1.02,698,The End of the World,Sat,7.97,37.8,76.0,19,No Summary Available,2005-04-02,9,7.6,5684,The Doctor takes Rose to the year 5 billion to...,1
2,1.03,699,The Unquiet Dead,Sat,8.86,37.8,80.0,15,No Summary Available,2005-04-09,9,7.6,5326,The Doctor has great expectations for his late...,1
3,1.04,700,Aliens of London,Sat,7.63,35.7,82.0,18,No Summary Available,2005-04-16,9,7.0,5116,The Doctor returns Rose to her own time - well...,1
4,1.05,701,World War Three,Sat,7.98,40.2,81.0,20,No Summary Available,2005-04-23,9,7.1,4943,The Slitheen have infiltrated Parliament and h...,1


In [17]:
df['summary'].unique()

array(['No Summary Available',
       "When Rose travels back to the day of her father's death, her meddling in the course of events has terrible consequences for the human race - in the form of the fiendish Reapers.\n\nSynopsis Source: Radio Times",
       'The newly regenerated Doctor takes Rose home for the festive season, but\n the transition process has tired him and he falls unconscious - just as\n the planet comes under threat from evil alien race the Sycorax, who \nhave employed sinister Santas and murderous trees to do their bidding.',
       "On a parallel Earth where Rose's parents remained together, the Doctor sees a terrifying enemy reborn... The Cybermen are coming!",
       'The Cybermen take control of London and start converting the populace. It looks like even the Doctor is beaten...',
       "The Doctor and Rose land in a base on an 'impossible planet'. But the TARDIS is missing, there's a killer in the base, and who are the Ood?",
       'As Rose battles the murdero

### Adding Columns

In [19]:
df['has_9']  = df['doctorid'] == 9
df['has_10'] = df['doctorid'] == 10
df['has_11'] = df['doctorid'] == 11
df['has_12'] = df['doctorid'] == 12
df['has_13'] = df['doctorid'] == 13

df.head()

Unnamed: 0,episode_id,episodenbr,title,weekday,views,share,appreciation,chart,summary,date,doctorid,rating,votes,description,season,has_9,has_10,has_11,has_12,has_13
0,1.01,697,Rose,Sat,10.81,44.8,76.0,7,No Summary Available,2005-03-26,9,7.6,6504,When ordinary shop-worker Rose Tyler meets a m...,1,True,False,False,False,False
1,1.02,698,The End of the World,Sat,7.97,37.8,76.0,19,No Summary Available,2005-04-02,9,7.6,5684,The Doctor takes Rose to the year 5 billion to...,1,True,False,False,False,False
2,1.03,699,The Unquiet Dead,Sat,8.86,37.8,80.0,15,No Summary Available,2005-04-09,9,7.6,5326,The Doctor has great expectations for his late...,1,True,False,False,False,False
3,1.04,700,Aliens of London,Sat,7.63,35.7,82.0,18,No Summary Available,2005-04-16,9,7.0,5116,The Doctor returns Rose to her own time - well...,1,True,False,False,False,False
4,1.05,701,World War Three,Sat,7.98,40.2,81.0,20,No Summary Available,2005-04-23,9,7.1,4943,The Slitheen have infiltrated Parliament and h...,1,True,False,False,False,False


In [20]:
df['weekday'].unique()

array(['Sat', 'Sun', 'Mon', 'Tue', 'Fri'], dtype=object)

In [21]:
df = pd.get_dummies(df, columns=['weekday'], drop_first=True, prefix='day')
df.head()

Unnamed: 0,episode_id,episodenbr,title,views,share,appreciation,chart,summary,date,doctorid,...,season,has_9,has_10,has_11,has_12,has_13,day_Mon,day_Sat,day_Sun,day_Tue
0,1.01,697,Rose,10.81,44.8,76.0,7,No Summary Available,2005-03-26,9,...,1,True,False,False,False,False,0,1,0,0
1,1.02,698,The End of the World,7.97,37.8,76.0,19,No Summary Available,2005-04-02,9,...,1,True,False,False,False,False,0,1,0,0
2,1.03,699,The Unquiet Dead,8.86,37.8,80.0,15,No Summary Available,2005-04-09,9,...,1,True,False,False,False,False,0,1,0,0
3,1.04,700,Aliens of London,7.63,35.7,82.0,18,No Summary Available,2005-04-16,9,...,1,True,False,False,False,False,0,1,0,0
4,1.05,701,World War Three,7.98,40.2,81.0,20,No Summary Available,2005-04-23,9,...,1,True,False,False,False,False,0,1,0,0


In [22]:
df.replace({False: 0, True: 1}, inplace=True)
df['has_11'].unique()

array([0, 1], dtype=int64)

### Correlation Analysis

In [23]:
df.corr()

Unnamed: 0,episode_id,episodenbr,appreciation,chart,doctorid,rating,votes,season,has_9,has_10,has_11,has_12,has_13,day_Mon,day_Sat,day_Sun,day_Tue
episode_id,1.0,0.996921,-0.329043,0.168053,0.966445,-0.304684,-0.395395,0.999922,-0.490662,-0.588435,0.036907,0.603687,0.471761,-0.078638,-0.383035,0.406322,0.06793
episodenbr,0.996921,1.0,-0.31695,0.194821,0.964009,-0.283154,-0.395498,0.995919,-0.486152,-0.603353,0.054345,0.608256,0.454978,-0.090732,-0.357784,0.380753,0.061823
appreciation,-0.329043,-0.31695,1.0,-0.260721,-0.347308,0.549926,0.235551,-0.330742,-0.254506,0.379954,0.352814,-0.375185,-0.358403,-0.014896,0.313893,-0.323525,-0.02598
chart,0.168053,0.194821,-0.260721,1.0,0.11298,-0.038217,-0.294992,0.164042,0.118854,-0.252066,-0.110871,0.417973,-0.200637,-0.054962,0.320307,-0.264831,-0.132194
doctorid,0.966445,0.964009,-0.347308,0.11298,1.0,-0.337997,-0.341335,0.96655,-0.554591,-0.547767,0.031755,0.570571,0.534884,-0.071582,-0.417708,0.454248,0.051266
rating,-0.304684,-0.283154,0.549926,-0.038217,-0.337997,1.0,0.465623,-0.307224,0.042146,0.162622,0.103112,0.019207,-0.533155,-0.027511,0.427622,-0.464023,-0.078775
votes,-0.395395,-0.395498,0.235551,-0.294992,-0.341335,0.465623,1.0,-0.395378,0.132802,0.292113,-0.027256,-0.339602,-0.032979,0.004547,0.051488,-0.016931,-0.05665
season,0.999922,0.995919,-0.330742,0.164042,0.96655,-0.307224,-0.395378,1.0,-0.490878,-0.588004,0.03642,0.603657,0.472139,-0.07684,-0.385165,0.408561,0.068342
has_9,-0.490662,-0.486152,-0.254506,0.118854,-0.554591,0.042146,0.132802,-0.490878,1.0,-0.19868,-0.19868,-0.18545,-0.089243,-0.025963,0.11724,-0.097744,-0.045283
has_10,-0.588435,-0.603353,0.379954,-0.252066,-0.547767,0.162622,0.292113,-0.588004,-0.19868,1.0,-0.403846,-0.376954,-0.1814,0.13068,0.100241,-0.145553,0.01461


In [24]:
df['rating'].dtype

dtype('float64')

### Exporting Results

In [26]:
df.to_csv('data/Processed.csv')

df.head()

Unnamed: 0,episode_id,episodenbr,title,views,share,appreciation,chart,summary,date,doctorid,...,season,has_9,has_10,has_11,has_12,has_13,day_Mon,day_Sat,day_Sun,day_Tue
0,1.01,697,Rose,10.81,44.8,76.0,7,No Summary Available,2005-03-26,9,...,1,1,0,0,0,0,0,1,0,0
1,1.02,698,The End of the World,7.97,37.8,76.0,19,No Summary Available,2005-04-02,9,...,1,1,0,0,0,0,0,1,0,0
2,1.03,699,The Unquiet Dead,8.86,37.8,80.0,15,No Summary Available,2005-04-09,9,...,1,1,0,0,0,0,0,1,0,0
3,1.04,700,Aliens of London,7.63,35.7,82.0,18,No Summary Available,2005-04-16,9,...,1,1,0,0,0,0,0,1,0,0
4,1.05,701,World War Three,7.98,40.2,81.0,20,No Summary Available,2005-04-23,9,...,1,1,0,0,0,0,0,1,0,0
