# Predicting TV Show Success

## Notes
- Shonda Rhimes' predictive superpower

- The problem statement
    - Find out which independent variables (IVs), or combinations of IVs, contribute the most to a show's success
    - Using this information after the pilot, can you determine whether or not a show will be successful?
    - Using this information before the pilot, can you determine whether or not a show will be successful?

- Possible independent variables:
    - producers/creators
    - network
    - viewer ratings
    - genre
    - air times (frequency and time of day)
    - actors
    - plot key words
    - certificate
    - runtime
    
- Operational definition of TV show success?

In [1]:
# Column names, in the order they should appear in the final table.
TABLE_COLUMNS = [
    'title',
    'genre',
    'created_by',
    'starring',
    'network',
    'release',
    'running_time',
    'num_seasons',
    'num_episodes',
]

# One independent, initially empty list per column; the scraper appends one
# entry per show to each of them.
TITLE, GENRE, CREATED_BY = [], [], []
STARRING, NETWORK, RELEASE = [], [], []
RUNNING_TIME, NUM_SEASONS, NUM_EPISODES = [], [], []

# Maps each column name to its backing list (the same list objects as above,
# so appending through DATABASE[...] also updates the named lists).
DATABASE = dict(zip(
    TABLE_COLUMNS,
    (
        TITLE,
        GENRE,
        CREATED_BY,
        STARRING,
        NETWORK,
        RELEASE,
        RUNNING_TIME,
        NUM_SEASONS,
        NUM_EPISODES,
    ),
))

# File name for the table as CSV (not referenced elsewhere in the visible code).
TABLE_NAME = 'shows.csv'

In [2]:
#Collect a list of Wikipedia links to all American TV shows

from bs4 import BeautifulSoup as soup
import pandas as pd
import re
from urllib import request
 
def soupTheLink(url):
    """Download ``url`` and return the page parsed with BeautifulSoup.

    Args:
        url: Address of the page to fetch.

    Returns:
        The document parsed by BeautifulSoup using the 'lxml' parser.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    # Use the response as a context manager so the connection is released as
    # soon as the body has been read, even when decoding fails.
    with request.urlopen(url) as response:
        html = response.read().decode('utf8')
    return soup(html, 'lxml')

def collectLinks(url, start=28, stop=2282):
    """Collect the first link of every list item on the page at ``url``.

    Each ``<li>`` contributes at most one URL: the ``href`` of its first
    ``<a>`` that has one, prefixed with the English Wikipedia host.

    Args:
        url: Address of the index page to scan.
        start: Index of the first collected link to keep.
        stop: Index one past the last collected link to keep.
            The defaults (28 and 2282) are the window the notebook was
            written with -- presumably they trim non-show links before and
            after the show list; verify against the current page layout.

    Returns:
        A list of absolute URL strings, ``collected[start:stop]``.
    """
    souped = soupTheLink(url)

    showLinks = []

    for listItem in souped.find_all('li'):
        link = listItem.find('a', href=True)
        # A list item without any href-bearing anchor yields None; skip it
        # explicitly rather than catching the TypeError that indexing None
        # would raise (which could also hide unrelated errors).
        if link is None:
            continue
        showLinks.append('https://en.wikipedia.org' + link['href'])

    return showLinks[start:stop]

def _columnForFeature(feature):
    """Map the children of an infobox header cell to a DATABASE column name.

    Returns None when the header is not one of the tracked features.
    """
    if feature == ['Genre']:
        return 'genre'
    if feature == ['Created by'] or feature == ['Written by']:
        return 'created_by'
    if feature == ['Starring']:
        return 'starring'
    if feature == ['Original network']:
        return 'network'
    if feature == ['Original release']:
        return 'release'
    if feature == ['Running time']:
        return 'running_time'
    # An empty header cell has no last child; checking first avoids IndexError.
    if not feature:
        return None
    # Season/episode headers are split into several children (an <abbr> plus
    # text), so only the trailing text is compared.
    if feature[-1] == ' of seasons':
        return 'num_seasons'
    if feature[-1] == ' of episodes':
        return 'num_episodes'
    return None


def _anchorContents(cell, skipBracketed=False):
    """Return the children of every <a> inside ``cell`` as one flat list."""
    pieces = []
    for anchor in cell.find_all('a'):
        # str() of the children list looks like "['Name']", so position 2 is
        # the first character of the link text. Links whose text starts with
        # '[' are skipped -- presumably footnote markers such as "[1]".
        # Slicing (instead of indexing) keeps an empty <a> from raising.
        if skipBracketed and str(anchor.contents)[2:3] == '[':
            continue
        pieces.extend(anchor.contents)
    return pieces


def _linkedValues(cells):
    """Return the link contents found in all ``cells`` (genre, network)."""
    values = []
    for cell in cells:
        values.extend(_anchorContents(cell))
    return values


def _peopleValues(cells, skipBracketed=False):
    """Return names from ``cells`` (created by, starring).

    A cell with exactly one child contributes that child as-is; any other
    cell contributes the contents of its links.
    """
    names = []
    for cell in cells:
        if len(cell) == 1:
            names.extend(cell.contents)
        else:
            names.extend(_anchorContents(cell, skipBracketed))
    return names


def _releaseValue(cells):
    """Return the release text from the first data cell, or None."""
    if not cells or not cells[0].contents:
        return None
    contents = cells[0].contents

    # NOTE(review): this is an exact-type test, so children that are str
    # subclasses (bs4 text nodes appear to be) count as non-str and the third
    # child is used -- confirm that is intended before changing it to
    # isinstance(), because that would change which date gets stored.
    if type(contents[0]) != str:
        if len(contents) < 3:
            return None
        text = contents[2]
    else:
        text = contents[0]

    # Only text can be trimmed or stored; a tag in this position is treated
    # as "could not parse".
    if not isinstance(text, str):
        return None

    # Drop a leading " \u2013" (space + en dash) so only the date remains.
    if text[1:2] == '–':
        print('it starts with a dash')
        text = text[2:]
    print('Original release: ', text)
    return text


def _runningTimeValue(cells):
    """Return the first two characters of the running-time text, or None."""
    if not cells:
        return None
    contents = cells[0].contents
    print('Running time: ', contents)
    if not contents or not isinstance(contents[0], str):
        return None
    # e.g. '21 minutes' -> '21'
    return contents[0][0:2]


def _firstChild(cells):
    """Return the first child of the first data cell, or None."""
    if not cells or not cells[0].contents:
        return None
    return cells[0].contents[0]


def _extractValue(column, row):
    """Extract the value for ``column`` from infobox ``row``.

    Returns None when the row holds nothing usable for that column.
    """
    cells = row.find_all('td')

    if column == 'genre':
        value = _linkedValues(cells)
        print('Genre: ', value)
    elif column == 'created_by':
        value = _peopleValues(cells, skipBracketed=True)
        print('Created by: ', value)
    elif column == 'starring':
        value = _peopleValues(cells)
        print('Starring: ', value)
    elif column == 'network':
        value = _linkedValues(cells)
        print('Network: ', value)
    elif column == 'release':
        value = _releaseValue(cells)
    elif column == 'running_time':
        value = _runningTimeValue(cells)
    elif column == 'num_seasons':
        value = _firstChild(cells)
        if value is not None:
            print('seasons: ', value)
    elif column == 'num_episodes':
        value = _firstChild(cells)
        if value is not None:
            print('episodes:', value)
    else:
        value = None
    return value


def compileData(listOfShowLinks, limit=5):
    """Scrape show infoboxes and append one row per show to DATABASE.

    For each URL the page is fetched with soupTheLink, every element with
    class 'infobox' is scanned row by row, and recognised header cells
    (genre, creators, cast, network, release, running time, season and
    episode counts) are extracted. Progress is printed along the way.

    Exactly one entry is appended to every DATABASE list per show, so the
    lists always stay the same length: the first usable occurrence of a
    feature wins, and a feature that is absent or cannot be parsed is
    recorded as the string "None".

    Args:
        listOfShowLinks: Show page URLs. The title is taken as everything
            after the first 30 characters, i.e. after
            'https://en.wikipedia.org/wiki/'.
        limit: Maximum number of leading URLs to process (default 5);
            pass None to process them all.

    Returns:
        None. Results are appended to the module-level DATABASE lists.
    """
    for url in listOfShowLinks[:limit]:
        record = {'title': url[30:]}

        souped = soupTheLink(url)

        for table in souped.find_all(class_='infobox'):
            for row in table.find_all('tr'):
                for category in row.find_all('th'):
                    feature = category.contents
                    print('feature = ', feature)

                    column = _columnForFeature(feature)
                    # Skip untracked headers and features already captured
                    # for this show (keeps one value per column).
                    if column is None or column in record:
                        continue

                    value = _extractValue(column, row)
                    if value is not None:
                        record[column] = value

        # Append after the whole page is parsed so a failure part-way through
        # a page cannot leave the columns with different lengths.
        for column, values in DATABASE.items():
            values.append(record.get(column, "None"))

In [3]:
# Index page whose list items are scanned for links to individual show pages.
mainUrl = 'https://en.wikipedia.org/wiki/List_of_American_television_series'

In [4]:
# Fetches the index page over the network and builds the list of show URLs.
links = collectLinks(mainUrl)

In [5]:
# Scrapes the show pages into DATABASE, printing each infobox header it sees;
# as written only the first five links are processed.
compileData(links)

feature =  ['$#*! My Dad Says']
feature =  ['Genre']
Genre:  ['Sitcom']
feature =  ['Created by']
Created by:  ['David Kohan', 'Max Mutchnick', 'Justin Halpern']
feature =  ['Starring']
Starring:  ['William Shatner', 'Jonathan Sadowski', 'Will Sasso', 'Nicole Sullivan', 'Tim Bagley']
feature =  ['Opening theme']
feature =  ['Composer(s)']
feature =  ['Country of origin']
feature =  ['Original ', <span class="nowrap">language(s)</span>]
feature =  [<abbr title="Number">No.</abbr>, ' of seasons']
seasons:  1
feature =  [<abbr title="Number">No.</abbr>, ' of episodes']
episodes: 18 
feature =  ['Production']
feature =  ['Executive ', <span class="nowrap">producer(s)</span>]
feature =  ['Camera setup']
feature =  ['Running time']
Running time:  ['21 minutes']
feature =  ['Production ', <span class="nowrap">company(s)</span>]
feature =  ['Distributor']
feature =  ['Release']
feature =  ['Original network']
Network:  ['CBS']
feature =  ['Picture format']
feature =  ['Original release']
it st

In [6]:
# Dump the raw column lists to inspect the scrape before building a table.
print(DATABASE)

{'network': [['CBS'], ['CBS'], ['CBS'], ['NBC', 'GSN'], ['HBO']], 'running_time': ['21', '22', '22', '41', '25'], 'num_seasons': ['1', '4', '4', '3', '6'], 'num_episodes': ['18 ', 'None', 'None', '68', '80'], 'genre': [['Sitcom'], ['Game show'], ['Game show'], ['Game show'], 'None'], 'starring': [['William Shatner', 'Jonathan Sadowski', 'Will Sasso', 'Nicole Sullivan', 'Tim Bagley'], 'None', 'None', 'None', ['Delta Burke', 'O. J. Simpson', 'Shannon Tweed', 'John Matuszak', 'Jason Beghe', 'Geoffrey Scott', 'Keith Amos', 'Paul Tuerpe', 'Prince Hughes', 'Shanna Reed', 'Reid Shelton']], 'title': ['$h*!_My_Dad_Says', 'The_$64,000_Question', 'The_$64,000_Question', '1_vs._100_(U.S._game_show)', '1st_%26_Ten_(HBO_TV_series)'], 'release': [' February 17, 2011', ' November 2, 1958', ' November 2, 1958', '\nOctober 13, 2006', ' January 23, 1991'], 'created_by': [['David Kohan', 'Max Mutchnick', 'Justin Halpern'], ['Joseph Nathan Kane'], ['Joseph Nathan Kane'], 'None', ['Carl Kleinschmitt']]}


In [7]:
# Build the table with columns in TABLE_COLUMNS order; pandas requires every
# list in DATABASE to have the same length.
df = pd.DataFrame(DATABASE, columns = TABLE_COLUMNS)

In [8]:
# Preview the first rows of the table (head() shows five by default).
df.head()

Unnamed: 0,title,genre,created_by,starring,network,release,running_time,num_seasons,num_episodes
0,$h*!_My_Dad_Says,[Sitcom],"[David Kohan, Max Mutchnick, Justin Halpern]","[William Shatner, Jonathan Sadowski, Will Sass...",[CBS],"February 17, 2011",21,1,18.0
1,"The_$64,000_Question",[Game show],[Joseph Nathan Kane],,[CBS],"November 2, 1958",22,4,
2,"The_$64,000_Question",[Game show],[Joseph Nathan Kane],,[CBS],"November 2, 1958",22,4,
3,1_vs._100_(U.S._game_show),[Game show],,,"[NBC, GSN]","October 13, 2006",41,3,68.0
4,1st_%26_Ten_(HBO_TV_series),,[Carl Kleinschmitt],"[Delta Burke, O. J. Simpson, Shannon Tweed, Jo...",[HBO],"January 23, 1991",25,6,80.0
