In [544]:
import requests
from bs4 import BeautifulSoup
import re
import csv
from datetime import datetime
import time
import random
import pandas as pd
import pickle

# Source: https://www.kickstarter.com/discover/advanced?sort=end_date

# The discover listing offers 200 pages of data, each with 12 results,
# so the page numbers to request run from 1 through 200 inclusive.
kickstarter_pages = [*range(1, 201)]

In [512]:
def get_data(pages):
    """Collect project links from Kickstarter "discover" listing pages.

    Parameters
    ----------
    pages : iterable
        Page numbers to request. Each value is substituted into the
        ``page=`` query parameter of the listing URL.

    Returns
    -------
    list of str
        Scheme-less project links (``www.kickstarter.com/projects/...``)
        gathered from all requested pages, in request order.

    Raises
    ------
    requests.RequestException
        If a request fails, times out, or returns an HTTP error status.
    """
    # Example of a fully qualified listing URL, kept for reference only:
    # https://www.kickstarter.com/discover/advanced?raised=0&sort=end_date&seed=2540281&page=1
    url = "https://www.kickstarter.com/discover/advanced?woe_id=0&page={}"
    # A project link runs until the first quote, comma or ampersand.
    find_projects = re.compile(r'www\.kickstarter\.com/projects/[^",&]+')
    all_project_links = []
    for i, page in enumerate(pages, start=1):
        print('Working on Page: ' + str(i))  # So we can watch it work
        response = requests.get(url.format(page), timeout=30)
        # Fail loudly on an error page instead of silently harvesting nothing.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "lxml")
        one_page_project_links = find_projects.findall(soup.prettify())
        # The first match is not part of the data set and every remaining link
        # appears twice, so keep matches 1, 3, 5, ... (safe on an empty list).
        all_project_links.extend(one_page_project_links[1::2])
    return all_project_links

In [513]:
def examine_links(link_list, expected_count=None):
    """Print a link list together with a check of its length.

    Parameters
    ----------
    link_list : list
        The links to report on; each one is printed on its own line.
    expected_count : int, optional
        Number of links that should be present. When omitted it defaults to
        12 links for every entry in the module-level ``kickstarter_pages``.

    Returns
    -------
    None
        The function only prints to standard output.
    """
    if expected_count is None:
        # Original behaviour: 12 results per listing page requested.
        expected_count = 12 * len(kickstarter_pages)
    print('Links contained in list: ' + str(len(link_list)))
    if len(link_list) != expected_count:
        print("Something went wrong, we didn't get the number of links we expected!")
    else:
        print("Correct number of links detected!\n")
    for link in link_list:
        print(link)

In [514]:
# Harvest the project links from every listing page; the result is expected
# to be 12 * len(kickstarter_pages) links long.
campaign_url_list = get_data(pages=kickstarter_pages)
# Print the full list of links and confirm that it has the expected length.
examine_links(link_list=campaign_url_list)

Working on Page: 1
Working on Page: 2
Working on Page: 3
Working on Page: 4
Working on Page: 5
Working on Page: 6
Working on Page: 7
Working on Page: 8
Working on Page: 9
Working on Page: 10
Working on Page: 11
Working on Page: 12
Working on Page: 13
Working on Page: 14
Working on Page: 15
Working on Page: 16
Working on Page: 17
Working on Page: 18
Working on Page: 19
Working on Page: 20
Working on Page: 21
Working on Page: 22
Working on Page: 23
Working on Page: 24
Working on Page: 25
Working on Page: 26
Working on Page: 27
Working on Page: 28
Working on Page: 29
Working on Page: 30
Working on Page: 31
Working on Page: 32
Working on Page: 33
Working on Page: 34
Working on Page: 35
Working on Page: 36
Working on Page: 37
Working on Page: 38
Working on Page: 39
Working on Page: 40
Working on Page: 41
Working on Page: 42
Working on Page: 43
Working on Page: 44
Working on Page: 45
Working on Page: 46
Working on Page: 47
Working on Page: 48
Working on Page: 49
Working on Page: 50
Working o

In [546]:
# Category lookup table: the first entry of each inner list is the top-level
# category, the remaining entries are its sub-categories.
_KICKSTARTER_CATEGORIES = [
    ['Art', 'Ceramics', 'Conceptual Art', 'Digital Art', 'Illustration', 'Installations', 'Mixed Media', 'Painting', 'Performance Art', 'Public Art', 'Sculpture', 'Textiles', 'Video Art'],
    ['Comics', 'Anthologies', 'Comic Books', 'Events', 'Graphic Novels', 'Webcomics'],
    ['Crafts', 'Candles', 'Crochet', 'DIY', 'Embroidery', 'Glass', 'Knitting', 'Pottery', 'Printing', 'Quilts', 'Stationery', 'Taxidermy', 'Weaving', 'Woodworking'],
    ['Dance', 'Performances', 'Residencies', 'Spaces', 'Workshops'],
    ['Design', 'Architecture', 'Civic Design', 'Graphic Design', 'Interactive Design', 'Product Design', 'Typography'],
    ['Fashion', 'Accessories', 'Apparel', 'Childrenswear', 'Couture', 'Footwear', 'Jewelry', 'Pet Fashion', 'Ready-to-wear'],
    ['Film_And_Video', 'Action', 'Animation', 'Comedy', 'Documentary', 'Drama', 'Experimental', 'Family', 'Fantasy', 'Festivals', 'Horror', 'Movie Theaters', 'Music Videos', 'Narrative Film', 'Romance', 'Science Fiction', 'Shorts', 'Television', 'Thrillers', 'Webseries'],
    ['Food', 'Bacon', 'Community Gardens', 'Cookbooks', 'Drinks', 'Events', "Farmer's Markets", 'Farms', 'Food Trucks', 'Restaurants', 'Small Batch', 'Spaces', 'Vegan'],
    ['Games', 'Gaming Hardware', 'Live Games', 'Mobile Games', 'Playing Cards', 'Puzzles', 'Tabletop Games', 'Video Games'],
    ['Journalism', 'Audio', 'Photo', 'Print', 'Video', 'Web'],
    ['Music', 'Blues', 'Chiptune', 'Classical Music', 'Comedy', 'Country & Folk', 'Electronic Music', 'Faith', 'Hip-Hop', 'Indie Rock', 'Jazz', 'Kids', 'Latin', 'Metal', 'Pop', 'Punk', 'R&B', 'Rock', 'World Music'],
    ['Photography', 'Animals', 'Fine Art', 'Nature', 'People', 'Photobooks', 'Places'],
    ['Publishing', 'Academic', 'Anthologies', 'Art Books', 'Calendars', "Children's Books", 'Comedy', 'Fiction', 'Letterpress', 'Literary Journals', 'Nonfiction', 'Periodicals', 'Poetry', 'Radio & Podcasts', 'Translations', 'Young Adult', 'Zines', 'Literary Spaces'],
    ['Technology', '3D Printing', 'Apps', 'Camera Equipment', 'DIY Electronics', 'Fabrication Tools', 'Flight', 'Gadgets', 'Hardware', 'Makerspaces', 'Robots', 'Software', 'Sound', 'Space Exploration', 'Wearables', 'Web'],
    ['Theater', 'Comedy', 'Experimental', 'Festivals', 'Immersive', 'Musical', 'Plays', 'Spaces'],
]

# Captures the ``data-end_time`` attribute (name plus value, without the closing quote).
_END_TIME_PATTERN = re.compile(r'data-end_time="[^"]+')

# A run of digits, optionally with thousands separators, e.g. "7" or "1,234".
_COUNT_PATTERN = re.compile(r'\d[\d,]*')


def scrape_vairables(project_soup, url):
    """Extract one row of campaign features from a parsed project page.

    Parameters
    ----------
    project_soup : bs4.BeautifulSoup
        Parsed HTML of a single campaign page. The lookups below depend on
        specific CSS classes and text offsets, so the page is presumably a
        live (still running) campaign in the layout this was written for.
    url : str
        The campaign URL; it is copied into the result unchanged.

    Returns
    -------
    list
        ``[title, url, total_goal, raised_so_far, backers_count,
        days_remaining, number_images, contains_video, tier_count,
        updates_total, title_length, category, sub_category,
        all_or_nothing, total_text]``. ``category`` is a list holding every
        top-level category whose table entry contains ``sub_category``; it
        can be empty or hold several names (e.g. 'Comedy' is listed under
        more than one category).

    Raises
    ------
    AttributeError, IndexError, ValueError
        If an expected element is missing or its text does not have the
        expected shape.
    """
    # How much money have they raised so far?
    # Third line of the block's text, commas removed, first character
    # (presumably the currency symbol) dropped.
    raised_so_far = project_soup.find("div", class_='mb2-lg').text.splitlines()[2].replace(',', '')[1:]
    if raised_so_far == '':
        # Fall back to the dedicated "pledged" span when that line is empty.
        raised_so_far = project_soup.select_one("span[class*='green-700 inline-block js-convert_pledged medium type-16 type-24-md']").text.strip('\n')[1:].replace(',', '')
    raised_so_far = int(float(raised_so_far))

    # What was the total goal they are trying to reach?
    # Fixed offsets strip the text around the number - TODO confirm the
    # exact wording they correspond to.
    total_goal = int(project_soup.select_one("span[class*='block navy-600 type-12 type-14-md lh3-lg']").text[13:-6].replace(',', ''))

    # How many updates have they sent to their network through KS?
    # Take the last whole number in the tab's text. Reading a single
    # character (text[-2]) would truncate any count of 10 or more.
    updates_text = project_soup.find(attrs={'class' : 'js-load-project-content js-load-project-updates mx3 project-nav__link--updates tabbed-nav__link type-14'}).text
    update_counts = _COUNT_PATTERN.findall(updates_text)
    if not update_counts:
        raise ValueError('no update count found in: ' + repr(updates_text))
    updates_total = int(update_counts[-1].replace(',', ''))

    # How many days remain in the campaign?
    remaining_time_html = project_soup.find_all("div", class_="poll ksr_page_timer")
    # The second timer element is the one that is read.
    end_date_string = str(_END_TIME_PATTERN.findall(str(remaining_time_html[1])))
    # Drop the leading ``['data-end_time="`` (17 characters) and the last 8
    # characters - presumably a UTC offset such as ``-04:00`` followed by the
    # list repr's ``']`` - TODO confirm.
    end_date__string_cleaned = end_date_string[17:-8]
    campaign_end_date = datetime.strptime(end_date__string_cleaned, '%Y-%m-%dT%H:%M:%S')
    # NOTE(review): the offset is discarded and the result is compared with
    # the naive local clock, so the day count may be off near a boundary.
    right_now = datetime.now()
    time_remaining = campaign_end_date - right_now
    days_remaining = int(time_remaining.days)

    # What was the campaign's title?
    title = project_soup.select_one("h2[class*='type-24 type-28-sm type-38-md navy-700 medium mb3']").text.strip('\n')

    # How many people have backed it so far?
    backers_count = int(project_soup.find(attrs={'id' : 'backers_count'}).text.strip('\n').replace(',', ''))

    # Was it an all-or-nothing campaign?
    all_or_nothing_text = project_soup.find(attrs={'class' : 'navy-700 pointer'}).text
    all_or_nothing = (all_or_nothing_text == "All or nothing.")

    # What Sub-Category was the campaign?
    sub_category = project_soup.select_one("a[class*='nowrap navy-700 flex items-center medium mr3 type-12']").text.strip('\n')
    if sub_category == 'Project We Love':
        # The first badge is the "Project We Love" label; read the next one.
        sub_category = project_soup.find_all("a", class_='nowrap navy-700 flex items-center medium mr3 type-12')[1].text.splitlines()[-1]

    # What Category was the campaign?
    category = [cat[0] for cat in _KICKSTARTER_CATEGORIES if sub_category in cat]

    # How long was the title (in characters)?
    title_length = len(title)

    # What is the total number of tiers? One 'pledge__info' block is not
    # counted as a tier.
    tier_count = len(project_soup.find_all("div", class_='pledge__info')) - 1

    # Is there a video?
    video_contents = project_soup.find("video", class_='has_hls landscape')
    contains_video = bool(video_contents)

    # How many images are on the page?
    number_images = len(project_soup.find_all("div", class_='template asset'))

    # How much text is on the page? Total characters across all <p> elements.
    all_page_paragraphs = project_soup.find_all("p")
    total_text = len(''.join(item.text for item in all_page_paragraphs))

    data = [title, url, total_goal, raised_so_far, backers_count, days_remaining, number_images, contains_video, tier_count, updates_total, title_length, category, sub_category, all_or_nothing, total_text]
    return data


In [540]:
# Starting with a shorter list of URLs
short_list = campaign_url_list[0:2400]
# Note: if a campaign in campaign_url_list finishes while we are scraping, its
# page format changes and the feature extraction raises. Such pages are
# recorded in `failed_urls` and skipped, so one finished campaign no longer
# aborts the whole run.
# Sample URLs without and with a scheme; they are not used by the loop below.
url1 = 'www.kickstarter.com/projects/1227872561/unbroken-a-solo-game-of-survival-and-revenge'
url2 = 'http://www.kickstarter.com/projects/1227872561/unbroken-a-solo-game-of-survival-and-revenge'
scrape_store = {}   # position in short_list (0-based) -> [url, raw HTML]
failed_urls = []    # (url, error description) for every page that was skipped
scraped_rows = []   # one feature list per successfully scraped page
i = 0
data_headers = ['title', 'URL','total_goal', 'raised_so_far', 'backers_count', 'days_remaining', 'number_images', 'contains_video', 'tier_count', 'updates_total', 'title_length', 'category', 'sub_category', 'all_or_nothing', 'total_text']
for i, url in enumerate(short_list, start=1):
    try:
        project_response = requests.get('http://' + url, timeout=30)
        project_page = project_response.text
        scrape_store[i - 1] = [url, project_page]
        project_soup = BeautifulSoup(project_page,"lxml")
        scraped_rows.append(scrape_vairables(project_soup, url))
    except (AttributeError, IndexError, ValueError, requests.RequestException) as error:
        # Missing elements, unexpected text, or a failed request: note and move on.
        failed_urls.append((url, repr(error)))
        print('Skipping ' + url + ': ' + repr(error))
    # Random pause between requests.
    time.sleep(.5 + random.random())
    print('Sraping page: ' + str(i))

# Build the frame once; appending row by row is quadratic and
# DataFrame.append no longer exists in pandas 2.x.
df = pd.DataFrame(scraped_rows, columns=data_headers)

# Keep the raw pages. newline='' is what the csv module expects for its file
# objects, and UTF-8 avoids encoding errors on non-ASCII page content.
with open('all_scrapes.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    for key, value in scrape_store.items():
        writer.writerow([key, value])
        

# Response variable is the percentage of the planned dollars (the goal) that has been pledged.


Sraping page: 1
Sraping page: 2
Sraping page: 3
Sraping page: 4
Sraping page: 5
Sraping page: 6
Sraping page: 7
Sraping page: 8
Sraping page: 9
Sraping page: 10
Sraping page: 11
Sraping page: 12
Sraping page: 13
Sraping page: 14
Sraping page: 15
Sraping page: 16
Sraping page: 17
Sraping page: 18
Sraping page: 19
Sraping page: 20
Sraping page: 21
Sraping page: 22
Sraping page: 23
Sraping page: 24
Sraping page: 25
Sraping page: 26
Sraping page: 27
Sraping page: 28
Sraping page: 29
Sraping page: 30
Sraping page: 31
Sraping page: 32
Sraping page: 33
Sraping page: 34
Sraping page: 35
Sraping page: 36
Sraping page: 37
Sraping page: 38
Sraping page: 39
Sraping page: 40
Sraping page: 41
Sraping page: 42
Sraping page: 43
Sraping page: 44
Sraping page: 45
Sraping page: 46
Sraping page: 47
Sraping page: 48
Sraping page: 49
Sraping page: 50
Sraping page: 51
Sraping page: 52
Sraping page: 53
Sraping page: 54
Sraping page: 55
Sraping page: 56
Sraping page: 57
Sraping page: 58
Sraping page: 59
Srapin

In [542]:
# Combine the partial scrape results into one frame with a single concat;
# DataFrame.append was removed in pandas 2.0. As with append, the original
# row labels of each partial frame are kept.
# NOTE(review): df1..df6 are not defined in the visible cells - presumably
# saved from earlier scraping runs; confirm they are loaded before this cell.
kickstarter_data = pd.concat([df1, df2, df3, df4, df5, df6])

# Shape of each partial frame, then of the combined frame.
for partial_frame in (df1, df2, df3, df4, df5, df6):
    print(partial_frame.shape)

print(kickstarter_data.shape)
kickstarter_data.sample(20)

(448, 15)
(910, 15)
(343, 15)
(143, 15)
(225, 15)
(325, 15)
(2394, 15)


Unnamed: 0,title,URL,total_goal,raised_so_far,backers_count,days_remaining,number_images,contains_video,tier_count,updates_total,title_length,category,sub_category,all_or_nothing,total_text
442,CubeYourLife! One Cube 80 games to Play Togeth...,www.kickstarter.com/projects/1207576737/cube-y...,24446,9733,148,23,67,True,12,3,59,[Games],Games,True,8267
477,"Pleasure Boat Studio: A Literary Press, a book...",www.kickstarter.com/projects/1678535734/pleasu...,5000,325,6,3,0,True,1,0,59,[Publishing],Publishing,True,2531
142,Physical Skycoin Limited Edition Collectable Coin,www.kickstarter.com/projects/293386705/physica...,7333,7272,50,15,4,False,12,4,49,[Technology],Gadgets,True,10787
432,Bad Mouth Party Game,www.kickstarter.com/projects/jaysterbaby/bad-m...,7790,222,9,19,6,True,5,2,20,[Games],Tabletop Games,True,5046
160,Responsible Firearm Programs From Top Master I...,www.kickstarter.com/projects/1689485012/respon...,122000,310,5,51,12,True,8,0,56,[Publishing],Publishing,True,12225
291,GEORGE® THE WORLD'S BEST ARMATURE FOR STOP MOT...,www.kickstarter.com/projects/973708455/george-...,3178,8025,46,50,73,True,11,3,59,[Design],Product Design,True,13676
858,Hollow Monsters: WHO IS THE HOLLOW MAN?,www.kickstarter.com/projects/1435105183/hollow...,2796,3478,108,15,34,False,14,2,39,[Comics],Comics,True,10583
74,Zeno™ | Italian Design meets Non-Iron & No-Sta...,www.kickstarter.com/projects/zenoshirt/zenotm-...,12223,16066,109,0,29,True,10,3,59,[Fashion],Apparel,True,8268
816,Save the Kestrel! They need nest boxes.,www.kickstarter.com/projects/621619349/save-th...,500,110,4,12,0,True,7,0,39,[Crafts],Woodworking,True,5970
862,The Cityscape Table - Inspired by Iconic Cities,www.kickstarter.com/projects/agardc/the-citysc...,1887,461,2,5,25,True,6,1,47,[Design],Product Design,True,5232


In [545]:
# Persist the combined campaign data to disk as a pickle so that later
# work can reload it without repeating the scrape.
with open('kickstarter_data_pickle.pkl', mode='wb') as picklefile:
    pickle.dump(kickstarter_data, picklefile)