# <center>An example of SNS</center>

- Threadless.com is a crowdsouring website for graphic designs.
- Desginers submit artworks and recieve ratings from the community within a seven-day period. 
- Designs with the best scores will be selected to print on T-shirts and other products for sale. 

### Webscraping objectives

- Get a sample of users and artifacts. Consider a sampling strategy. 
- Scrape artifact-level features.
- Scrape user-level features. 

In [1]:
import requests
from bs4 import BeautifulSoup
import csv
import re
import time


In [2]:
# Get five urls of pages as a sample of latest artifacts.

link="https://www.threadless.com/designs/archive?page="
num=list(range(1,6))
pages=[]
for i in num:
    page=link+str(i)
    pages.append(page)
print(pages)


['https://www.threadless.com/designs/archive?page=1', 'https://www.threadless.com/designs/archive?page=2', 'https://www.threadless.com/designs/archive?page=3', 'https://www.threadless.com/designs/archive?page=4', 'https://www.threadless.com/designs/archive?page=5']


In [3]:
# Get urls of all the designs in these pages
# To reduce the load to their server, will demonnstrate one page

designs=[]
for i in pages:
    print('working on page'+str(' ')+str(i))
    response=requests.get(i)
    soup=BeautifulSoup(response.content, "html.parser")
    links=soup.find('ol',class_='feed-archive th-grided')
    li=links.find_all('li',class_="old")
    for j in li:
        name=j.find("a")["href"]
        designs.append(name)
   

working on page https://www.threadless.com/designs/archive?page=1
working on page https://www.threadless.com/designs/archive?page=2
working on page https://www.threadless.com/designs/archive?page=3
working on page https://www.threadless.com/designs/archive?page=4
working on page https://www.threadless.com/designs/archive?page=5


In [4]:
designs[:5]

# can write out the sample of artifacts 
# with open('designs.csv', 'w') as csvfile:
#    writer=csv.writer(csvfile, delimiter=',')
#    writer.writerows(zip(designs))


# read in your sample
# raw_data_file = open("designs.csv", 'r')
# csv_data_file = csv.reader(raw_data_file, delimiter=',')
# designs = []
# for line in csv_data_file:
#     print(line[0])
#     designs.append(line[0])

 '/designs/unstoppable-9',
 '/designs/corporate-greed-2',
 '/designs/solar-lion',
 '/designs/goblincore-18']

In [21]:
# Get artifact level features
# For each design, get title, author, average score, number of scores, challenge name

rows=[]

for i in designs[:10]:
    try:
        url="https://www.threadless.com"+i
        response=requests.get(url)
        soup=BeautifulSoup(response.content, "html.parser")
        
        # initiate the variable for each period
        title=None
        author=None
        avg_score=None
        total_score=None
        
        ##title
        title=soup.select('div.submission-title h1')
        if title!=[]:
            title=title[0].text

        ##author
        author=soup.select('div.author-block a.author')
        if author!=[]:
            author=author[0].text

        ##score
        avg_score=soup.select('div.vote-avg span')
        if avg_score!=[]:
            avg_score=avg_score[0].text

        ##total scores
        total_score=soup.select('div.vote-count span')
        if total_score!=[]:
            total_score=total_score[0].text
        
        rows.append((title, author, avg_score, total_score))
        print((title, author, avg_score, total_score))
    
    except AttributeError:
        pass



('Un....Stoppable.', 'ArtofPig18', '2.80', '5')
('Corporate Greed 2', 'ArtofPig18', '2.00', '6')
('Solar Lion', 'MitsukiRising', '3.77', '26')
('goblincore', 'muktiharjanto', '2.33', '6')
('Brains!', 'SpaceDat120', '3.75', '4')
('Halloween T-shirt', 'sunsetandchill', '2.00', '2')
('Retro Atlantic blue ocean coast beach landscape in Portugal with 3D rhythmic concrete docking', 'EdyArtSpace', '1.60', '5')
('Break Wall', 'i997', '1.00', '2')
('Sinister Claus', 'gulayfather', '4.00', '3')
('Forever Yours', 'Elaraslove', '2.00', '1')
('Frog Witch', 'monstiker', '2.78', '9')
("Bushido '89", 'TheWestwood', '2.57', '7')
('Game Over Graffity Style', 'bakhus', '1.00', '2')
('WE ARE PEOPLE TO', 'CruzArtGallery', '3.00', '4')
('boo donut', 'KTRK', '3.33', '3')
('super mario x street fighter ken', 'KTRK', '3.00', '5')
('Banner', 'CallyRaphael', '2.00', '1')
('You Saw Nothing', 'Xentee', '2.50', '4')
('Okay to decay', 'biernatt', '2.86', '7')
('I am gaming', 'mhs23', '3.00', '2')
('please', 'Turbora

In [16]:
# Question: How to scrape the challenge information?

# 1. challenge name
# 2. how many designs per challenge


# add your code here

for i in designs[:5]:
    
    url="https://www.threadless.com"+i
    response=requests.get(url)
    soup=BeautifulSoup(response.content, "html.parser")
        
    challenge=soup.find("article",class_="about-the-challenge")
    title=challenge.select("li.challenge-title")[0].text
    num=challenge.select("i.fa-thumbs-up")[0].next_sibling.next_sibling.text
    print(title, num)



Threadless
 123012 designs

Horror
 4011 designs

Video Games That Don’t Exist Part 2., Presented by DesignerCon
 215 designs

Threadless
 123012 designs

Goblincore
 356 designs


In [22]:
# get authors
authors=[row[1] for row in rows]
authors=filter(None, authors)
authors_unique=list(set(authors))
print(authors_unique)
len(authors_unique)

['moviereplicars', 'muktiharjanto', 'NoMotiveInc', 'rteestyle', 'Xentee', 'PammyB', 'leeagosila', 'kimprut', 'bakhus', 'gerberaloka', 'Astrid13', 'mhs23', 'CruzArtGallery', 'MitsukiRising', 'Gabriel0', 'KTRK', 'biernatt', 'SpaceDat120', 'churrumiaus', 'EdyArtSpace', 'ClosetPrints', 'Turborat14', 'CallyRaphael', 'i997', 'makerofstuff', 'UnfoundedHope', 'TheWestwood', 'santokie', 'tobefonseca', 'ArtofPig18', 'gulayfather', 'Elaraslove', 'monstiker', 'dragongrr', 'sunsetandchill', 'EmeraldMakes']


36

In [18]:
# For the designers we found, get the summary of their experience
full=[]

for i in authors_unique[:5]:
    url="https://www.threadless.com/@"+i
    time.sleep(5)
    response=requests.get(url)
    soup=BeautifulSoup(response.content, "html.parser")
    
    # find all stats
    stats=soup.select('div.stats ul')
    li=stats[0].find_all('li')
    
    line=[None] * 5
    for j in li:
        char=(j.text).strip()
        
        # threads
        if re.search("started",char):
            line[0]=char
            #line[1]=re.findall(r"[0-9.]+", char)[0]
            
        # submitted
        if re.search("submitted",char):
            line[1]=char   
            #line[1]=re.findall(r"[0-9.]+", char)[0]

        # scored
        if re.search("scored",char):
            line[2]=char
            #line[2]=re.findall(r"[0-9.]+", char)[0]
        
        # given
        if re.search("Given",char):
            line[3]=char
            #line[3]=re.findall(r"[0-9.]+", char)[0]

        # since
        if re.search("since",char):
            line[4]=char
            #line[4]=re.findall(r"[0-9.]+", char)[0]
    
    line.append(i)
    print(line)
    full.append(line)
                     

[None, '34 designs submitted', '339 designs scored', 'Avg Score Given: 3.87', 'Member since 2022', 'ClosetPrints']
[None, '16 designs submitted', '6 designs scored', 'Avg Score Given: 5.00', 'Member since 2019', 'bakhus']
[None, '162 designs submitted', '126 designs scored', 'Avg Score Given: 4.76', 'Member since 2022', 'muktiharjanto']
[None, '9 designs submitted', '3 designs scored', 'Avg Score Given: 3.67', 'Member since 2022', 'CallyRaphael']
[None, '12 designs submitted', '1 design scored', 'Avg Score Given: 5.00', 'Member since 2015', 'i997']


In [19]:
# Question: how to scrape each designers' numbers of followers and following?



# add you code here
for i in authors_unique[:5]:
    url="https://www.threadless.com/@"+i
    time.sleep(5)
    response=requests.get(url)
    soup=BeautifulSoup(response.content, 'lxml')

    
    # get the section
    follow=soup.select("div.following li")
    following=follow[0].select("a span")[0].text
    follower=follow[1].select("a span")[0].text

    print(i, following, follower)



ClosetPrints 16 30
bakhus 5 25
muktiharjanto 94 65
CallyRaphael 1 1
i997 0 29


In [11]:
# Scrape the follower-followee network for each designer.
# Can we do this with beautifulsoup? 

from selenium import webdriver
from selenium.webdriver.chrome.options import Options


In [23]:
relations=[]

for i in authors_unique[3:5]:
    
    i=i.replace(" ","%20")
    
    follower_url="https://www.threadless.com/@"+i+"/followers"
    following_url="https://www.threadless.com/@"+i+"/following"

    # close a pop ad
    opts = Options()
    opts.add_argument("user-agent=gene")
    driver = webdriver.Chrome(options=opts)

    # one's follower   
    driver.get(follower_url)  
    time.sleep(5)
    
    # you can scroll many times if not reaching the end
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")  
    time.sleep(10)        
    html = driver.page_source
    soup = BeautifulSoup(html.encode('utf-8'),"html.parser")
    comp=soup.find('ol',class_="following-list")
    comp=comp.find_all("li")

    line=[]
    for k in comp:
        name=k.find("a")["href"]
        name=name.lstrip("/@")
        if name in authors_unique:
            # one's followers send the following tie
            line=[name, i]
            print(line)
            relations.append(line)
    
    # one's follwing
    driver.get(following_url)
    time.sleep(10)   
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")                
    time.sleep(25)  
    html = driver.page_source
    soup = BeautifulSoup(html.encode('utf-8'),"html.parser")
    comp=soup.find('ol',class_="following-list")
    comp=comp.find_all("li")

    line=[]
    for k in comp:
        name=k.find("a")["href"]
        name=name.lstrip("/@")
        if name in authors_unique:
            # one sends the following tie to those to follow
            line=[i, name]
            print(line)
            relations.append(line)
    driver.quit() 

['makerofstuff', 'Xentee']
