Name: Khawlah Alarfaj

Project: Project 4 Clustering

## 1. Data Acquisition (Scraping)

In [1]:
import numpy as np 
import pandas as pd
import string
import pickle
import time
import json

# Used for Data Acquisition(Scraping)
from bs4 import BeautifulSoup as bs
from urllib.request import urlopen
from selenium import webdriver

# Used for text preprocessing/nlp
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
import langid # !pip install langid
import nltk
import re
# nltk.download('stopwords')
# nltk.download('punkt')

# Used for Modelling 
from sklearn.model_selection import train_test_split, KFold

# Used for Confusion Matrix
from sklearn import metrics

# Used to disable printing warnings 
import warnings
warnings.filterwarnings("ignore")

# Used for plotting data
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
# Local HTML files used as scraping input: three pages per category.
fashion_page1, fashion_page2, fashion_page3 = (
    'html_pages (scraping)/fashion_page1.html',
    'html_pages (scraping)/fashion_page2.html',
    'html_pages (scraping)/fashion_page3.html',
)

art_page1, art_page2, art_page3 = (
    'html_pages (scraping)/art_page1.html',
    'html_pages (scraping)/art_page2.html',
    'html_pages (scraping)/art_page3.html',
)

food_page1, food_page2, food_page3 = (
    'html_pages (scraping)/food_page1.html',
    'html_pages (scraping)/food_page2.html',
    'html_pages (scraping)/food_page3.html',
)

health_page1, health_page2, health_page3 = (
    'html_pages (scraping)/health_page1.html',
    'html_pages (scraping)/health_page2.html',
    'html_pages (scraping)/health_page3.html',
)

entertainment_page1, entertainment_page2, entertainment_page3 = (
    'html_pages (scraping)/entertainment_page1.html',
    'html_pages (scraping)/entertainment_page2.html',
    'html_pages (scraping)/entertainment_page3.html',
)

technology_page1, technology_page2, technology_page3 = (
    'html_pages (scraping)/technology_page1.html',
    'html_pages (scraping)/technology_page2.html',
    'html_pages (scraping)/technology_page3.html',
)

cats_page1, cats_page2, cats_page3 = (
    'html_pages (scraping)/cats_page1.html',
    'html_pages (scraping)/cats_page2.html',
    'html_pages (scraping)/cats_page3.html',
)


In [3]:
def get_usernames(pages_list, start=0, end=52):
    '''
    Function to get the influencers' usernames from (influence.co)

    Every file in ``pages_list`` is parsed, and for each element with class
    "advanced-search-card clearfix" at positions ``start`` up to (but not
    including) ``end`` the last path segment of its ``href`` is collected.

    Parameters
    ----------
    pages_list : iterable of str
        Paths of the HTML files to read.
    start, end : int
        Half-open range of card positions read on every page.

    Returns
    -------
    list of str
        Usernames in page order, then card order (duplicates are kept).

    Raises
    ------
    IndexError
        If a page contains fewer than ``end`` matching cards.
    '''
    usernames = []
    for page in pages_list:
        # The context manager closes the handle; a bare open().read() left
        # one file descriptor open per page.
        with open(page, "r") as html_file:
            soup = bs(html_file.read(), "html")

        # Query the document once per page instead of once per card index.
        cards = soup.find_all(class_="advanced-search-card clearfix")
        for num in range(start, end):
            usernames.append(cards[num]['href'].split('/')[-1])

    return usernames

In [4]:
# One username list per category, read from that category's saved pages.
fashion_usernames = get_usernames(
    [fashion_page1, fashion_page2, fashion_page3]
)
art_usernames = get_usernames(
    [art_page1, art_page2, art_page3]
)
food_usernames = get_usernames(
    [food_page1, food_page2, food_page3]
)
health_usernames = get_usernames(
    [health_page1, health_page2, health_page3]
)
entertainment_usernames = get_usernames(
    [entertainment_page1, entertainment_page2, entertainment_page3]
)
technology_usernames = get_usernames(
    [technology_page1, technology_page2, technology_page3]
)
# Cats: pages 1 and 3 use the default card range, page 2 only cards 13..38.
cats_usernames = (
    get_usernames([cats_page1, cats_page3])
    + get_usernames([cats_page2], 13, 39)
)

In [5]:
# A set keeps each username once even if it appears in several lists.
usernames = set().union(
    fashion_usernames,
    art_usernames,
    food_usernames,
    health_usernames,
    entertainment_usernames,
    technology_usernames,
    cats_usernames,
)

In [6]:
class InstagramScraper:
    '''
    Collects post captions and the profile biography of Instagram users.

    Post links are gathered by scrolling the profile page in a Chrome window
    driven through Selenium. Post pages and the profile page are then
    downloaded with ``urlopen`` and the JSON that follows
    ``window._sharedData =`` in the first ``<script>`` of the body is parsed.
    '''

    def __init__(self):
        self.browser = webdriver.Chrome()

    def close(self):
        '''Quit the browser window started in ``__init__``.'''
        self.browser.quit()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def get_posts_from_user(self, username, number_of_posts=10, wait_for_scroll=1.0):
        '''
        Return up to ``number_of_posts`` post URLs found on a profile page.

        The page is scrolled to the bottom repeatedly; scrolling stops once
        enough links were found or the page height no longer changes.

        Parameters
        ----------
        username : str
            Instagram account name.
        number_of_posts : int
            Maximum number of links to return.
        wait_for_scroll : float
            Seconds to sleep after each scroll.

        Returns
        -------
        list of str
            Unique URLs containing ``/p/`` in the order they were discovered.
        '''
        self.browser.get('https://www.instagram.com/' + username + '/?hl=en')
        links = []
        last_page_length = 0
        # The script must *return* the height: scrollTo() yields nothing, so
        # without the return statement execute_script always gave None and
        # the loop ended after its second pass regardless of the page.
        scroll_script = ("window.scrollTo(0, document.body.scrollHeight);"
                         " return document.body.scrollHeight;")
        while len(links) < number_of_posts:
            page_length = self.browser.execute_script(scroll_script)
            time.sleep(wait_for_scroll)
            if page_length == last_page_length:
                break
            last_page_length = page_length
            try:
                data = bs(self.browser.page_source, 'html.parser')
                anchors = data.find('body').find('span').find_all('a')
            except Exception:
                # Best effort: if the expected body/span structure is absent,
                # further scrolling will not help, so keep what we have.
                break
            for anchor in anchors:
                href = anchor.get('href')
                if not href:
                    # Anchors without an href cannot be post links.
                    continue
                link = 'https://www.instagram.com' + href
                if '/p/' in link and link not in links:
                    links.append(link)
        # Honour the requested amount (previously always capped at 10).
        return links[:number_of_posts]

    def _fetch_shared_data(self, url, retries, failure_message):
        '''
        Download ``url`` and return its ``window._sharedData`` JSON as a dict.

        The download is attempted up to ``retries + 1`` times. A page that was
        downloaded but cannot be parsed is not requested again.
        ``failure_message`` is printed for every failed attempt. Returns
        ``None`` when no data could be obtained.
        '''
        for _ in range(retries + 1):
            try:
                with urlopen(url) as response:
                    page = response.read()
            except Exception:
                # ``Exception`` (not a bare except) so Ctrl-C still stops
                # a long scraping run.
                print(failure_message)
                continue
            try:
                script = bs(page, 'html.parser').find('body').find('script')
                raw = script.text.strip().replace('window._sharedData =', '', 1)
                # Drop only the statement terminator; removing every ';'
                # would also delete semicolons inside captions/biographies.
                return json.loads(raw.strip().rstrip(';'))
            except Exception:
                print(failure_message)
                return None
        return None

    def get_data(self, influencer, retries=5):
        '''
        Collect captions of the user's posts together with the biography.

        Parameters
        ----------
        influencer : str
            Instagram account name.
        retries : int
            Extra download attempts per page after the first one fails.

        Returns
        -------
        dict
            ``{influencer: {"username": ..., "biography": ..., "captions": [...]}}``
        '''
        failure_message = "Oops. Something went wrong, trying again."
        captions = []
        for post_link in self.get_posts_from_user(influencer):
            json_data = self._fetch_shared_data(post_link, retries, failure_message)
            if json_data is None:
                continue
            try:
                if 'PostPage' not in json_data['entry_data']:
                    continue
                post_data = json_data['entry_data']['PostPage'][0]['graphql']["shortcode_media"]
                captions.append(
                    post_data["edge_media_to_caption"]["edges"][0]['node']['text'])
            except (KeyError, IndexError, TypeError):
                # E.g. a post whose caption edge list is empty.
                print(failure_message)
        return {
            influencer: {
                "username": influencer,
                "biography": self.get_user_biography(influencer, retries),
                "captions": captions,
            }
        }

    def get_user_biography(self, username, retries=5):
        '''
        Return the biography text of ``username``.

        An empty string is returned when the profile page cannot be
        downloaded or does not have the expected structure.
        '''
        failure_message = "Oops. Something went wrong, trying again.."
        profile_link = 'https://www.instagram.com/' + username + '/?hl=en'
        json_data = self._fetch_shared_data(profile_link, retries, failure_message)
        if json_data is None:
            return ''
        try:
            return json_data['entry_data']['ProfilePage'][0]['graphql']['user']['biography']
        except (KeyError, IndexError, TypeError):
            print(failure_message)
            return ''
        
        

In [7]:
def write_pickle(obj, filename):
    """Pickle ``obj`` into the file named ``filename`` plus the ".pkl" suffix."""
    target_path = filename + ".pkl"
    with open(target_path, mode="wb") as sink:
        pickle.dump(obj, sink)

In [8]:
def load_influencer_list(path):
    """
    Read influencers' usernames from a text file.

    Returns the file content split into lines (without line terminators).
    """
    with open(path, 'r') as source:
        contents = source.read()
    return contents.splitlines()

# Visit every username in alphabetical order and merge the per-user results
# into one dictionary keyed by username.
scraper = InstagramScraper()
# Alternative source: load_influencer_list("influencers_usernames.txt")
influencers = sorted(usernames)
n_influencers = len(influencers)
all_data = {}
for i, influencer in enumerate(influencers):
    print("{}/{}: Getting {}'s posts...".format(i + 1, n_influencers, influencer))
    user_record = scraper.get_data(influencer)
    all_data.update(user_record)
    

1/959: Getting 100daysofadulting's posts...
2/959: Getting 1stopmom's posts...
3/959: Getting 420weedempire's posts...
Oops. Something went wrong, trying again.
Oops. Something went wrong, trying again.
Oops. Something went wrong, trying again.
Oops. Something went wrong, trying again.
Oops. Something went wrong, trying again.
Oops. Something went wrong, trying again.
Oops. Something went wrong, trying again.
Oops. Something went wrong, trying again.
4/959: Getting 4realrico's posts...
5/959: Getting 50_nerdsofgrey's posts...
Oops. Something went wrong, trying again.
Oops. Something went wrong, trying again.
Oops. Something went wrong, trying again.
Oops. Something went wrong, trying again.
Oops. Something went wrong, trying again.
Oops. Something went wrong, trying again.
Oops. Something went wrong, trying again.
Oops. Something went wrong, trying again.
6/959: Getting 52dozendonuts's posts...
7/959: Getting 730sagestreet's posts...
8/959: Getting 88nae88's posts...
9/959: Getting _an

Oops. Something went wrong, trying again.
Oops. Something went wrong, trying again.
Oops. Something went wrong, trying again.
Oops. Something went wrong, trying again.
118/959: Getting blushingnoir's posts...
119/959: Getting bodybyasia's posts...
120/959: Getting bohemiantrails's posts...
121/959: Getting bonnierzm's posts...
122/959: Getting boopmynose's posts...
123/959: Getting bow_chicka_meow's posts...
124/959: Getting bradenwilfong's posts...
125/959: Getting brdypps's posts...
Oops. Something went wrong, trying again..
Oops. Something went wrong, trying again..
Oops. Something went wrong, trying again..
Oops. Something went wrong, trying again..
Oops. Something went wrong, trying again..
Oops. Something went wrong, trying again..
126/959: Getting breezewaybakery's posts...
127/959: Getting brendarcolon's posts...
128/959: Getting brettbmartin's posts...
129/959: Getting brettwestgrovemusic's posts...
130/959: Getting brexmarieee's posts...
Oops. Something went wrong, trying aga

Oops. Something went wrong, trying again..
237/959: Getting deweythepersian's posts...
238/959: Getting diabetesstrong_ig's posts...
239/959: Getting diamondkesawn's posts...
240/959: Getting dianafontanez's posts...
241/959: Getting dianamoseni's posts...
242/959: Getting dietitiandeanna's posts...
243/959: Getting dishourtown's posts...
244/959: Getting djwrex's posts...
245/959: Getting doddblog's posts...
246/959: Getting donnadadondada's posts...
247/959: Getting dr.chapman.dc's posts...
248/959: Getting drewdoyon's posts...
249/959: Getting drmicheleross's posts...
250/959: Getting dslrnation's posts...
251/959: Getting duncjohnson's posts...
252/959: Getting dylanobrienigofficial's posts...
Oops. Something went wrong, trying again..
Oops. Something went wrong, trying again..
Oops. Something went wrong, trying again..
Oops. Something went wrong, trying again..
Oops. Something went wrong, trying again..
Oops. Something went wrong, trying again..
253/959: Getting eastcoastfeastcoas

369/959: Getting homeinhighheels's posts...
Oops. Something went wrong, trying again.
370/959: Getting housexterior's posts...
371/959: Getting hreyes21's posts...
372/959: Getting hungrytwins's posts...
373/959: Getting hussycats's posts...
374/959: Getting hybriddoug's posts...
375/959: Getting hyper_n_helios's posts...
376/959: Getting iamcarlabuggs's posts...
377/959: Getting iamcls's posts...
378/959: Getting iamissen's posts...
379/959: Getting iamlamarwatkins's posts...
380/959: Getting iamprincehenry's posts...
381/959: Getting iamromarius's posts...
382/959: Getting iamsandygarza's posts...
383/959: Getting iamthegreatwent's posts...
384/959: Getting icancookthat's posts...
385/959: Getting iceanddustpottery's posts...
386/959: Getting icegirlash's posts...
387/959: Getting igbeautiesdaily's posts...
388/959: Getting impeccable_blah's posts...
389/959: Getting imsarahconley's posts...
390/959: Getting indrastra's posts...
391/959: Getting indulgenteats's posts...
392/959: Gett

Oops. Something went wrong, trying again..
514/959: Getting lady_dani_cat's posts...
Oops. Something went wrong, trying again..
515/959: Getting ladygreazer's posts...
Oops. Something went wrong, trying again..
516/959: Getting landiduh's posts...
Oops. Something went wrong, trying again..
517/959: Getting lastingredient's posts...
Oops. Something went wrong, trying again..
518/959: Getting laura.browning.art's posts...
Oops. Something went wrong, trying again..
519/959: Getting laurenabraham's posts...
Oops. Something went wrong, trying again..
520/959: Getting laurenmarek's posts...
Oops. Something went wrong, trying again..
521/959: Getting lauryncakes's posts...
Oops. Something went wrong, trying again..
522/959: Getting lean.clean.christine's posts...
Oops. Something went wrong, trying again..
523/959: Getting lefthandedlenya's posts...
Oops. Something went wrong, trying again..
524/959: Getting leo_n_friends's posts...
Oops. Something went wrong, trying again..
525/959: Getting l

Oops. Something went wrong, trying again..
611/959: Getting missmisschelle's posts...
Oops. Something went wrong, trying again..
612/959: Getting missnikkileigh's posts...
Oops. Something went wrong, trying again..
613/959: Getting missybriggs_'s posts...
Oops. Something went wrong, trying again..
614/959: Getting moew_moew_cats's posts...
Oops. Something went wrong, trying again..
615/959: Getting momfoodie's posts...
Oops. Something went wrong, trying again..
616/959: Getting mommaofthreecubs's posts...
Oops. Something went wrong, trying again..
617/959: Getting mommygonehealthy's posts...
Oops. Something went wrong, trying again..
618/959: Getting mommyjenna's posts...
Oops. Something went wrong, trying again..
619/959: Getting momskoop's posts...
Oops. Something went wrong, trying again..
620/959: Getting monchatdore's posts...
Oops. Something went wrong, trying again..
621/959: Getting mongol.wanderess's posts...
Oops. Something went wrong, trying again..
622/959: Getting monica_e

Oops. Something went wrong, trying again..
708/959: Getting popcircumstance's posts...
Oops. Something went wrong, trying again..
709/959: Getting predupre's posts...
Oops. Something went wrong, trying again..
710/959: Getting priscillacornwell's posts...
Oops. Something went wrong, trying again..
711/959: Getting prissy_pig's posts...
Oops. Something went wrong, trying again..
712/959: Getting professionaltraveler's posts...
Oops. Something went wrong, trying again..
713/959: Getting quilly.quinnstagram's posts...
Oops. Something went wrong, trying again..
714/959: Getting quoththeraivyn's posts...
Oops. Something went wrong, trying again..
715/959: Getting quoty_love's posts...
Oops. Something went wrong, trying again..
716/959: Getting rachellenyc's posts...
Oops. Something went wrong, trying again..
717/959: Getting racheltravels's posts...
Oops. Something went wrong, trying again..
718/959: Getting raepublic's posts...
Oops. Something went wrong, trying again..
719/959: Getting ra

Oops. Something went wrong, trying again..
805/959: Getting stephvanburk's posts...
Oops. Something went wrong, trying again..
806/959: Getting stormcalysta's posts...
Oops. Something went wrong, trying again..
807/959: Getting stupiddope's posts...
Oops. Something went wrong, trying again..
808/959: Getting styledevotee's posts...
Oops. Something went wrong, trying again..
809/959: Getting suetrannn's posts...
Oops. Something went wrong, trying again..
810/959: Getting sugarfreeinnyc's posts...
Oops. Something went wrong, trying again..
811/959: Getting sugarmilkk's posts...
Oops. Something went wrong, trying again..
812/959: Getting sunglasscat's posts...
Oops. Something went wrong, trying again..
813/959: Getting sunnyandjudy's posts...
Oops. Something went wrong, trying again..
814/959: Getting sunshineflgirl's posts...
Oops. Something went wrong, trying again..
815/959: Getting susvasquez's posts...
Oops. Something went wrong, trying again..
816/959: Getting suzipratt's posts...
O

Oops. Something went wrong, trying again..
903/959: Getting travthroughlife's posts...
Oops. Something went wrong, trying again..
904/959: Getting trendmeup's posts...
Oops. Something went wrong, trying again..
905/959: Getting treynkennedy's posts...
Oops. Something went wrong, trying again..
906/959: Getting tru_def's posts...
Oops. Something went wrong, trying again..
907/959: Getting trulyblazey's posts...
Oops. Something went wrong, trying again..
908/959: Getting tuxedotrio's posts...
Oops. Something went wrong, trying again..
909/959: Getting two.himalayan.cats's posts...
Oops. Something went wrong, trying again..
910/959: Getting twoadorablelabs's posts...
Oops. Something went wrong, trying again..
911/959: Getting twoiggiesandakitty's posts...
Oops. Something went wrong, trying again..
912/959: Getting tynathanclark's posts...
Oops. Something went wrong, trying again..
913/959: Getting uknojb's posts...
Oops. Something went wrong, trying again..
914/959: Getting ula.rocks's po

In [9]:
# One row per scraped user; the 'username' column becomes the index.
df = (
    pd.DataFrame
    .from_dict(all_data, orient='index')
    .set_index('username')
)
write_pickle(df, 'influencers_data')