In [None]:
from random import choice
import json
import re
import os
import requests
import time
import urllib
from bs4 import BeautifulSoup
from selenium import webdriver
import datetime
 
# Fallback pool of User-Agent strings, used when a scraper is created without
# its own list (see InstagramScraper.__random_agent).
_user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/65.0.3325.181 Safari/537.36',
]
 
def get_nested(data, *args):
    """Safely look up a value nested inside dictionaries.

    Walks ``data`` one key at a time: ``get_nested(d, 'a', 'b')`` is the
    safe equivalent of ``d['a']['b']``.

    Args:
        data: The outermost mapping (anything exposing ``.get``).
        *args: The keys to follow, outermost first.

    Returns:
        The value found at the end of the key path, or ``None`` when no keys
        are given, a key is falsy, a key is missing, or any container on the
        way down is empty or is not a mapping (e.g. a list or a string).
    """
    if not args:
        return None

    current = data
    for key in args:
        # An empty container or a falsy key ends the lookup.
        if not current or not key:
            return None
        getter = getattr(current, 'get', None)
        if getter is None:
            # Intermediate value is not a mapping; previously this raised
            # AttributeError instead of reporting "not found".
            return None
        current = getter(key)
    return current
     
 
class InstagramScraper:
    """Scrape profile metrics and recent posts from an Instagram profile page.

    The page embeds its data as a ``window._sharedData = {...};`` script; the
    scraper downloads the page, parses that JSON and flattens it into plain
    dictionaries. Images are saved under ``InstagramProject/InstagramImages/``
    and the combined result under ``InstagramProject/UserData/``.
    """

    def __init__(self, user_agents=None, proxy=None, timeout=30):
        """Create a scraper.

        Args:
            user_agents: Optional list of User-Agent strings to pick from;
                the module-level ``_user_agents`` list is used otherwise.
            proxy: Optional proxy URL used for both http and https requests.
            timeout: Seconds to wait for Instagram before giving up, so a
                stalled connection cannot hang the notebook indefinitely.
        """
        self.user_agents = user_agents
        self.proxy = proxy
        self.timeout = timeout

    def __random_agent(self):
        """Return a random User-Agent from the configured or default pool."""
        if self.user_agents and isinstance(self.user_agents, list):
            return choice(self.user_agents)
        return choice(_user_agents)

    def __request_url(self, url):
        """Download ``url`` and return the response body as text.

        Raises:
            requests.HTTPError: On a non-2xx status; the failed response is
                attached and the original error is chained as the cause.
            requests.RequestException: On any other transport failure
                (propagated unchanged so its details are preserved).
        """
        headers = {'User-Agent': self.__random_agent()}
        proxies = {'http': self.proxy, 'https': self.proxy}
        try:
            response = requests.get(url, headers=headers, proxies=proxies,
                                    timeout=self.timeout)
            response.raise_for_status()
        except requests.HTTPError as error:
            raise requests.HTTPError('Received non 200 status code from Instagram',
                                     response=error.response) from error
        return response.text

    @staticmethod
    def _parse_shared_data(script_text):
        """Parse the text of a ``window._sharedData = {...};`` script as JSON.

        Only the assignment prefix and the trailing ``;`` are removed.
        Semicolons inside the payload (captions, biographies, URLs) are data
        and must be kept.
        """
        prefix = 'window._sharedData ='
        raw_string = script_text.strip()
        if raw_string.startswith(prefix):
            raw_string = raw_string[len(prefix):]
        raw_string = raw_string.strip().rstrip(';')
        return json.loads(raw_string)

    @staticmethod
    def extract_json_data(html):
        """Extract the embedded shared-data JSON from a profile page.

        Args:
            html: The HTML source of the page.

        Returns:
            The decoded JSON object.

        Raises:
            ValueError: If the page body contains no ``<script>`` tag or the
                script content is not valid JSON.
        """
        soup = BeautifulSoup(html, 'html.parser')
        body = soup.find('body')
        scripts = body.find_all('script') if body is not None else []
        if not scripts:
            raise ValueError('No <script> tag found in the page body')
        # Prefer the script that actually holds the shared data; fall back to
        # the first script in the body, which is what was always used before.
        script_tag = next(
            (tag for tag in scripts
             if tag.text.strip().startswith('window._sharedData =')),
            scripts[0])
        return InstagramScraper._parse_shared_data(script_tag.text)

    @staticmethod
    def _image_dir(profile_url):
        """Return the image directory for a profile, creating it if needed."""
        directory = 'InstagramProject/InstagramImages/' + profile_url
        os.makedirs(directory, exist_ok=True)
        return directory

    @staticmethod
    def _download_image(url, destination):
        """Save the image at ``url`` to the file path ``destination``."""
        # A bare `import urllib` does not guarantee the `request` submodule
        # is loaded, so import it explicitly where it is used.
        import urllib.request
        urllib.request.urlretrieve(url, destination)

    @staticmethod
    def _flatten_post(node):
        """Flatten one post node into a dictionary of simple values."""
        count_keys = ('edge_media_to_comment', 'edge_liked_by', 'edge_media_preview_like')
        results = {}
        for key, value in node.items():
            if not (value and isinstance(value, dict)):
                results[key] = value
            elif key == 'edge_media_to_caption':
                # Posts without a caption have an empty edge list; store an
                # empty string rather than failing on edges[0].
                edges = get_nested(value, "edges")
                results[key] = (get_nested(edges[0], "node", "text") or '') if edges else ''
            elif key in count_keys:
                results[key] = value['count']
            elif key == 'location':
                results[key] = value['name']
            else:
                results[key] = value
        return results

    def profile_page_metrics(self, profile_url):
        """Fetch the profile-level fields of a user and save the profile picture.

        Args:
            profile_url: The profile name appended to https://www.instagram.com/.

        Returns:
            Dict of the user's fields; non-empty dict values that carry a
            ``count`` are replaced by that count, everything else is copied.
        """
        ig_url = 'https://www.instagram.com/' + profile_url
        response = self.__request_url(ig_url)
        json_data = self.extract_json_data(response)
        metrics = json_data['entry_data']['ProfilePage'][0]['graphql']['user']

        results = {}
        for key, value in metrics.items():
            if value and isinstance(value, dict):
                # Dicts without a 'count' entry are kept whole instead of
                # raising KeyError.
                results[key] = value.get('count', value)
            else:
                results[key] = value

        # The directory is created only after the page was fetched, so a
        # failed lookup leaves no empty folder behind.
        image_dir = self._image_dir(profile_url)
        picture_url = results.get('profile_pic_url')
        if picture_url:
            self._download_image(picture_url, image_dir + '/profile.jpg')
        return results

    def profile_page_recent_posts(self, profile_url):
        """Fetch the recent posts shown on a profile page and save their images.

        Images are stored as ``1.jpg``, ``2.jpg``, ... in page order inside
        the profile's image directory.

        Args:
            profile_url: The profile name appended to https://www.instagram.com/.

        Returns:
            List with one flattened dict per post.
        """
        ig_url = 'https://www.instagram.com/' + profile_url
        response = self.__request_url(ig_url)
        json_data = self.extract_json_data(response)
        edges = json_data['entry_data']['ProfilePage'][0]['graphql']['user']['edge_owner_to_timeline_media']["edges"]

        # Create the target directory here as well, so this method works
        # without profile_page_metrics having been called first.
        image_dir = self._image_dir(profile_url)
        big_results = []
        for index, edge in enumerate(edges, start=1):
            node = edge.get('node')
            results = self._flatten_post(node)
            self._download_image(node['display_url'], image_dir + '/%i.jpg' % index)
            big_results.append(results)
        return big_results

    def createProfileData(self, profile_url):
        """Scrape a profile and write it to InstagramProject/UserData/<name>.json.

        Args:
            profile_url: The profile name appended to https://www.instagram.com/.

        Returns:
            The dict that was written: profile fields plus a ``posts`` list.
        """
        pageResults = self.profile_page_metrics(profile_url)
        pageResults['posts'] = self.profile_page_recent_posts(profile_url)
        os.makedirs('InstagramProject/UserData/', exist_ok=True)
        with open('InstagramProject/UserData/' + profile_url + '.json', 'w', encoding="utf-8") as f:
            json.dump(pageResults, f, ensure_ascii=False)
        return pageResults

In [None]:
Path = "InstagramProject/UserData/"
# All per-user JSON files. os.listdir returns names in arbitrary order, so the
# list is sorted to keep the row order of the dataset reproducible between runs.
filelist = sorted(name for name in os.listdir(Path) if name.endswith(".json"))

In [None]:
import pandas as pd
import numpy as np
profileColumns = ['biography', 'business_category_name', 'connected_fb_page', 'country_block', 'edge_felix_video_timeline', 'edge_follow', 'edge_followed_by', 'edge_media_collections','edge_owner_to_timeline_media','edge_saved_media','external_url','external_url_linkshimmed','full_name','has_channel','highlight_reel_count', 'id','is_business_account','is_joined_recently','is_private','is_verified', 'profile_pic_url', 'profile_pic_url_hd', 'username'] 
postColumns = ['accessibility_caption', 'comments_disabled', 'edge_liked_by', 'display_url', 'edge_media_preview_like', 'edge_media_to_caption', 'edge_media_to_comment', 'is_video', 'location', 'taken_at_timestamp', 'hashtags', 'mentions','time_between','number_of_likes/mean', 'number_of_likes/median']   


def _taggedWords(caption, marker):
    """Return the words of ``caption`` that start with ``marker``, marker removed."""
    return [word[1:] for word in caption.split() if word.startswith(marker)]


def _userPostsFrame(data):
    """Build one row per post (profile columns + post columns) for one user.

    Args:
        data: The decoded user JSON: profile fields plus a ``posts`` list.

    Returns:
        A DataFrame with columns ``profileColumns + postColumns``, or ``None``
        when the user has no posts.
    """
    posts = data.get('posts') or []
    if not posts:
        # Nothing to add; previously an empty post list silently re-used the
        # rows of the previously processed user.
        return None

    # Every profile column is always present (missing -> None) so that all
    # rows have the same width.
    profile = dict((column, data.get(column)) for column in profileColumns)

    rows = []
    for rawPost in posts:
        row = dict(profile)
        for column in postColumns:
            value = rawPost.get(column, "None")  # "None" marks an absent field
            if isinstance(value, str):
                value = value.replace('\n', '')
            row[column] = value
        # Tags are taken from the caption *before* newlines are removed;
        # otherwise a tag starting a new line is glued to the previous word.
        rawCaption = rawPost.get('edge_media_to_caption')
        if not isinstance(rawCaption, str):
            rawCaption = ''
        row['hashtags'] = _taggedWords(rawCaption, '#')
        row['mentions'] = _taggedWords(rawCaption, '@')
        rows.append(row)

    frame = pd.DataFrame(rows, columns=profileColumns + postColumns)

    # Gap to the next post in the list; the last post has no successor and
    # gets the user's mean gap.
    timestamps = pd.to_numeric(frame['taken_at_timestamp'], errors='coerce')
    gaps = timestamps - timestamps.shift(-1)
    frame['time_between'] = gaps.fillna(gaps.mean())

    # Likes relative to the user's own mean / median number of likes.
    likes = pd.to_numeric(frame['edge_liked_by'], errors='coerce')
    frame['number_of_likes/mean'] = likes.divide(likes.mean())
    frame['number_of_likes/median'] = likes.divide(likes.median())
    return frame


# Collect one frame per user and concatenate once, instead of growing an
# array row by row.
userFrames = []
for fileName in filelist:
    with open(Path + str(fileName), encoding="utf-8") as json_data:
        data = json.load(json_data) #open file
    userFrame = _userPostsFrame(data)
    if userFrame is not None:
        userFrames.append(userFrame)

if userFrames:
    aggregateData = pd.concat(userFrames, ignore_index=True)
else:
    aggregateData = pd.DataFrame(columns=profileColumns + postColumns)

In [None]:
# Wrap the aggregated rows in a DataFrame labelled with the profile columns
# followed by the post columns.
datasetColumns = profileColumns + postColumns
InstagramDataset = pd.DataFrame(aggregateData, columns=datasetColumns)

In [None]:
def _weekdayName(timestamp):
    """Return the weekday name of a Unix timestamp, in the machine's local time."""
    return datetime.datetime.fromtimestamp(timestamp).strftime('%A')


# Weekday on which each post was taken.
InstagramDataset['day'] = InstagramDataset['taken_at_timestamp'].map(_weekdayName)

In [None]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics
# One indicator column per weekday.
data1_dummy = pd.get_dummies(InstagramDataset["day"])
# Drop indicator columns left over from a previous run of this cell first;
# joining them a second time raises "columns overlap but no suffix specified".
InstagramDataset = InstagramDataset.drop(columns=data1_dummy.columns, errors='ignore').join(data1_dummy)

In [None]:
# Hour (0-23, machine-local time) at which each post was taken.
InstagramDataset['hour_of_day'] = InstagramDataset['taken_at_timestamp'].apply(
    lambda timestamp: datetime.datetime.fromtimestamp(timestamp).hour
)

In [None]:
# Cyclical encoding of the hour: map it onto a 24-hour circle so that
# 23:00 and 00:00 end up close to each other.
hourAngle = InstagramDataset.hour_of_day * (2. * np.pi / 24)
InstagramDataset['hr_sin'] = np.sin(hourAngle)
InstagramDataset['hr_cos'] = np.cos(hourAngle)

In [None]:
# Six 4-hour bins covering the whole day: [0,4), [4,8), ..., [20,24).
# right=False makes the bins left-closed; with pd.cut's default right-closed
# bins, hour 0 falls outside every interval and becomes NaN.
bins = [0, 4, 8, 12, 16, 20, 24]
InstagramDataset['HourBin'] = pd.cut(InstagramDataset['hour_of_day'], bins, right=False)

In [None]:
# Encode each hour bin as an integer code.
label = LabelEncoder()
hourBins = InstagramDataset['HourBin']
InstagramDataset['HourBin_Code'] = label.fit_transform(hourBins)