In [5]:
import json
import os

import requests

<u>Query User ID:</u> 17841458726840626
```ts
GET {USER-ID}?fields=business_discovery.username(asasjostromphotography){
    name,
    ...
    media{
        media_url,
        ...
    }
}
```



<div>
    <u><strong>In order to get USER-ID:</strong></u>
    <ol>
        <li>GET me/accounts</li>
        <li>Click on the page ID</li>
        <li>Query for instagram_business_account</li>
        <li>That will give you the USER-ID</li>
    </ol>
</div>

<div>
    <u><strong>Notes:</strong></u>
    <ul>
        <li>By default, the API will return 25 media objects. You can use the "limit" parameter to get more media objects.</li>
        <li>To get the next page of media objects, use the "after" parameter.</li>
        <li>The value to pass as "after" is the cursor returned under "paging" → "cursors" → "after" in the previous response.</li>
    </ul>
</div>


In [14]:
# Base URL of the Facebook Graph API; the API version is pinned in the path.
HOST = "https://graph.facebook.com/v16.0/"
# Graph API access token. It is read from the FB_GRAPH_ACCESS_TOKEN environment
# variable so the secret is never stored in (or committed with) the notebook.
# NOTE(review): a token was previously hard-coded here; treat it as leaked and
# revoke it.
TOKEN = os.environ.get("FB_GRAPH_ACCESS_TOKEN", "")
if not TOKEN:
    print("WARNING: FB_GRAPH_ACCESS_TOKEN is not set; Graph API requests will be rejected.")
# Instagram account ID used as the root node of every query in this notebook.
IG_ACCOUNT_ID = "17841458726840626"

In [3]:
def read_dataset(filename="dataset.json"):
    """Load and return the JSON document stored in ``filename``.

    The file is decoded explicitly as UTF-8 (the standard encoding for JSON)
    instead of the platform default, so non-ASCII text such as emoji is read
    correctly regardless of the locale.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The decoded JSON value (whatever ``json.load`` produces for the file).

    Raises:
        FileNotFoundError: If ``filename`` does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    with open(filename, "r", encoding="utf-8") as f:
        return json.load(f)


def save(dataset, filename="updated_dataset.json"):
    """Serialize ``dataset`` as JSON (4-space indent) into ``filename``.

    The target file is opened in write mode, so any existing content is
    replaced.
    """
    with open(filename, mode="w") as out_file:
        json.dump(obj=dataset, fp=out_file, indent=4)

In [12]:
def extract_hashtags(caption):
    """Return the hashtags found in ``caption``, without the leading ``#``.

    The caption is split on whitespace; every token that starts with ``#``
    and is longer than one character contributes everything after its first
    character. Order and duplicates are preserved.
    """
    return [
        token[1:]
        for token in caption.split()
        if len(token) > 1 and token.startswith("#")
    ]


def get_instagram_user_details(username, limit=200, timeout=30):
    """Query the Graph API ``business_discovery`` field for ``username``.

    The request is sent to ``{HOST}{IG_ACCOUNT_ID}`` and authenticated with
    ``TOKEN`` (module-level constants). Profile fields plus a page of media
    objects are requested.

    Args:
        username: Instagram username to look up.
        limit: Value passed to ``media.limit(...)`` in the field expression.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The decoded JSON response body. The caller in this notebook expects
        either a ``"business_discovery"`` key or an ``"error"`` key.

    Raises:
        requests.exceptions.RequestException: On connection problems or when
            ``timeout`` elapses.
        ValueError: If the response body is not valid JSON.
    """
    # The previous implementation stripped spaces and newlines from the whole
    # URL, which also removed them from the username; keep that tolerance.
    clean_username = f"{username}".replace("\n", "").replace(" ", "")

    media_fields = ",".join((
        "media_type",
        "media_url",
        "timestamp",
        "caption",
        "comments_count",
        "like_count",
    ))
    profile_fields = ",".join((
        "name",
        "biography",
        "followers_count",
        "follows_count",
        "media_count",
        "profile_picture_url",
        f"media.limit({limit}){{{media_fields}}}",
    ))
    fields = f"business_discovery.username({clean_username}){{{profile_fields}}}"

    # Let requests build and percent-encode the query string instead of
    # interpolating the field expression and the token into the URL by hand.
    response = requests.get(
        f"{HOST}{IG_ACCOUNT_ID}",
        params={"fields": fields, "access_token": TOKEN},
        timeout=timeout,
    )
    return response.json()


In [15]:
# Example call: request the "nike" profile with media.limit(200); the cell
# output is the JSON dictionary returned by the helper.
get_instagram_user_details(username="nike", limit=200)

{'business_discovery': {'name': 'Nike',
  'biography': 'Spotlighting athlete* and\xa0👟\xa0stories\n#BlackLivesMatter and #StopAsianHate',
  'followers_count': 295158500,
  'follows_count': 151,
  'media_count': 1134,
  'profile_picture_url': 'https://scontent.ftun16-1.fna.fbcdn.net/v/t51.2885-15/285265415_157543166760141_7125906423211419857_n.jpg?_nc_cat=1&ccb=1-7&_nc_sid=86c713&_nc_ohc=XVAjDvQE66AAX-9x9aK&_nc_ht=scontent.ftun16-1.fna&edm=AL-3X8kEAAAA&oh=00_AfBQdKN0pOlWFnti9oPSQR-tK098OTzFfb7F6q73KIlQzw&oe=647E5693',
  'media': {'data': [{'media_type': 'CAROUSEL_ALBUM',
     'media_url': 'https://scontent.cdninstagram.com/v/t51.29350-15/350442700_635812141751378_1531557628601014694_n.jpg?_nc_cat=1&ccb=1-7&_nc_sid=8ae9d6&_nc_ohc=y71D5Otv7mwAX-huCG8&_nc_ht=scontent.cdninstagram.com&edm=AL-3X8kEAAAA&oh=00_AfAGd20I_t1cTnXUUUtOqOaDfcswq09m51ZlH3JCwwJL1w&oe=647E1EFE',
     'timestamp': '2023-06-01T16:01:55+0000',
     'caption': 'The 2023 Be True Collection is here. Made in collaboration wit

In [5]:
def update_dataset(old_dataset, fail_limit=25, start_username=None, skip_users=()):
    """Fetch fresh Instagram details for the accounts listed in ``old_dataset``.

    For every processed account the ``business_discovery`` payload returned by
    ``get_instagram_user_details`` is reshaped: the ``media`` list is replaced
    by a ``posts`` dict keyed by post id (each post gets a ``hashtags`` list),
    and ``interests`` / ``business_category_name`` are copied over from the
    old record.

    Args:
        old_dataset: Mapping of username -> record. Each record must contain
            the keys ``"interests"`` and ``"business_category_name"``.
        fail_limit: Stop once this many accounts have failed in a row.
        start_username: Optional resume marker. Accounts are skipped up to and
            including this username; processing starts with the next one.
        skip_users: Iterable of usernames that must not be fetched.

    Returns:
        A tuple ``(new_dataset, errored_users)``: the dict of successfully
        refreshed accounts and the list of usernames that failed.
    """
    skip_users = set(skip_users or ())
    errored_users = []
    new_dataset = {}
    fail_counter = 0
    should_start = not start_username
    for username, value in old_dataset.items():
        if not should_start:
            # The marker itself is skipped as well: it is treated as the last
            # account that was already handled.
            if username == start_username:
                should_start = True
            continue
        if username in skip_users:
            continue
        interests = value["interests"]
        business_category_name = value["business_category_name"]

        # Keep the raw response separately so the error handler can still
        # inspect it after ``details`` has been narrowed to the payload.
        response = None
        try:
            # Fetch inside the try block so that one network failure is
            # recorded as an error instead of aborting the whole run.
            response = get_instagram_user_details(username)
            details = response["business_discovery"]
            details["business_category_name"] = business_category_name
            details["interests"] = interests
            details["posts"] = {}
            for post in details["media"]["data"]:
                # A missing (or null) caption simply yields no hashtags.
                post["hashtags"] = extract_hashtags(post.get("caption") or "")
                post_id = post.pop("id")
                details["posts"][post_id] = post
            del details["media"]
            new_dataset[username] = details
            print(
                f"Updated {username} and found {len(details['posts'])} posts")
            fail_counter = 0
        except Exception as e:
            errored_users.append(username)
            # Only some failures carry an API error message; never assume it
            # exists, otherwise the handler itself raises and the run is lost.
            api_message = None
            if isinstance(response, dict) and isinstance(response.get("error"), dict):
                api_message = response["error"].get("message")
            if api_message:
                print(f"Error for {username}: {e}, {api_message}")
            else:
                print(f"Error for {username}: {e!r}")
            fail_counter += 1
            if fail_counter == fail_limit:
                print(f"Failed to fetch {fail_counter} users, stopping...")
                break
            continue
    return new_dataset, errored_users


In [None]:
# Source dataset: the accounts to (re)fetch.
old_dataset = read_dataset("dataset.json")
# Results already saved to updated_dataset.json; used to find where that file ends.
updated_dataset = read_dataset("updated_dataset.json")
# Last username stored in the results file. An empty results file has no last
# entry, so fall back to None instead of raising IndexError.
last_username = list(updated_dataset)[-1] if updated_dataset else None
print(f"Last username: {last_username}")
# Usernames already recorded as failures in errored_users.json.
old_errored_users = read_dataset("errored_users.json")
print(f'Number of instances in new dataset: {len(updated_dataset)}')

In [None]:
# Accounts that are already recorded as failed or already present in the
# updated dataset are excluded from this run.
skip_users = list(old_errored_users) + list(updated_dataset.keys())
new_dataset, errored_users = update_dataset(
    old_dataset=old_dataset,
    skip_users=skip_users,
)
print(
    f'Fetch completed with {len(new_dataset)} successful '
    f'accounts and {len(errored_users)} errors.'
)


In [17]:
# Combine the previously loaded results with the ones fetched in this run
# (entries from this run win on duplicate usernames) and write them to disk.
new_dataset = {
    **updated_dataset,
    **new_dataset,
}
save(new_dataset, filename="updated_dataset.json")

In [18]:
# Merge earlier failures with the failures of this run, dropping duplicates.
# dict.fromkeys keeps first-seen order, so errored_users.json is written in a
# stable order (a plain set would produce an arbitrary, run-dependent order).
errored_users = list(dict.fromkeys(old_errored_users + errored_users))
save(errored_users, filename="errored_users.json")