In [0]:
import requests
import pandas as pd
import re
from datetime import datetime 
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, MapType
# --- Instagram Graph API settings ---

# The hashtag you want to search for (without the '#' symbol)
HASHTAG = 'macav'  # Replace with your desired hashtag (without the '#')
# Instagram Business Account user ID. The long-lived access token is not
# hardcoded; it is read from the Databricks secret scope "Scope1".
INSTAGRAM_USER_ID = '17841472712121977'
ACCESS_TOKEN = dbutils.secrets.get(scope="Scope1", key="InstagramAccessToken")

# --- Azure Storage settings ---
# Storage account / container that hold the data lake, plus the service
# principal (client ID, client secret, tenant ID) used to authenticate to it.
# The client secret is read from the secret scope; the IDs are plain identifiers.
storage_account_name = "macavstorage"
container_name = "datalake"
servicePrincipalID = "0b27e0ea-e184-49cd-b921-b3519cb03f7f"
blobsecret = dbutils.secrets.get(scope="Scope1", key="blobsecret1")
tenantID = "60feac79-e042-4ce8-8759-dca313146110"

# --- One-time setup: Key Vault-backed secret scope ---
# The secret scope used above must exist before this notebook runs.
# Create it at:
# https://adb-4383697834848777.17.azuredatabricks.net/#secrets/createScope
# with the following values:
#   Scope Name  = Scope1
#   DNS Name    = "https://twitchkv.vault.azure.net/" (Vault URI)
#   Resource ID = "/subscriptions/972ad05f-b62e-48ab-a9fa-a17fd4dc6640/resourceGroups/twitchData/providers/Microsoft.KeyVault/vaults/twitchkv" (Key Vault resource ID)

# Initialize (or attach to) the Spark session with Delta Lake support.
# The Delta SQL extension class is `io.delta.sql.DeltaSparkSessionExtension`;
# the previous value ("delta.sql.DeltaSparkSessionExtensions") is not a real
# class name, so Delta's SQL extensions could not be loaded when this builder
# creates a fresh session. If a session already exists, getOrCreate() returns it.
spark = (
    SparkSession.builder
    .appName("DeltaLakeAzureStorage")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()
)

# Authenticate the service principal against the storage account (OAuth
# client-credentials flow). Each entry is a (Spark conf key, value) pair; they
# are applied in the order listed.
oauth_settings = [
    (f"fs.azure.account.auth.type.{storage_account_name}.dfs.core.windows.net", "OAuth"),
    (
        f"fs.azure.account.oauth.provider.type.{storage_account_name}.dfs.core.windows.net",
        "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    ),
    (f"fs.azure.account.oauth2.client.id.{storage_account_name}.dfs.core.windows.net", servicePrincipalID),
    (f"fs.azure.account.oauth2.client.secret.{storage_account_name}.dfs.core.windows.net", blobsecret),
    (
        f"fs.azure.account.oauth2.client.endpoint.{storage_account_name}.dfs.core.windows.net",
        f"https://login.microsoftonline.com/{tenantID}/oauth2/token",
    ),
]
for setting_key, setting_value in oauth_settings:
    spark.conf.set(setting_key, setting_value)

In [0]:
def extract_hashtags(caption):
    """Split a caption into its plain text and its hashtags.

    Args:
        caption: The caption string to process.

    Returns:
        A ``(text, tags)`` tuple. ``text`` is the caption with every
        ``#word`` token removed and leading/trailing whitespace stripped
        (whitespace left in the middle by a removed hashtag is not collapsed).
        ``tags`` is the hashtags, each including its ``#``, joined by single
        spaces in order of appearance; it is an empty string when there are none.
    """
    hashtag_pattern = re.compile(r'#\w+')
    found_tags = hashtag_pattern.findall(caption)
    stripped_text = hashtag_pattern.sub('', caption).strip()
    return stripped_text, ' '.join(found_tags)

In [0]:
# Look up the Graph API hashtag ID for HASHTAG.
# The ID is interpolated into the recent-media URL in the next cell, so this
# cell fails loudly instead of letting a missing ID turn into ".../None/...".

url = "https://graph.facebook.com/v18.0/ig_hashtag_search"
params = {
    "user_id": INSTAGRAM_USER_ID,
    "q": HASHTAG,  # Search for the hashtag
    "access_token": ACCESS_TOKEN
}

# requests applies no timeout by default; set one so the cell cannot hang forever.
response = requests.get(url, params=params, timeout=30)

# Report failures using the status code and response body only. The request URL
# is deliberately left out of the message because it carries the access token.
if response.status_code != 200:
    raise RuntimeError(f"Hashtag search failed: {response.status_code} - {response.text}")

# Parse the response JSON to get the hashtag ID
hashtag_data = response.json()

# Extract the hashtag ID from the first result; "data" may be missing or empty.
hashtag_matches = hashtag_data.get("data") or []
hashtag_id = hashtag_matches[0].get("id") if hashtag_matches else None
if hashtag_id is None:
    raise ValueError(f"No hashtag ID returned for '{HASHTAG}': {hashtag_data}")

print("Hashtag ID:", hashtag_id)

In [0]:

# Fetch recent media for the hashtag ID and build `media_df`, one row per
# IMAGE post, with the caption split into text and hashtags and the like count
# looked up per post.
media_url = f"https://graph.facebook.com/v18.0/{hashtag_id}/recent_media"
params = {
    "user_id": INSTAGRAM_USER_ID,
    "fields": "id,caption,media_type,media_url,permalink,timestamp",  # Removed owner field
    "access_token": ACCESS_TOKEN
}

# requests applies no timeout by default; set one so the cell cannot hang forever.
response = requests.get(media_url, params=params, timeout=30)
print(response)

# Fail fast on an unsuccessful response. Merely printing the error would leave
# `media_df` unassigned (or stale from an earlier run) for the cells below.
if response.status_code != 200:
    raise RuntimeError(f"Error: {response.status_code} - {response.text}")

media_data = response.json()
media_list = []

for media in media_data.get("data", []):
    # Only image posts are collected. Everything below must run for image posts
    # only: previously the like-count lookup and the append ran for every item,
    # so a non-image item either raised NameError (no `post_info` yet) or
    # re-appended and overwrote the previous image's record.
    if media["media_type"] != "IMAGE":
        continue

    caption = media.get("caption", "No caption")
    cleaned_caption, hashtags = extract_hashtags(caption)  # Split caption and hashtags

    post_info = {
        "id": media["id"],
        "caption": cleaned_caption,  # Caption without hashtags
        "hashtags": hashtags,  # Extracted hashtags
        "media_type": media["media_type"],
        "media_url": media.get("media_url", "No URL"),
        "permalink": media["permalink"],
        "timestamp": media["timestamp"],
        "like_count": 0  # Default, kept if the like-count request fails
    }

    # Fetch the like count for this media ID.
    # NOTE(review): the hashtag recent_media edge may be able to return
    # like_count directly via `fields`, which would avoid one request per
    # post — confirm against the Graph API reference before changing.
    media_id = media['id']
    like_count_url = f"https://graph.facebook.com/v18.0/{media_id}"
    like_count_params = {
        "fields": "like_count",
        "access_token": ACCESS_TOKEN
    }
    like_response = requests.get(like_count_url, params=like_count_params, timeout=30)

    if like_response.status_code == 200:
        like_data = like_response.json()
        post_info['like_count'] = like_data.get('like_count', 0)
    else:
        # Best effort: keep the post with like_count 0 and report the failure.
        print(f"Error fetching like count for Media ID: {media_id}, Status Code: {like_response.status_code}")

    media_list.append(post_info)

# Stamp every row with the time the API queries finished (naive UTC, ISO 8601).
api_completed_timestamp = datetime.utcnow().isoformat()

# Create a pandas DataFrame from the media list
media_df = pd.DataFrame(media_list)
media_df["latest_query_completed_timestamp"] = api_completed_timestamp

print(media_df)


In [0]:
# Convert the collected posts to a Spark DataFrame for the Delta write below.
# An empty `media_df` has none of the post columns, and the write uses
# mode("overwrite"), so stop here rather than pass an empty frame on to
# replace the stored table.
if media_df.empty:
    raise ValueError("No image posts were collected; refusing to build an empty Spark DataFrame.")

spark_df = spark.createDataFrame(media_df)

In [0]:
# Persist the posts as a Delta table in the data lake, replacing whatever the
# target path currently holds.
delta_target_path = f"abfss://{container_name}@{storage_account_name}.dfs.core.windows.net/datalake/instagram_data"
(
    spark_df.write
    .format("delta")
    .mode("overwrite")
    .save(delta_target_path)
)