In [0]:
import pandas as pd
import requests
import time
import json
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, MapType

# Azure Storage settings. The service principal and tenant IDs are plain
# identifiers; the client secret itself is pulled from the Databricks secret
# scope so it never appears in the notebook.
storage_account_name = "macavstorage"
container_name = "datalake"
servicePrincipalID = "0b27e0ea-e184-49cd-b921-b3519cb03f7f"
blobsecret = dbutils.secrets.get(scope="Scope1", key="blobsecret1")
tenantID = "60feac79-e042-4ce8-8759-dca313146110"
# Directory name, relative to the container root, that the tweet JSON is written to.
file = 'x_data'

# How the secret scope used above was created:
# Create secret scope at
# https://adb-4383697834848777.17.azuredatabricks.net/#secrets/createScope
# Scope Name = Scope1
# DNS Name = "https://twitchkv.vault.azure.net/" (Vault URI)
# Resource ID = "/subscriptions/972ad05f-b62e-48ab-a9fa-a17fd4dc6640/resourceGroups/twitchData/providers/Microsoft.KeyVault/vaults/twitchkv" (Keyvault resource ID)

# Initialise (or reuse) the Spark session with the Delta Lake extension and catalog.
# The extension class is `io.delta.sql.DeltaSparkSessionExtension`; the previous
# value ("delta.sql.DeltaSparkSessionExtensions") does not name an existing class.
# NOTE(review): when a session already exists, getOrCreate() returns it and these
# builder options may not be re-applied — confirm on the target cluster.
spark = (
    SparkSession.builder
    .appName("DeltaLakeAzureStorage")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()
)

# Authenticate the service principal against the storage account (OAuth client
# credentials). Every key is scoped to this account's DFS endpoint.
account_host = f"{storage_account_name}.dfs.core.windows.net"
oauth_settings = {
    f"fs.azure.account.auth.type.{account_host}": "OAuth",
    f"fs.azure.account.oauth.provider.type.{account_host}":
        "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    f"fs.azure.account.oauth2.client.id.{account_host}": servicePrincipalID,
    f"fs.azure.account.oauth2.client.secret.{account_host}": blobsecret,
    f"fs.azure.account.oauth2.client.endpoint.{account_host}":
        f"https://login.microsoftonline.com/{tenantID}/oauth2/token",
}
for conf_key, conf_value in oauth_settings.items():
    spark.conf.set(conf_key, conf_value)

In [0]:

# Load the campaigns Delta table from the data lake. The TLS scheme (abfss://)
# is used so this read matches the scheme the JSON write cell uses for the
# same storage account.
campaigns_path = (
    f"abfss://{container_name}@{storage_account_name}.dfs.core.windows.net/datalake/campaigns"
)
campaigns_fetched = spark.read.format("delta").load(campaigns_path)
display(campaigns_fetched)

In [0]:
# Print every hashtag stored on the campaigns table, one per line.
for row in campaigns_fetched.select("hashtags").collect():
    # A campaign row may carry a null `hashtags` value; treat it as "no hashtags"
    # instead of failing with a TypeError on iteration.
    for hashtag in row["hashtags"] or []:
        print(hashtag)

In [0]:
import requests
import time
import datetime
from pyspark.sql import Row

# Bearer tokens for the API, read from the Databricks secret scope.
BEARER_TOKEN1 = dbutils.secrets.get(scope="Scope1", key="XBearerToken")
BEARER_TOKEN2 = dbutils.secrets.get(scope="Scope1", key="XBearerToken2")

# Token toggle: flipped before every request so the two tokens alternate.
token_switch = True

# Twitter API endpoint
url = "https://api.twitter.com/2/tweets/search/recent"

# Seconds to wait on a single HTTP request before giving up on it.
REQUEST_TIMEOUT_SECONDS = 30
# Total attempts per hashtag when the API answers 429 (rate limited).
MAX_ATTEMPTS_PER_HASHTAG = 3

# Explicit schema for the flattened tweets, in the same column order as the
# dictionaries built by `_flatten_tweet`. Supplying it lets createDataFrame
# succeed even when no tweets were collected (schema inference fails on an
# empty list).
TWEET_SCHEMA = (
    "id string, text string, hashtags string, author_id string, username string, "
    "created_at string, retweet_count bigint, reply_count bigint, like_count bigint, "
    "quote_count bigint, bookmark_count bigint, impression_count bigint, "
    "geo_place_id string, api_completed_timestamp string"
)


def _flatten_tweet(tweet, users_map):
    """Flatten one tweet object from the search response into a flat dict.

    Args:
        tweet: One element of the response's ``data`` list.
        users_map: Mapping of user id -> user object built from the response's
            ``includes.users`` list; used to look up the author's username.

    Returns:
        A dict with one entry per output column. Missing metrics default to 0;
        a missing username or place id defaults to ``"N/A"``.
    """
    author_id = tweet["author_id"]
    user = users_map.get(author_id, {})
    # `or {}` also covers keys that are present but null, not only absent keys.
    metrics = tweet.get("public_metrics") or {}
    entities = tweet.get("entities") or {}
    geo = tweet.get("geo") or {}

    return {
        "id": tweet["id"],
        "text": tweet["text"],
        "hashtags": ",".join([tag['tag'] for tag in entities.get("hashtags", [])]),
        "author_id": author_id,
        "username": user.get("username", "N/A"),
        "created_at": tweet["created_at"],
        "retweet_count": metrics.get("retweet_count", 0),
        "reply_count": metrics.get("reply_count", 0),
        "like_count": metrics.get("like_count", 0),
        "quote_count": metrics.get("quote_count", 0),
        "bookmark_count": metrics.get("bookmark_count", 0),
        "impression_count": metrics.get("impression_count", 0),
        "geo_place_id": geo.get("place_id", "N/A"),
        "api_completed_timestamp": datetime.datetime.now(datetime.timezone.utc).isoformat()
    }


all_tweet_data = []

# Loop through hashtags from campaigns_fetched
for row in campaigns_fetched.select("hashtags").collect():
    # A null `hashtags` value is treated as "no hashtags" for that campaign.
    for hashtag in row["hashtags"] or []:
        params = {
            "query": f"#{hashtag} -is:reply",
            "tweet.fields": "id,text,created_at,public_metrics,geo,entities,author_id",
            "expansions": "author_id",
            "user.fields": "id,username,name",
            "max_results": 100,
        }

        try:
            # Retry the SAME hashtag after a rate-limit response. Previously the
            # loop slept and then moved on, so the rate-limited hashtag was
            # never fetched.
            for attempt in range(1, MAX_ATTEMPTS_PER_HASHTAG + 1):
                token = BEARER_TOKEN1 if token_switch else BEARER_TOKEN2
                token_switch = not token_switch

                headers = {
                    "Authorization": f"Bearer {token}"
                }

                # Without a timeout a stalled connection would block the cell
                # forever; a timeout raises a RequestException handled below.
                response = requests.get(
                    url, headers=headers, params=params, timeout=REQUEST_TIMEOUT_SECONDS
                )

                if response.status_code != 429 or attempt == MAX_ATTEMPTS_PER_HASHTAG:
                    break

                # Wait until the reset time the API reports (fallback: 60 s).
                reset_time = int(response.headers.get("x-rate-limit-reset", time.time() + 60))
                sleep_for = max(0, reset_time - time.time()) + 1
                print(f"Rate limit hit. Sleeping for {sleep_for:.0f} seconds...")
                time.sleep(sleep_for)

            # Raises for any non-2xx status, including a 429 that survived all
            # attempts, so the failure is reported instead of silently skipped.
            response.raise_for_status()
            result = response.json()

            users_map = {user["id"]: user for user in result.get("includes", {}).get("users", [])}

            for tweet in result.get("data") or []:
                all_tweet_data.append(_flatten_tweet(tweet, users_map))

        except requests.exceptions.RequestException as e:
            print(f"Failed to fetch tweets for #{hashtag}: {e}")

        # Sleep to avoid hitting short-term burst rate limits
        time.sleep(2)

# The same tweet can match several hashtags; keep one record per tweet id
# (the last one seen wins).
unique_tweets = {tweet["id"]: tweet for tweet in all_tweet_data}
deduped_tweet_data = list(unique_tweets.values())

# Convert to Spark DataFrame
twitter_data = spark.createDataFrame(
    [Row(**tweet) for tweet in deduped_tweet_data],
    schema=TWEET_SCHEMA,
)

# Display in notebook
display(twitter_data)

ChatGPT 03/04/2025 - THIS WORKS!

Own code: added a second bearer token on 03/04/2025, but it doesn't return `user.fields` with the second API key — USE THE ABOVE INSTEAD.

Write JSON

In [0]:
# Persist the collected tweets as JSON under the `file` directory of the
# container. Append mode adds new files next to whatever is already there;
# nothing in this cell removes records that an earlier run already wrote.
x_data_path = f"abfss://{container_name}@{storage_account_name}.dfs.core.windows.net/{file}"
json_writer = twitter_data.write.format("json").mode("append")
json_writer.save(x_data_path)

Read JSON

In [0]:
# Read back the tweet JSON. The path is built from `file` and uses abfss://,
# exactly like the write cell, so the read always targets the directory that
# was written instead of a separately hard-coded name.
xdata_path = f"abfss://{container_name}@{storage_account_name}.dfs.core.windows.net/{file}"
xdata_fetched_json = spark.read.format("json").load(xdata_path)

display(xdata_fetched_json)