In [0]:
import pandas as pd
import requests
import time
import json
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, MapType

# Identity used to authenticate against Azure AD: tenant + service principal (client) ID
tenantID = "60feac79-e042-4ce8-8759-dca313146110"
servicePrincipalID = "0b27e0ea-e184-49cd-b921-b3519cb03f7f"
# The service principal's client secret is read from the Databricks secret scope
blobsecret = dbutils.secrets.get(scope="Scope1", key="blobsecret1")

# Storage account and container that the abfss:// paths in this notebook point at
storage_account_name = "macavstorage"
container_name = "datalake"

# Secret scope setup (prerequisite for the dbutils.secrets.get calls in this notebook)
# Create the secret scope at
# https://adb-4383697834848777.17.azuredatabricks.net/#secrets/createScope
# Scope Name = Scope1
# DNS Name = "https://twitchkv.vault.azure.net/" (Vault URI)
# Resource ID = "/subscriptions/972ad05f-b62e-48ab-a9fa-a17fd4dc6640/resourceGroups/twitchData/providers/Microsoft.KeyVault/vaults/twitchkv" (Key Vault resource ID)

# Initialise the Spark session with the Delta Lake SQL extension and catalog.
# The extension class is `io.delta.sql.DeltaSparkSessionExtension` (package
# `io.delta.sql`, singular "Extension"); the previous value
# "delta.sql.DeltaSparkSessionExtensions" does not name a real class.
spark = (
    SparkSession.builder
    .appName("DeltaLakeAzureStorage")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()
)

# Authenticate the service principal against the storage account's ABFS endpoint.
# Every setting is scoped to the account by suffixing the key with its dfs host name.
_abfs_host = f"{storage_account_name}.dfs.core.windows.net"
_oauth_settings = {
    "fs.azure.account.auth.type": "OAuth",
    "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    "fs.azure.account.oauth2.client.id": servicePrincipalID,
    "fs.azure.account.oauth2.client.secret": blobsecret,
    "fs.azure.account.oauth2.client.endpoint": f"https://login.microsoftonline.com/{tenantID}/oauth2/token",
}
for _conf_key, _conf_value in _oauth_settings.items():
    spark.conf.set(f"{_conf_key}.{_abfs_host}", _conf_value)

In [0]:
import requests
import datetime

# Bearer token for the Twitter API, read from the Databricks secret scope
BEARER_TOKEN = dbutils.secrets.get(scope="Scope1", key="XBearerToken")
# Define the hashtag you want to search for (without #)
HASHTAG = "macavai"

# Twitter API endpoint for recent tweets with a hashtag
url = "https://api.twitter.com/2/tweets/search/recent"

# Define query parameters
params = {
    "query": f"#{HASHTAG} -is:reply",  # Exclude replies
    "tweet.fields": "id,text,created_at,public_metrics,attachments,lang,source,author_id,entities",
    "expansions": "attachments.media_keys,author_id",
    "media.fields": "media_key,type,url",
    "user.fields": "id,username",
    "max_results": 10,  # Adjust as needed (max 100)
}

# Set up headers with Bearer Token
headers = {
    "Authorization": f"Bearer {BEARER_TOKEN}"
}

# Make API request.
# A timeout keeps a stalled connection from hanging the notebook indefinitely.
response = requests.get(url, params=params, headers=headers, timeout=30)
# Stop here on an HTTP error (bad token, rate limit, ...) instead of parsing the
# error payload as if it were a search result and failing later with a KeyError.
response.raise_for_status()
tweets_data = response.json()
print(tweets_data)

In [0]:
from pyspark.sql import Row
import datetime
# Extract the first tweet of the search response and flatten it into one row.
# Guard against a response without tweets so the failure is explicit rather
# than a bare KeyError/IndexError on tweets_data['data'][0].
tweets = tweets_data.get('data') or []
if not tweets:
    raise ValueError(
        f"Search response contains no tweets (keys present: {sorted(tweets_data)})"
    )
tweet = tweets[0]

tweet_id = tweet['id']
text = tweet['text']
hashtags = ",".join([tag['tag'] for tag in tweet.get('entities', {}).get('hashtags', [])])
author_id = tweet['author_id']
# `or {}` also covers a response where "geo" is present but null
geo = (tweet.get("geo") or {}).get("place_id", "N/A")
created_at = tweet['created_at']

# Resolve the author by id: the expanded users list is a separate collection,
# so its first entry is not guaranteed to be the author of the first tweet.
users = tweets_data.get('includes', {}).get('users', [])
user = next((candidate for candidate in users if candidate.get('id') == author_id), None)
if user is None:
    raise ValueError(f"Author {author_id} of tweet {tweet_id} not found in includes.users")
username = user['username']

# Public metrics
metrics = tweet['public_metrics']
retweet_count = metrics['retweet_count']
reply_count = metrics['reply_count']
like_count = metrics['like_count']
quote_count = metrics['quote_count']
bookmark_count = metrics['bookmark_count']
impression_count = metrics['impression_count']

# TODO: api_completed_timestamp is still a placeholder (0); the UTC timestamp
# capture is not wired in yet, so the stored value is left unchanged here.

# One flat record per tweet; key order defines the DataFrame column order
tweet_data = {
    "id": tweet_id,
    "text": text,
    "hashtags": hashtags,
    "author_id": author_id,
    "username": username,
    "created_at": created_at,
    "retweet_count": retweet_count,
    "reply_count": reply_count,
    "like_count": like_count,
    "quote_count": quote_count,
    "bookmark_count": bookmark_count,
    "impression_count": impression_count,
    "geo_place_id": geo,
    "api_completed_timestamp": 0
}

# Convert to Spark DataFrame
spark_df = spark.createDataFrame([Row(**tweet_data)])

In [0]:
# Write the DataFrame in Delta format to the storage account, overwriting what is at the target path
x_data_path = f"abfss://{container_name}@{storage_account_name}.dfs.core.windows.net/datalake/x_data"
(
    spark_df.write
    .format("delta")
    .mode("overwrite")
    .save(x_data_path)
)

In [0]:
# List the container root. The path is built from the notebook's storage settings
# (rather than a second hard-coded copy) so it always targets the same account/container.
dbutils.fs.ls(f"abfss://{container_name}@{storage_account_name}.dfs.core.windows.net/")

In [0]:
# Show the DataFrame built from the extracted tweet in the notebook output
display(spark_df)