# 💬 Extract Comments on Data Science Club Tweets

This notebook uses **RapidAPI** to fetch comments/replies on the latest tweets from the Data Science Club account.

## Features:
- ✅ Free (50 requests/day)
- ✅ Extracts replies on tweets
- ✅ Saves comments to JSON

## Requirements:
1. Account on [RapidAPI](https://rapidapi.com/)
2. Subscription to [Twitter241 API](https://rapidapi.com/davethebeast/api/twitter241)

In [27]:
# Install required libraries
!pip install requests python-dotenv



In [28]:
import requests
import json
import os
from dotenv import load_dotenv

# ------------------------------------------
# Configuration: pull settings from the .env file
# ------------------------------------------
load_dotenv()

# The API key has no fallback; the account name falls back to "DSC_KAU".
RAPIDAPI_KEY = os.environ.get("RAPIDAPI_KEY")
DSC_USERNAME = os.environ.get("DSC_USERNAME", "DSC_KAU")

# Headers passed to every request made in this notebook.
HEADERS = {
    "x-rapidapi-key": RAPIDAPI_KEY,
    "x-rapidapi-host": "twitter241.p.rapidapi.com",
}

# Report whether the key was found. Only the username is echoed, never the key.
if RAPIDAPI_KEY:
    print("‚úÖ API Key loaded successfully")
    print(f"‚úÖ Username: @{DSC_USERNAME}")
else:
    print("‚ùå Error: RAPIDAPI_KEY not found in .env file!")

‚úÖ API Key loaded successfully
‚úÖ Username: @DSC_KAU


In [29]:
def get_user_id(username: str, timeout: float = 30.0) -> "str | None":
    """
    Look up the ``rest_id`` of an account from its username.

    Args:
        username: Account handle, sent as the ``username`` query parameter.
        timeout: Seconds to wait for the API before giving up.

    Returns:
        The ``rest_id`` found in the response, or ``None`` when the request
        fails, the status code is not 200, the body is not valid JSON, or the
        payload matches neither of the two layouts handled below.
    """
    user_url = "https://twitter241.p.rapidapi.com/user"

    # Without a timeout, requests can block indefinitely on a stalled socket.
    try:
        response = requests.get(
            user_url,
            headers=HEADERS,
            params={"username": username},
            timeout=timeout,
        )
    except requests.RequestException as exc:
        print(f"Request failed while looking up user: {exc}")
        return None

    if response.status_code != 200:
        print(f"‚ùå Error: {response.status_code}")
        return None

    try:
        user_data = response.json()
    except ValueError as exc:
        print(f"Invalid JSON in user lookup response: {exc}")
        return None

    if not isinstance(user_data, dict):
        return None

    # Two layouts are accepted: {"result": {"data": ...}} and {"data": ...}.
    wrapper = user_data.get('result')
    if isinstance(wrapper, dict) and 'data' in wrapper:
        node = wrapper['data']
    else:
        node = user_data.get('data')

    # Walk data -> user -> result -> rest_id, bailing out on any missing or
    # non-dict level instead of raising KeyError/TypeError.
    for key in ('user', 'result', 'rest_id'):
        if not isinstance(node, dict):
            return None
        node = node.get(key)

    return node or None

In [30]:
def get_latest_tweets(user_id: str, count: int = 5, timeout: float = 30.0) -> list:
    """
    Get the latest tweets from a user.

    Args:
        user_id: Value sent as the ``user`` query parameter.
        count: Maximum number of tweets to return. Twice this many entries
            are requested from the API (presumably to leave room for entries
            that get skipped below; confirm against the API docs).
        timeout: Seconds to wait for the API before giving up.

    Returns:
        A list of dicts with keys ``id``, ``text`` and ``reply_count``, at
        most ``count`` long. Empty when the request fails, the status is not
        200, the body is not valid JSON, or no tweet entries are found.
    """
    tweets_url = "https://twitter241.p.rapidapi.com/user-tweets"

    def _dig(node, *keys):
        """Follow ``keys`` through nested dicts; None if any level is missing."""
        for key in keys:
            if not isinstance(node, dict):
                return None
            node = node.get(key)
        return node

    try:
        response = requests.get(
            tweets_url,
            headers=HEADERS,
            params={"user": user_id, "count": str(count * 2)},
            timeout=timeout,
        )
    except requests.RequestException as exc:
        print(f"Request failed while fetching tweets: {exc}")
        return []

    if response.status_code != 200:
        print(f"‚ùå Error fetching tweets: {response.status_code}")
        return []

    try:
        tweets_data = response.json()
    except ValueError as exc:
        print(f"Invalid JSON in tweets response: {exc}")
        return []

    # Two payload layouts are accepted for the timeline instructions.
    instructions = (
        _dig(tweets_data, 'result', 'timeline', 'instructions')
        or _dig(tweets_data, 'data', 'user', 'result', 'timeline_v2',
                'timeline', 'instructions')
        or []
    )
    if not isinstance(instructions, list):
        instructions = []

    # Use the first instruction that actually carries an entries list.
    entries = []
    for instruction in instructions:
        if isinstance(instruction, dict) and 'entries' in instruction:
            entries = instruction['entries'] or []
            break
    if not isinstance(entries, list):
        entries = []

    tweets_list = []
    for entry in entries:
        if len(tweets_list) >= count:
            break

        # Only entries whose id mentions "tweet" (any casing) are considered.
        entry_id = _dig(entry, 'entryId') or ''
        if 'tweet' not in str(entry_id).lower():
            continue

        result = _dig(entry, 'content', 'itemContent', 'tweet_results', 'result')
        # Some results wrap the real tweet one level deeper.
        if _dig(result, '__typename') == 'TweetWithVisibilityResults':
            result = result.get('tweet')

        legacy = _dig(result, 'legacy')
        if not isinstance(legacy, dict) or not legacy:
            continue

        tweets_list.append({
            'id': legacy.get('id_str', result.get('rest_id', '')),
            'text': legacy.get('full_text', 'N/A'),
            'reply_count': legacy.get('reply_count', 0),
        })

    return tweets_list

In [31]:
def get_tweet_comments(tweet_id: str, max_comments: int = 20, timeout: float = 30.0) -> list:
    """
    Get comments/replies on a specific tweet using the tweet detail endpoint.

    Args:
        tweet_id: Value sent as the ``pid`` query parameter; items whose
            ``id_str`` equals it are treated as the original tweet and skipped.
        max_comments: Hard cap on the number of comments returned. A value
            of zero or less returns an empty list without calling the API.
        timeout: Seconds to wait for the API before giving up.

    Returns:
        A list of dicts with keys ``comment_id``, ``text``, ``username``,
        ``name``, ``likes`` and ``date``. Empty when the request fails or
        nothing could be parsed.
    """
    detail_url = "https://twitter241.p.rapidapi.com/tweet"
    comments = []

    if max_comments <= 0:
        return comments

    def _dig(node, *keys):
        """Follow ``keys`` through nested dicts; None if any level is missing."""
        for key in keys:
            if not isinstance(node, dict):
                return None
            node = node.get(key)
        return node

    try:
        response = requests.get(
            detail_url,
            headers=HEADERS,
            params={"pid": tweet_id},
            timeout=timeout,
        )
    except requests.RequestException as exc:
        print(f"   Request failed for tweet {tweet_id}: {exc}")
        return comments

    if response.status_code != 200:
        print(f"   ‚ö†Ô∏è Error {response.status_code} for tweet {tweet_id}")
        return comments

    try:
        tweet_data = response.json()
    except ValueError as exc:
        print(f"   Invalid JSON for tweet {tweet_id}: {exc}")
        return comments

    try:
        instructions = _dig(
            tweet_data, 'data', 'threaded_conversation_with_injections_v2',
            'instructions'
        ) or []

        entries = []
        for instruction in instructions:
            if _dig(instruction, 'type') == 'TimelineAddEntries':
                entries = instruction.get('entries') or []
                break

        for entry in entries:
            # Replies are read from conversation-thread entries only.
            entry_id = _dig(entry, 'entryId') or ''
            if 'conversationthread' not in str(entry_id).lower():
                continue

            for item in _dig(entry, 'content', 'items') or []:
                result = _dig(item, 'item', 'itemContent', 'tweet_results', 'result')
                # Some results wrap the real tweet one level deeper.
                if _dig(result, '__typename') == 'TweetWithVisibilityResults':
                    result = result.get('tweet')

                legacy = _dig(result, 'legacy')
                if not isinstance(legacy, dict) or not legacy:
                    continue

                # Skip the original tweet itself.
                if legacy.get('id_str') == tweet_id:
                    continue

                comment_text = legacy.get('full_text', '')
                if not comment_text:
                    continue

                # Author fields are looked up under `legacy` first and then
                # under `core`, matching get_all_replies_via_search.
                user = _dig(result, 'core', 'user_results', 'result')
                user_legacy = _dig(user, 'legacy') or {}
                user_core = _dig(user, 'core') or {}

                comments.append({
                    'comment_id': legacy.get('id_str', ''),
                    'text': comment_text,
                    'username': user_legacy.get('screen_name') or user_core.get('screen_name', 'Unknown'),
                    'name': user_legacy.get('name') or user_core.get('name', 'Unknown'),
                    'likes': legacy.get('favorite_count', 0),
                    'date': legacy.get('created_at', '')
                })

                # Return rather than break: a break here would only leave the
                # inner loop and later entries would push past the cap.
                if len(comments) >= max_comments:
                    return comments

    except (AttributeError, TypeError) as e:
        # Best effort: keep whatever was collected before the odd payload.
        print(f"   ‚ö†Ô∏è Error parsing: {e}")

    return comments


def get_all_replies_via_search(username: str, tweet_ids: list, timeout: float = 30.0) -> dict:
    """
    Search for replies addressed to a user and group them by parent tweet.

    Runs a single ``to:<username>`` search (type ``Latest``, count 100) and
    keeps only results whose ``in_reply_to_status_id_str`` is one of
    ``tweet_ids``.

    Args:
        username: Account handle used in the ``to:`` search query.
        tweet_ids: Tweet IDs whose replies should be collected.
        timeout: Seconds to wait for the API before giving up.

    Returns:
        A dict mapping every ID in ``tweet_ids`` to a list of reply dicts
        (keys ``comment_id``, ``text``, ``username``, ``name``, ``likes``,
        ``date``). An empty dict when the request fails, the status is not
        200, or the body is not valid JSON.
    """
    print(f"üîç Searching for all replies to @{username}...")

    search_url = "https://twitter241.p.rapidapi.com/search-v2"

    def _dig(node, *keys):
        """Follow ``keys`` through nested dicts; None if any level is missing."""
        for key in keys:
            if not isinstance(node, dict):
                return None
            node = node.get(key)
        return node

    try:
        response = requests.get(
            search_url,
            headers=HEADERS,
            params={"query": f"to:{username}", "count": "100", "type": "Latest"},
            timeout=timeout,
        )
    except requests.RequestException as exc:
        print(f"   Search request failed: {exc}")
        return {}

    if response.status_code != 200:
        print(f"   ‚ö†Ô∏è Search failed: {response.status_code}")
        return {}

    try:
        data = response.json()
    except ValueError as exc:
        print(f"   Search returned invalid JSON: {exc}")
        return {}

    # tweet_id -> list of replies; every requested id gets a (possibly empty) list.
    all_replies = {tid: [] for tid in tweet_ids}

    try:
        instructions = _dig(data, 'result', 'timeline', 'instructions') or []

        entries = []
        for instruction in instructions:
            if _dig(instruction, 'type') == 'TimelineAddEntries':
                entries = instruction.get('entries') or []
                break

        for entry in entries:
            entry_id = _dig(entry, 'entryId') or ''
            if 'tweet' not in str(entry_id).lower():
                continue

            result = _dig(entry, 'content', 'itemContent', 'tweet_results', 'result')
            # Some results wrap the real tweet one level deeper.
            if _dig(result, '__typename') == 'TweetWithVisibilityResults':
                result = result.get('tweet')

            legacy = _dig(result, 'legacy')
            if not isinstance(legacy, dict):
                continue

            # Keep only replies to one of the requested tweets. An empty
            # parent id is rejected explicitly so that non-replies are never
            # filed under an empty tweet id.
            in_reply_to = legacy.get('in_reply_to_status_id_str', '')
            if not in_reply_to or in_reply_to not in all_replies:
                continue

            # Author fields are looked up under `legacy` first, then `core`.
            user = _dig(result, 'core', 'user_results', 'result')
            user_legacy = _dig(user, 'legacy') or {}
            user_core = _dig(user, 'core') or {}

            all_replies[in_reply_to].append({
                'comment_id': legacy.get('id_str', ''),
                'text': legacy.get('full_text', ''),
                'username': user_legacy.get('screen_name') or user_core.get('screen_name', 'Unknown'),
                'name': user_legacy.get('name') or user_core.get('name', 'Unknown'),
                'likes': legacy.get('favorite_count', 0),
                'date': legacy.get('created_at', '')
            })

    except (AttributeError, TypeError) as e:
        # Best effort: return whatever was grouped before the odd payload.
        print(f"   ‚ö†Ô∏è Error: {e}")

    return all_replies

In [32]:
# ==========================================
# Extract comments from the last 5 tweets
# ==========================================

# Defined before any branching so the closing summary line never raises
# NameError when the user or tweets cannot be fetched, and so later cells
# never see `all_comments` left over from an earlier run.
total_comments = 0
all_comments = {}

print(f"üîç Fetching comments for @{DSC_USERNAME}...")
print("=" * 50)

# Step 1: Get user ID
print("üîÑ Getting user ID...")
user_id = get_user_id(DSC_USERNAME)

if not user_id:
    print("‚ùå Could not find user")
else:
    print(f"‚úÖ User ID: {user_id}")
    
    # Step 2: Get latest 5 tweets
    print("\nüîÑ Fetching last 5 tweets...")
    tweets = get_latest_tweets(user_id, count=5)
    
    if not tweets:
        print("‚ùå No tweets found")
    else:
        print(f"‚úÖ Found {len(tweets)} tweets")
        
        # Get tweet IDs
        tweet_ids = [t['id'] for t in tweets]
        
        # Step 3: First try the direct method for each tweet
        for i, tweet in enumerate(tweets, 1):
            print(f"\nüìù Tweet {i}: {tweet['text'][:50]}...")
            print(f"   üí¨ Expected replies: {tweet['reply_count']}")
            
            comments = get_tweet_comments(tweet['id'])
            
            all_comments[tweet['id']] = {
                'tweet_text': tweet['text'],
                'comments': comments
            }
            
            print(f"   ‚úÖ Extracted {len(comments)} comments (direct method)")
        
        # Step 4: Use the search method to find additional replies
        print("\n" + "=" * 50)
        print("üîÑ Using search method to find additional replies...")
        
        search_replies = get_all_replies_via_search(DSC_USERNAME, tweet_ids)
        
        # Merge search results with direct results
        for tweet_id, replies in search_replies.items():
            if tweet_id not in all_comments:
                continue
            
            stored = all_comments[tweet_id]['comments']
            # Track ids as they are added so the same reply appearing twice
            # in the search results is only stored once.
            seen_ids = {c['comment_id'] for c in stored}
            
            for reply in replies:
                reply_id = reply['comment_id']
                # Replies without an id cannot be deduplicated; keep them.
                if reply_id and reply_id in seen_ids:
                    continue
                seen_ids.add(reply_id)
                stored.append(reply)
                print(f"   ‚ûï Added reply from @{reply['username']} to tweet {tweet_id[:10]}...")

        # Final summary
        print("\n" + "=" * 50)
        print("üìä Final Summary:")
        for i, tweet in enumerate(tweets, 1):
            tweet_comments = all_comments[tweet['id']]['comments']
            comment_count = len(tweet_comments)
            total_comments += comment_count
            print(f"   Tweet {i}: {comment_count} comments (expected: {tweet['reply_count']})")
            
            # Preview at most the first three comments, truncated to 50 chars.
            for j, comment in enumerate(tweet_comments[:3], 1):
                text_preview = comment['text'][:50] + "..." if len(comment['text']) > 50 else comment['text']
                print(f"      {j}. @{comment['username']}: {text_preview}")

print("\n" + "=" * 50)
print(f"‚úÖ Done! Total comments extracted: {total_comments}")

üîç Fetching comments for @DSC_KAU...
üîÑ Getting user ID...
‚úÖ User ID: 1608405795707752448

üîÑ Fetching last 5 tweets...
‚úÖ Found 5 tweets

üìù Tweet 1: Ÿäÿ≥ÿ± ŸÜÿßÿØŸä ÿπŸÑŸÖ ÿßŸÑÿ®ŸäÿßŸÜÿßÿ™ ÿßŸÑÿ•ÿπŸÑÿßŸÜ ÿπŸÜ ÿßŸÑŸÅÿ±ŸÇ ÿßŸÑŸÅÿßÿ¶ÿ≤ÿ© ŸÅŸä ...
   üí¨ Expected replies: 2
   ‚úÖ Extracted 2 comments (direct method)

üìù Tweet 2: ÿßŸÑÿ™ÿÆÿ∑Ÿäÿ∑üìù! 
ÿ≠ŸÑŸÇÿ© ÿßŸÑŸàÿµŸÑ ÿ®ŸäŸÜ ÿßŸÑŸÅŸÉÿ±ÿ© ŸàÿßŸÑÿ™ŸÜŸÅŸäÿ∞ üöÄ

ŸÜÿßÿØŸä ...
   üí¨ Expected replies: 0
   ‚úÖ Extracted 0 comments (direct method)

üìù Tweet 3: ÿ¨ÿßŸáÿ≤ŸäŸÜ ŸÑŸÖÿ≥ÿßÿ®ŸÇÿ© ŸäŸÇÿØŸÖŸáÿß ŸÜÿßÿØŸä ÿπŸÑŸÖ ÿßŸÑÿ®ŸäÿßŸÜÿßÿ™üî•ÿü

ŸÜÿπŸÑŸÜ ŸÑŸÉ...
   üí¨ Expected replies: 1
   ‚úÖ Extracted 0 comments (direct method)

üìù Tweet 4: ŸÑÿ£ŸÜ ÿßŸÑÿπŸäŸÜ ÿ™ŸÅŸáŸÖ ÿ£ÿ≥ÿ±ÿπ ŸÖŸÜ ÿßŸÑŸÉŸÑÿßŸÖ üëÄ

ÿ¨ÿßŸäŸäŸÜŸÉŸÖ ŸÅŸä Ÿàÿ±ÿ¥ÿ© ÿπ...
   üí¨ Expected replies: 2
   ‚úÖ Extracted 0 comments (direct method)

üìù Tweet 5: ŸÖŸÜ ÿßŸÑÿ®ŸäÿßŸÜÿßÿ™ ÿ•ŸÑŸâ ÿßŸÑÿ±ÿ§Ÿâ ŸÖÿπ Ÿàÿ±ÿ¥ÿ© ÿ™ÿ≠ŸÑŸäŸÑ ÿßŸÑÿ®ŸäÿßŸÜÿßÿ™ ÿßŸÑÿßÿ≥ÿ™...
   üí¨ E

In [33]:
# Debug helper: run one raw search for replies to the configured account and
# dump the payload to disk so its structure can be inspected by hand.
# The account comes from DSC_USERNAME so this cell follows the .env setting
# instead of a hard-coded handle.
print(f"üîç Trying simple search for all replies to @{DSC_USERNAME}...")
print("=" * 50)

search_url = "https://twitter241.p.rapidapi.com/search-v2"

response = requests.get(
    search_url,
    headers=HEADERS,
    params={"query": f"to:{DSC_USERNAME}", "count": "20", "type": "Latest"},
    timeout=30,  # without a timeout a stalled connection hangs the cell
)

print(f"Status: {response.status_code}")

if response.status_code == 200:
    data = response.json()
    print(f"Response keys: {list(data.keys())}")
    
    # Show the second level of the payload when present
    if 'result' in data:
        print(f"Result keys: {list(data['result'].keys())}")
    
    # Save for analysis
    with open('debug_search_v2.json', 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    print("üìÅ Saved to: debug_search_v2.json")
else:
    # Only the first 200 characters of the error body are shown.
    print(f"Error response: {response.text[:200]}")

üîç Trying simple search for all replies to @DSC_KAU...


Status: 200
Response keys: ['cursor', 'result', 'status']
Result keys: ['timeline']
üìÅ Saved to: debug_search_v2.json


## 💾 Save Comments to JSON File

In [34]:
def save_comments_to_json(comments_data, filename="dsc_comments.json"):
    """
    Write the collected comments to ``filename`` as indented UTF-8 JSON.

    ``comments_data`` is expected to map tweet IDs to dicts that each hold a
    ``'comments'`` list. When it is empty or falsy, nothing is written and a
    notice is printed instead. Returns ``None`` in both cases.
    """
    if comments_data:
        # ensure_ascii=False keeps non-ASCII text readable in the file.
        with open(filename, mode='w', encoding='utf-8') as out_file:
            json.dump(comments_data, out_file, ensure_ascii=False, indent=2)

        # Tally the comments across all tweets for the confirmation line.
        comment_total = 0
        for tweet_entry in comments_data.values():
            comment_total += len(tweet_entry['comments'])

        tweet_total = len(comments_data)
        print(f"‚úÖ Saved {comment_total} comments from {tweet_total} tweets to: {filename}")
    else:
        print("‚ùå No comments to save")

# Persist the results, but only when an earlier cell has left a non-empty
# `all_comments` in the notebook namespace.
if globals().get('all_comments'):
    save_comments_to_json(all_comments)

‚úÖ Saved 2 comments from 5 tweets to: dsc_comments.json
