<a href="https://colab.research.google.com/github/MonaFaghfouri/Twitter-data-scraping/blob/main/Tweets_Without_Keywords.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install nest_asyncio
!pip install twikit
import asyncio
import os
import pandas as pd
import csv
from twikit import Client, TooManyRequests
from datetime import datetime, timedelta
from random import uniform
import nest_asyncio

# Patch asyncio so that asyncio.run() at the bottom of this cell can be used
# in Jupyter/Colab, where an event loop is already running.
nest_asyncio.apply()

# Constants
# Size of the asyncio.Semaphore created in fetch_tweets(); bounds how many
# fetch_tweets_for_date() tasks may run at the same time.
CONCURRENT_REQUESTS = 5
# Number of buffered rows at which fetch_tweets_for_date() writes to Excel.
BATCH_SIZE = 100
# (start, end) dates as "%Y-%m-%d" strings; fetch_tweets() passes them to
# fetch_tweets_for_date(), which adds one day to the end date before querying.
DATE_RANGE = ("2025-01-01", "2025-01-05")

# Base search query; fetch_tweets_for_date() appends "since:"/"until:" to it.
# NOTE(review): "lang:fa" presumably restricts results to Persian-language
# tweets -- confirm against the search operator documentation.
QUERY = "lang:fa"

def get_filename():
    """Return the name of the Excel workbook that tweets are saved to.

    The name is a fixed string; it is not computed from any date setting.
    """
    workbook_name = "2024-03-21.xlsx"
    return workbook_name

async def get_tweets(client, query, tweets=None, max_retries=3):
    """Fetch one page of search results, retrying after a rate limit.

    Args:
        client: Object with an awaitable ``search_tweet(query, product=...)``.
        query: Search query string handed to ``search_tweet``.
        tweets: The previous page of results, or ``None`` to start a new
            search. When given, its awaitable ``next()`` supplies the
            following page.
        max_retries: Number of times to wait and retry after a
            ``TooManyRequests`` error before giving up.

    Returns:
        The fetched page, or ``None`` when the retries are exhausted or any
        other error occurs (the error is printed, not raised).
    """
    fallback_wait = 60  # fixed pause used when the reset time is unknown
    retries_left = max_retries

    while True:
        try:
            if tweets is None:
                return await client.search_tweet(query, product="Latest")
            await asyncio.sleep(uniform(1, 2))  # Random delay to avoid detection
            return await tweets.next()
        except TooManyRequests as e:
            if retries_left <= 0:
                print("Rate limit reached! No retries left, giving up.")
                return None
            retries_left -= 1

            # The reset timestamp may be missing; calling fromtimestamp(None)
            # would raise TypeError from inside this handler.
            reset_at = getattr(e, "rate_limit_reset", None)
            if reset_at is None:
                wait_time = fallback_wait
            else:
                wait_time = max(
                    (datetime.fromtimestamp(reset_at) - datetime.now()).total_seconds(),
                    5,
                )
            print(f"Rate limit reached! Waiting {wait_time:.2f} seconds...")
            await asyncio.sleep(wait_time)
            # Loop again: returning None here would make the caller stop
            # fetching even though the limit has just been waited out.
        except Exception as e:
            print(f"Error fetching tweets: {e}")
            return None

async def fetch_tweets_for_date(client, start_date, end_date, semaphore, max_tweets=200):
    """Fetch up to ``max_tweets`` tweets for a date range and save them to Excel.

    Pages are requested through ``get_tweets`` until the cap is reached or no
    further page is returned. Rows are buffered and written with
    ``save_to_excel`` each time the buffer reaches ``BATCH_SIZE``; any
    remainder is written at the end.

    Args:
        client: Search client forwarded to ``get_tweets``.
        start_date: First day of the range, as a "%Y-%m-%d" string.
        end_date: Last day of the range, as a "%Y-%m-%d" string.
        semaphore: Async context manager held for the whole fetch, used to
            limit how many of these tasks run at once.
        max_tweets: Upper bound on the total number of tweets collected.

    Returns:
        The total number of tweets collected for the range.
    """
    # The end date is moved forward one day before it goes into "until:",
    # presumably because "until:" excludes the given day -- TODO confirm.
    adjusted_end_date = (
        datetime.strptime(end_date, "%Y-%m-%d") + timedelta(days=1)
    ).strftime("%Y-%m-%d")
    query = f"{QUERY} since:{start_date} until:{adjusted_end_date}"
    filename = get_filename()

    pending_rows = []
    # Counted separately from the buffer: the buffer is emptied on every
    # flush, so its length alone can never be used to enforce the cap.
    total_fetched = 0
    tweets = None

    async with semaphore:
        while total_fetched < max_tweets:
            tweets = await get_tweets(client, query, tweets)
            if not tweets:
                break

            for tweet in tweets:
                if total_fetched >= max_tweets:
                    break
                pending_rows.append([
                    tweet.user.name, tweet.text, tweet.created_at,
                    tweet.retweet_count, tweet.favorite_count, tweet.user.location or "Unknown"
                ])
                total_fetched += 1

            if len(pending_rows) >= BATCH_SIZE:
                save_to_excel(pending_rows, filename)
                pending_rows.clear()

            print(f"Fetched {total_fetched} tweets for range {start_date} - {end_date}.")

        # Write whatever is left over from the last, partial batch.
        if pending_rows:
            save_to_excel(pending_rows, filename)

    return total_fetched

async def fetch_tweets():
    """Load saved cookies and fetch tweets for ``DATE_RANGE``.

    Creates a ``Client``, loads its cookies from ``data.json`` and runs
    ``fetch_tweets_for_date`` for the configured range under a semaphore of
    size ``CONCURRENT_REQUESTS``. If the cookies cannot be loaded, the error
    is printed and the function returns without fetching anything.
    """
    # Single source for the path so the error message always names the file
    # that was actually tried.
    cookies_path = "data.json"

    client = Client(language="fa")
    try:
        client.load_cookies(cookies_path)
    except Exception as e:
        print(f"Error loading cookies: {e}. Try deleting '{cookies_path}' and re-authenticating.")
        return

    semaphore = asyncio.Semaphore(CONCURRENT_REQUESTS)
    tasks = [fetch_tweets_for_date(client, DATE_RANGE[0], DATE_RANGE[1], semaphore)]
    await asyncio.gather(*tasks)
    print("All tweets fetched successfully!")

def save_to_excel(tweet_data, filename):
    """Append rows of tweet data to an Excel file, creating it if needed.

    Args:
        tweet_data: Rows of six values each, in the order Username, Text,
            Created At, Retweets, Likes, Location.
        filename: Path of the workbook. If it already exists, its rows are
            read back and the new rows are placed after them before the file
            is rewritten.
    """
    column_names = ["Username", "Text", "Created At", "Retweets", "Likes", "Location"]
    new_rows = pd.DataFrame(tweet_data, columns=column_names)

    if not os.path.exists(filename):
        combined = new_rows
    else:
        previous_rows = pd.read_excel(filename)
        combined = pd.concat([previous_rows, new_rows], ignore_index=True)

    combined.to_excel(filename, index=False)
    print(f"Saved {len(tweet_data)} tweets to {filename}")

# Run script: start the fetch_tweets() coroutine and block until it completes.
asyncio.run(fetch_tweets())