In [52]:
from twikit import Client, TooManyRequests
from configparser import ConfigParser
import asyncio
import csv
import os
from datetime import datetime, timezone
import pandas as pd


In [2]:
async def login():
    config = ConfigParser()
    config.read('config.ini')
    username = config['X']['username']
    email = config['X']['email']
    password = config['X']['password']

    client = Client(user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.6533.120 Safari/537.36", language='en-US')

    await client.login(
        auth_info_1=username,
        auth_info_2=email,
        password=password
    )
    client.save_cookies("cookies.json")

    return client



async def main():
    if os.path.exists("cookies.json"):
        client = Client(language='en-US')
        client.load_cookies('cookies.json')
    else:
        client = await login()
    

    #* create a csv file
    with open('tweets.csv', 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['Tweet_count', 'Username', 'Text', 'Created At', 'Retweets', 'Likes'])

    tweets = await client.search_tweet('python', 'Latest')

    # Print fetched tweets
    for tweet in tweets:
        print(
            vars(tweet)
        )

In [7]:
await main()

{'_client': <twikit.client.client.Client object at 0x1184f2c60>, '_data': {'__typename': 'Tweet', 'rest_id': '1868002976033628622', 'core': {'user_results': {'result': {'__typename': 'User', 'id': 'VXNlcjoxNTE4MzM4MDI0Mjc4NDc4ODQ4', 'rest_id': '1518338024278478848', 'affiliates_highlighted_label': {}, 'has_graduated_access': True, 'is_blue_verified': False, 'profile_image_shape': 'Circle', 'legacy': {'can_dm': False, 'can_media_tag': True, 'created_at': 'Sun Apr 24 21:16:16 +0000 2022', 'default_profile': True, 'default_profile_image': False, 'description': 'i love robot girls 💖\nim afraid of ia and women 😭', 'entities': {'description': {'urls': []}}, 'fast_followers_count': 0, 'favourites_count': 10244, 'followers_count': 8, 'friends_count': 142, 'has_custom_timelines': False, 'is_translator': False, 'listed_count': 0, 'location': '', 'media_count': 22, 'name': 'PythonXavier', 'normal_followers_count': 8, 'pinned_tweet_ids_str': [], 'possibly_sensitive': False, 'profile_banner_url': '

In [19]:

async def fetch_user_data(username, client):
    # Initialize Twikit client

    # Fetch user information
    user = await client.search_user(username)

    # Extract relevant user data
    user_data = user._data['legacy']
    # created_at = datetime.strptime(user_data['created_at'], "%a %b %d %H:%M:%S %z %Y")

    # # Calculate derived fields
    # current_time = datetime.now(timezone.utc)
    # account_age_days = (current_time - created_at).days
    # average_tweets_per_day = user_data['statuses_count'] / account_age_days if account_age_days > 0 else 0

    # # Prepare structured data
    # account_info = {
    #     'bot': 'Unknown',  # Needs manual labeling or additional logic
    #     'screen_name': user_data['screen_name'],
    #     'created_at': created_at.strftime("%Y-%m-%d"),
    #     'hour_created': created_at.hour,
    #     'verified': user_data['verified'],
    #     'geo_enabled': 'Unavailable',  # Not provided by Twikit
    #     'lang': user_data.get('lang', 'N/A'),
    #     'default_profile': user_data['default_profile'],
    #     'default_profile_image': user_data['default_profile_image'],
    #     'favourites_count': user_data['favourites_count'],
    #     'followers_count': user_data['followers_count'],
    #     'friends_count': user_data['friends_count'],
    #     'statuses_count': user_data['statuses_count'],
    #     'average_tweets_per_day': round(average_tweets_per_day, 2),
    #     'account_age_days': account_age_days
    # }

    # return account_info

In [54]:
async def search_users_to_df(username, number_users = 20):
    # Load cookies or log in if cookies are unavailable
    if os.path.exists("cookies.json"):
        client = Client(language='en-US')
        client.load_cookies('cookies.json')
    else:
        client = await login()
    
    # Search for the user by username
    users = await client.search_user(username, count=number_users)
    if not users:
        print(f"No users found for username: {username}")
        return
    
    all_user_data = []

    for user in users:
        created_at = datetime.strptime(user.created_at, "%a %b %d %H:%M:%S %z %Y")

        # Calculate derived fields
        current_time = datetime.now(timezone.utc)
        account_age_days = (current_time - created_at).days
        average_tweets_per_day = user.statuses_count / account_age_days if account_age_days > 0 else 0

        # Structure the user data
        user_data = {
            'screen_name': user.screen_name,
            'created_at': created_at.strftime("%Y-%m-%d"),
            'hour_created': created_at.hour,
            'verified_blur': user.is_blue_verified,
            'default_profile': user.default_profile,
            'default_profile_image': user.default_profile_image,
            'favourites_count': user.favourites_count,
            'followers_count': user.followers_count,
            'friends_count': user.following_count,
            'statuses_count': user.statuses_count,
            'average_tweets_per_day': round(average_tweets_per_day, 2),
            'account_age_days': account_age_days
        }
        all_user_data.append(user_data)
    
    df = pd.DataFrame(all_user_data)

    return df




In [62]:
username_to_search = "ua"
count = 1000

df = await search_users_to_df(username_to_search, count)

df.to_csv("twitter_zelenskiy_data.csv", index=False)
df



Unnamed: 0,screen_name,created_at,hour_created,verified_blur,default_profile,default_profile_image,favourites_count,followers_count,friends_count,statuses_count,average_tweets_per_day,account_age_days
0,kiyoiscute,2017-10-30,0,True,True,False,75731,108820,961,5780,2.22,2602
1,UnionAlgerienne,2022-12-29,10,True,True,False,1065,24551,0,1075,1.5,716
2,Dialog_UA,2014-03-20,15,True,False,False,9889,80714,1013,322270,82.17,3922
3,UA_EN_TCG,2023-11-29,5,True,True,False,5,10197,9,855,2.24,381
4,ukraineuachina,2022-09-20,11,False,True,False,6,1637,0,601,0.74,816
5,SamUA2000,2008-06-02,21,True,True,False,437626,53805,186,178665,29.59,6039
6,UAEPM28937,2024-12-12,7,False,True,False,0,0,0,0,0.0,2
7,f20221211,2024-12-12,21,False,True,False,0,0,0,5,5.0,1
8,BrennpunktUA,2021-02-14,4,True,True,False,203456,66884,1229,121873,87.11,1399
9,PaperSkies_ua,2022-10-04,20,True,True,False,205,925,16,47,0.06,802


In [74]:
async def fetch_updated_user_data(user_id, client):
    try:
        # Search for the user by screen_name
        user = await client.get_user_by_id(user_id)
        if not user:
            print(f"User not found: {user_id}")
            return None

        created_at = datetime.strptime(user.created_at, "%a %b %d %H:%M:%S %z %Y")

        # Calculate derived fields
        current_time = datetime.now(timezone.utc)
        account_age_days = (current_time - created_at).days
        average_tweets_per_day = user.statuses_count / account_age_days if account_age_days > 0 else 0

        # Return the updated data for existing columns
        return {
            'created_at': created_at.strftime("%Y-%m-%d %H:%M:%S"),
            'hour_created': created_at.hour,
            'verified': user.is_blue_verified,
            'default_profile': user.default_profile,
            'default_profile_image': user.default_profile_image,
            'favourites_count': user.favourites_count,
            'followers_count': user.followers_count,
            'friends_count': user.following_count,
            'statuses_count': user.statuses_count,
            'average_tweets_per_day': round(average_tweets_per_day, 2),
            'account_age_days': account_age_days
        }

    except Exception as e:
        print(f"Error fetching data for {user_id}: {e}")
        return None

# Function to update the dataset with fresh data
async def update_dataset(file_path, output_path):
    # Load the existing dataset
    df = pd.read_csv(file_path)

    # Load Twikit client
    if os.path.exists("cookies.json"):
        client = Client(language='en-US')
        client.load_cookies('cookies.json')
    else:
        raise FileNotFoundError("cookies.json file is required for authentication")

    # Process and update user data row by row
    for idx, row in df.iterrows():
        screen_name = row['screen_name']
        user_id = row["id"]  # Fetch user ID
        bot = 1 if row["account_type"] == 'bot' else 0  # Check account type
        
        print(f"Updating data for: {screen_name} ({idx+1}/{len(df)})")
        
        # Fetch updated data for the user
        updated_user = await fetch_updated_user_data(user_id, client)

        if updated_user:
            # Update only the existing columns in the dataset
            for key, value in updated_user.items():
                df.at[idx, key] = value
            df.at[idx, "bot"] = bot
        else:
            print(f"Skipping user: {screen_name}")

    # Save the updated dataset
    df.to_csv(output_path, index=False)
    print(f"Updated dataset saved to {output_path}")

In [78]:
input_file = "../archive(3)/twitter_human_bots_dataset.csv"  # Input dataset path
output_file = "updated_twitter_dataset.csv"    # Output dataset path

await update_dataset(input_file, output_file)

Updating data for: paty_castroo (1/37438)
Updating data for: CBirckner (2/37438)
Updating data for: amf_jay (3/37438)


CancelledError: 