<a href="https://colab.research.google.com/github/Keerthana345/Twitter_Bot_Detection/blob/main/Twitter_bot_unsupervised.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install tweepy pandas



In [None]:
import os
import time

import pandas as pd
import tweepy

# 📌 Step 1: Twitter API Credentials
# Prefer the TWITTER_BEARER_TOKEN environment variable so a real token never has
# to be saved inside the notebook; the placeholder string is only a fallback and
# must be replaced (or the variable set) before any request can succeed.
BEARER_TOKEN = os.environ.get("TWITTER_BEARER_TOKEN", "your twitter bearer_token")

# Build the Twitter API client from the bearer token
client = tweepy.Client(bearer_token=BEARER_TOKEN)

# 📌 Step 2: Function to Fetch Tweets
def fetch_tweets(keyword="news", count=100, api_client=None, pause_seconds=2):
    """Collect up to `count` recent English, non-retweet tweets matching `keyword`.

    Pages are followed with the API's ``next_token`` so each request returns new
    tweets instead of the first page again. The loop stops as soon as a page is
    empty or no further page exists, rather than polling forever.

    Args:
        keyword: Search term placed in front of ``lang:en -is:retweet``.
        count: Maximum number of tweets to return.
        api_client: Object exposing ``search_recent_tweets``; defaults to the
            notebook-level ``client``.
        pause_seconds: Delay (seconds) between consecutive page requests.

    Returns:
        A list of at most `count` dicts with the keys ``username``, ``tweet``
        and ``created_at``. If a Tweepy error occurs, the error is printed and
        whatever was gathered so far is returned.
    """
    api = client if api_client is None else api_client
    tweets_data = []
    next_token = None

    try:
        while len(tweets_data) < count:
            remaining = count - len(tweets_data)
            request = {
                # Exclude retweets, fetch English tweets
                "query": f"{keyword} lang:en -is:retweet",
                "tweet_fields": ["created_at"],
                "user_fields": ["username"],
                "expansions": ["author_id"],
                # The endpoint accepts 10..100 results per request
                "max_results": min(100, max(10, remaining)),
            }
            if next_token is not None:
                request["next_token"] = next_token

            response = api.search_recent_tweets(**request)

            # No matches (or no more matches): stop instead of re-querying forever
            if not response.data:
                break

            # The user expansion can be missing; fall back to "Unknown" below
            included_users = (response.includes or {}).get("users", [])
            users = {user.id: user.username for user in included_users}

            for tweet in response.data:
                tweets_data.append({
                    "username": users.get(tweet.author_id, "Unknown"),
                    "tweet": tweet.text,
                    "created_at": tweet.created_at
                })

            next_token = (response.meta or {}).get("next_token")
            if next_token is None:
                break  # Last page reached

            # Prevent hitting rate limits, but only when another request follows
            if len(tweets_data) < count:
                time.sleep(pause_seconds)

    except tweepy.TweepyException as e:
        print(f"Error fetching tweets: {e}")

    return tweets_data[:count]  # Never return more than `count` tweets

# 📌 Step 3: Fetch & Save Tweets to CSV
tweets = fetch_tweets(keyword="AI", count=100)  # Request up to 100 tweets

# Convert to DataFrame; naming the columns keeps the CSV header intact even when
# the fetch returned nothing, so later cells can still read the file.
df = pd.DataFrame(tweets, columns=["username", "tweet", "created_at"])

# Save as CSV
df.to_csv("tweets.csv", index=False)

# Report the number actually collected; the fetch may return fewer than requested
print(f"{len(df)} Tweets saved to 'tweets.csv'")


100 Tweets saved to 'tweets.csv'


In [None]:
import pandas as pd

# 📌 Load dataset
# Use the same relative path the file was written to, rather than the
# Colab-specific absolute path, so the cell also runs outside /content.
df = pd.read_csv("tweets.csv")
df

Unnamed: 0,username,tweet,created_at
0,capsysteam,Introducing DynamicOCR by CAPSYS Technologies!...,2025-03-06 15:01:43+00:00
1,ironumdigital,AI's latest leap? Seven new billionaires thank...,2025-03-06 15:01:43+00:00
2,odugbopeter91,"@partofdream_POD Hey @PublicAIData, sounds lik...",2025-03-06 15:01:43+00:00
3,Toneworks99,@davidpugliese @OttawaCitizen Billion dollar j...,2025-03-06 15:01:43+00:00
4,Tany212121,"Hey Phronians, I have just claimed my first Ph...",2025-03-06 15:01:43+00:00
...,...,...,...
95,MdRezaul766285,@imagesaicouldnt Looks like even AI needs a li...,2025-03-06 15:01:26+00:00
96,GonzalezTheas,✈️🌈🚀 FDV Surge Alert ✈️✨🚀\n👑 $CFC \n🔮 ALB9vFY3...,2025-03-06 15:01:26+00:00
97,Sayhio_,I just started earning on @nodepay_ai 💵\n\nWan...,2025-03-06 15:01:26+00:00
98,Dariban22,@hubdotxyz I hunt for early alpha like a squir...,2025-03-06 15:01:26+00:00


In [None]:
import pandas as pd
import re
from textblob import TextBlob
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN

# Load the original dataset: re-read tweets.csv from the working directory.
# This rebinds `df`, so the feature extraction below starts from the saved file
# rather than from whatever an earlier cell left in `df`.
df = pd.read_csv("tweets.csv")

# Function to extract text-based features
def extract_text_features(tweet):
    """Compute simple text statistics for one tweet.

    Args:
        tweet: The tweet text. ``None`` and ``NaN`` (what ``pd.read_csv`` yields
            for an empty cell) are treated as an empty string; any other
            non-string value is converted with ``str``.

    Returns:
        dict with the keys ``tweet_length``, ``word_count``, ``hashtags_count``,
        ``mentions_count``, ``urls_count``, ``special_chars_count``,
        ``lexical_diversity`` (unique words / total words, 0 when there are no
        words) and ``sentiment_score`` (TextBlob polarity).
    """
    # Missing text would otherwise crash on `.split()` (float NaN has no such method)
    if not isinstance(tweet, str):
        is_missing = tweet is None or (isinstance(tweet, float) and tweet != tweet)
        tweet = "" if is_missing else str(tweet)

    words = tweet.split()
    hashtags = len(re.findall(r"#\w+", tweet))
    mentions = len(re.findall(r"@\w+", tweet))
    urls = len(re.findall(r"https?://\S+", tweet))
    special_chars = len(re.findall(r"[!@#$%^&*()]", tweet))
    # Share of distinct whitespace-separated tokens; guarded against empty text
    lexical_diversity = len(set(words)) / len(words) if words else 0

    return {
        "tweet_length": len(tweet),
        "word_count": len(words),
        "hashtags_count": hashtags,
        "mentions_count": mentions,
        "urls_count": urls,
        "special_chars_count": special_chars,
        "lexical_diversity": lexical_diversity,
        "sentiment_score": TextBlob(tweet).sentiment.polarity,
    }

# Apply feature extraction and merge with original dataset.
# Building the frame once from a list of dicts avoids `.apply(pd.Series)`, which
# creates a Series per row (slow) and upcasts the integer counts to floats.
feature_records = [extract_text_features(text) for text in df["tweet"]]
features_df = pd.DataFrame(feature_records, index=df.index)
df = pd.concat([df, features_df], axis=1)

# Save the feature-enriched dataset
df.to_csv("tweet_features_combined.csv", index=False)

print("Tweet features extracted and merged into 'tweet_features_combined.csv'")


Tweet features extracted and merged into 'tweet_features_combined.csv'


In [None]:
# Show the tweets together with the feature columns added by the previous cell
df

Unnamed: 0,username,tweet,created_at,tweet_length,word_count,hashtags_count,mentions_count,urls_count,special_chars_count,lexical_diversity,sentiment_score
0,capsysteam,Introducing DynamicOCR by CAPSYS Technologies!...,2025-03-06 15:01:43+00:00,284.0,35.0,3.0,0.0,2.0,5.0,0.914286,0.500000
1,ironumdigital,AI's latest leap? Seven new billionaires thank...,2025-03-06 15:01:43+00:00,172.0,24.0,0.0,0.0,1.0,0.0,1.000000,0.176623
2,odugbopeter91,"@partofdream_POD Hey @PublicAIData, sounds lik...",2025-03-06 15:01:43+00:00,208.0,31.0,1.0,2.0,0.0,5.0,0.935484,0.337500
3,Toneworks99,@davidpugliese @OttawaCitizen Billion dollar j...,2025-03-06 15:01:43+00:00,111.0,16.0,0.0,2.0,0.0,2.0,1.000000,0.450000
4,Tany212121,"Hey Phronians, I have just claimed my first Ph...",2025-03-06 15:01:43+00:00,128.0,21.0,0.0,0.0,1.0,1.0,0.952381,0.312500
...,...,...,...,...,...,...,...,...,...,...,...
95,MdRezaul766285,@imagesaicouldnt Looks like even AI needs a li...,2025-03-06 15:01:26+00:00,151.0,26.0,0.0,2.0,0.0,4.0,0.961538,-0.292969
96,GonzalezTheas,✈️🌈🚀 FDV Surge Alert ✈️✨🚀\n👑 $CFC \n🔮 ALB9vFY3...,2025-03-06 15:01:26+00:00,218.0,28.0,0.0,0.0,2.0,6.0,0.964286,0.566667
97,Sayhio_,I just started earning on @nodepay_ai 💵\n\nWan...,2025-03-06 15:01:26+00:00,119.0,16.0,1.0,1.0,1.0,2.0,1.000000,0.100000
98,Dariban22,@hubdotxyz I hunt for early alpha like a squir...,2025-03-06 15:01:26+00:00,155.0,25.0,1.0,2.0,0.0,5.0,0.920000,-0.187500


In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN

# Read back the feature-enriched tweets written by the extraction step
df = pd.read_csv("tweet_features_combined.csv")

# Numeric columns that feed the clustering
features = [
    "tweet_length",
    "word_count",
    "hashtags_count",
    "mentions_count",
    "urls_count",
    "special_chars_count",
    "lexical_diversity",
    "sentiment_score",
]

# Standardise the features so no single column dominates the distance metric
scaler = StandardScaler().fit(df[features])
df_scaled = scaler.transform(df[features])

# Fit DBSCAN on the scaled matrix and keep one cluster label per row
dbscan = DBSCAN(eps=1.5, min_samples=5).fit(df_scaled)
df["cluster"] = dbscan.labels_

# DBSCAN marks noise points with the label -1; those rows are flagged as potential bots
df["is_bot"] = df["cluster"].eq(-1)

# Persist the labelled data set
df.to_csv("dbscan_twitter_bot_results.csv", index=False)

print("DBSCAN clustering complete! Results saved in 'dbscan_twitter_bot_results.csv'")

# Summary: number of rows flagged vs. not flagged
print(df["is_bot"].value_counts())


DBSCAN clustering complete! Results saved in 'dbscan_twitter_bot_results.csv'
is_bot
False    61
True     39
Name: count, dtype: int64


In [None]:
# Reload the saved clustering results from the same relative path they were
# written to, instead of the Colab-only absolute /content path.
data = pd.read_csv("dbscan_twitter_bot_results.csv")
data

Unnamed: 0,username,tweet,created_at,tweet_length,word_count,hashtags_count,mentions_count,urls_count,special_chars_count,lexical_diversity,sentiment_score,cluster,is_bot
0,capsysteam,Introducing DynamicOCR by CAPSYS Technologies!...,2025-03-06 15:01:43+00:00,284.0,35.0,3.0,0.0,2.0,5.0,0.914286,0.500000,-1,True
1,ironumdigital,AI's latest leap? Seven new billionaires thank...,2025-03-06 15:01:43+00:00,172.0,24.0,0.0,0.0,1.0,0.0,1.000000,0.176623,0,False
2,odugbopeter91,"@partofdream_POD Hey @PublicAIData, sounds lik...",2025-03-06 15:01:43+00:00,208.0,31.0,1.0,2.0,0.0,5.0,0.935484,0.337500,1,False
3,Toneworks99,@davidpugliese @OttawaCitizen Billion dollar j...,2025-03-06 15:01:43+00:00,111.0,16.0,0.0,2.0,0.0,2.0,1.000000,0.450000,1,False
4,Tany212121,"Hey Phronians, I have just claimed my first Ph...",2025-03-06 15:01:43+00:00,128.0,21.0,0.0,0.0,1.0,1.0,0.952381,0.312500,0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,MdRezaul766285,@imagesaicouldnt Looks like even AI needs a li...,2025-03-06 15:01:26+00:00,151.0,26.0,0.0,2.0,0.0,4.0,0.961538,-0.292969,1,False
96,GonzalezTheas,✈️🌈🚀 FDV Surge Alert ✈️✨🚀\n👑 $CFC \n🔮 ALB9vFY3...,2025-03-06 15:01:26+00:00,218.0,28.0,0.0,0.0,2.0,6.0,0.964286,0.566667,-1,True
97,Sayhio_,I just started earning on @nodepay_ai 💵\n\nWan...,2025-03-06 15:01:26+00:00,119.0,16.0,1.0,1.0,1.0,2.0,1.000000,0.100000,0,False
98,Dariban22,@hubdotxyz I hunt for early alpha like a squir...,2025-03-06 15:01:26+00:00,155.0,25.0,1.0,2.0,0.0,5.0,0.920000,-0.187500,1,False
