In [54]:
import pandas as pd
import numpy as np
#from collections import Counter
#from datetime import datetime

class BotDetector:
    """Flag users whose posting behaviour looks automated.

    The detector works on a private copy of a posts DataFrame and reads the
    columns ``user_id``, ``timestamp``, ``text_id`` and ``text``. Each
    ``detect_*`` method returns the list of user ids it flagged and also adds
    them to ``self.suspicious_users``.
    """

    def __init__(self, df):
        """Copy *df* and coerce its ``timestamp`` column to a numeric dtype."""
        self.df = df.copy()
        self.df['timestamp'] = pd.to_numeric(self.df['timestamp'])
        self.suspicious_users = set()

    def detect_rapid_posting(self, min_posts=3, time_window=10):
        """Detect users posting many times within short time windows.

        A user is flagged when any ``min_posts`` consecutive posts (ordered by
        timestamp) span at most ``time_window``, measured in whatever unit the
        ``timestamp`` column uses.

        Args:
            min_posts: Number of posts that must fall inside one window.
            time_window: Maximum allowed difference between the first and the
                last timestamp of a window.

        Returns:
            List of flagged user ids, in group (sorted ``user_id``) order.

        Raises:
            ValueError: If ``min_posts`` is smaller than 1.
        """
        if min_posts < 1:
            raise ValueError("min_posts must be at least 1")

        rapid_posters = []
        for user_id, posts in self.df.groupby('user_id'):
            # Only posts with a non-null text_id count toward the minimum.
            if posts['text_id'].count() < min_posts:
                continue

            timestamps = sorted(posts['timestamp'].tolist())
            # The last valid window starts at len - min_posts, so the range
            # must include that index; otherwise a user with exactly
            # ``min_posts`` posts would never be examined.
            window_starts = range(len(timestamps) - min_posts + 1)
            if any(
                timestamps[start + min_posts - 1] - timestamps[start] <= time_window
                for start in window_starts
            ):
                rapid_posters.append(user_id)

        self.suspicious_users.update(rapid_posters)
        return rapid_posters

    def detect_duplicate_content(self, similarity_threshold=0.9, min_duplicates=3):
        """Find users posting very similar content multiple times.

        Every pair of a user's posts is compared with
        ``difflib.SequenceMatcher``; the user is flagged as soon as
        ``min_duplicates`` pairs reach ``similarity_threshold``. Note that the
        count is over *pairs* of posts, not over individual posts.

        Args:
            similarity_threshold: Minimum ``SequenceMatcher`` ratio (0..1) for
                two texts to count as similar.
            min_duplicates: Number of similar pairs required to flag a user;
                users with fewer posts than this are skipped outright.

        Returns:
            List of flagged user ids, in group (sorted ``user_id``) order.
        """
        from difflib import SequenceMatcher
        from itertools import combinations

        def is_similar(text1, text2):
            matcher = SequenceMatcher(None, text1, text2)
            # real_quick_ratio() and quick_ratio() are cheap upper bounds on
            # ratio(), so checking them first only skips pairs that could not
            # pass anyway; the outcome is identical to testing ratio() alone.
            return (
                matcher.real_quick_ratio() >= similarity_threshold
                and matcher.quick_ratio() >= similarity_threshold
                and matcher.ratio() >= similarity_threshold
            )

        duplicate_posters = []
        for user_id, posts in self.df.groupby('user_id'):
            if len(posts) < min_duplicates:
                continue

            # Convert once instead of on every pairwise comparison.
            texts = [str(text) for text in posts['text']]
            similar_pairs = 0

            for first, second in combinations(texts, 2):
                if is_similar(first, second):
                    similar_pairs += 1
                    if similar_pairs >= min_duplicates:
                        duplicate_posters.append(user_id)
                        break  # enough evidence; skip the remaining pairs

        self.suspicious_users.update(duplicate_posters)
        return duplicate_posters

    def detect_periodic_posting(self, min_posts=10, variance_threshold=0.1):
        """Identify users posting at suspiciously regular intervals.

        For each user with at least ``min_posts`` posts, the gaps between
        consecutive (sorted) timestamps are computed and the user is flagged
        when their coefficient of variation (std / mean) is below
        ``variance_threshold``.

        Args:
            min_posts: Minimum number of posts a user needs to be evaluated.
            variance_threshold: Upper bound (exclusive) on the coefficient of
                variation of the posting intervals.

        Returns:
            List of flagged user ids, in group (sorted ``user_id``) order.
        """
        periodic_posters = []

        for user_id, posts in self.df.groupby('user_id'):
            if len(posts) < min_posts:
                continue

            intervals = np.diff(sorted(posts['timestamp']))
            if intervals.size == 0:
                # Fewer than two posts: there is no interval to measure.
                continue

            mean_interval = np.mean(intervals)
            if mean_interval == 0:
                # All posts share one timestamp, so std / mean would be 0 / 0.
                # The coefficient of variation is undefined; do not flag here.
                continue

            # Coefficient of variation: spread of the gaps relative to their size.
            cv = np.std(intervals) / mean_interval
            if cv < variance_threshold:
                periodic_posters.append(user_id)

        self.suspicious_users.update(periodic_posters)
        return periodic_posters

    def get_bot_likelihood_scores(self):
        """Calculate a bot likelihood score for each user.

        Runs all three detectors with their default parameters; a user's score
        is the fraction of detectors that flagged them (0, 1/3, 2/3 or 1).

        Returns:
            DataFrame with columns ``user_id`` and ``bot_likelihood``, sorted
            by ``bot_likelihood`` in descending order. The columns are present
            even when the input frame has no rows.
        """
        flagged_by_check = (
            set(self.detect_rapid_posting()),
            set(self.detect_duplicate_content()),
            set(self.detect_periodic_posting()),
        )
        total_checks = len(flagged_by_check)

        rows = [
            {
                'user_id': user,
                'bot_likelihood': sum(user in flagged for flagged in flagged_by_check)
                / total_checks,
            }
            for user in self.df['user_id'].unique()
        ]

        # Explicit columns keep sort_values from raising KeyError on an empty frame.
        return pd.DataFrame(rows, columns=['user_id', 'bot_likelihood']).sort_values(
            'bot_likelihood', ascending=False
        )

In [57]:
# Load the JSON dataset into a DataFrame.
# NOTE(review): BotDetector reads the 'user_id', 'timestamp', 'text_id' and
# 'text' columns, so the file presumably provides them — confirm.
# NOTE(review): json_normalize and json are not used in the visible code.
from pandas import json_normalize
import json

df = pd.read_json('../data/dataset.json')

# Create a detector instance and score every user. The result is a DataFrame
# of per-user bot-likelihood scores, sorted from highest to lowest.
detector = BotDetector(df)
suspicious_users = detector.get_bot_likelihood_scores()

# View top suspicious users
#print(suspicious_users.head(10))

# For specific behaviors, run the individual detection methods.
# NOTE: get_bot_likelihood_scores() above already ran all three detectors, so
# these calls repeat that work in order to capture each list separately.
rapid_posters = detector.detect_rapid_posting()
duplicate_posters = detector.detect_duplicate_content()
periodic_posters = detector.detect_periodic_posting()

# Show the user ids flagged for posting near-duplicate content.
print(duplicate_posters)

[1005036246, 1007621559, 1007690223, 1010626427, 1013703855, 1018419764, 1031499948, 1045532631, 1049151527, 1050902549, 1050929533, 1055454345, 1056254952, 1061458474, 1061862662, 1064632123, 1064791428, 1068051022, 1072645177, 1077881120, 1081539622, 1099719517]
