# Basic Twitter classifier
---

# Import
---

In [None]:
from datetime import datetime
import pandas as pd
import numpy as np

# Functions
---

## Account Properties

In [None]:
import re

In [None]:
def urlRatio(user,tweets):
    """Return the average number of URLs per tweet, multiplied by 100.

    Args:
        user: Not used by this function.
        tweets: Iterable of tweet dicts; each must provide
            ``tweet['entities']['urls']`` as a sized collection.

    Returns:
        ``total_urls / number_of_tweets * 100``, or ``0`` when ``tweets``
        is empty (the original raised ZeroDivisionError in that case).
    """
    url_total = 0
    tweet_total = 0

    # Single pass so that any iterable (list, cursor, generator) is accepted.
    for tweet in tweets:
        url_total += len(tweet['entities']['urls'])
        tweet_total += 1

    # A user without tweets has no URLs to average over.
    if tweet_total == 0:
        return 0

    return url_total / tweet_total * 100

In [None]:
def ffRatio(user):
    """Return the friends-to-followers ratio of an account.

    Args:
        user: Dict providing ``followers_count`` and ``friends_count``.

    Returns:
        ``friends_count / followers_count``, or ``0`` when the account has
        no followers.
    """
    followers = user['followers_count']
    # Guard clause: avoid dividing by zero for accounts without followers.
    if followers == 0:
        return 0
    return user['friends_count'] / followers

In [None]:
def verified(user):
    """Return the account's ``verified`` flag converted to an integer.

    Args:
        user: Dict providing a ``verified`` entry.

    Returns:
        ``int(user['verified'])`` (``1`` for ``True``, ``0`` for ``False``).
    """
    flag = user['verified']
    return int(flag)

In [None]:
def created(user):
    """Score whether the account was created inside a fixed time window.

    Args:
        user: Dict whose ``created_at`` string follows the format
            ``'%a %b %d %H:%M:%S %z %Y'``.

    Returns:
        ``100.0`` when the creation time lies strictly between the epoch
        seconds 1571090400 (2019-10-14 22:00:00 UTC) and 1573340400
        (2019-11-09 23:00:00 UTC); otherwise ``0``.
    """
    parsed = datetime.strptime(user['created_at'], '%a %b %d %H:%M:%S %z %Y')
    epoch_seconds = parsed.timestamp()
    # Both bounds are exclusive.
    return 100.0 if 1571090400 < epoch_seconds < 1573340400 else 0

In [None]:
def hashtagRatio(user,tweets):
    """Return the average number of hashtags per tweet, multiplied by 100.

    Args:
        user: Not used by this function.
        tweets: Iterable of tweet dicts; each must provide
            ``tweet['entities']['hashtags']`` as a sized collection.

    Returns:
        ``total_hashtags / number_of_tweets * 100``, or ``0`` when
        ``tweets`` is empty (the original raised ZeroDivisionError).
    """
    hashtag_total = 0
    tweet_total = 0

    # Single pass so that any iterable (list, cursor, generator) is accepted.
    for tweet in tweets:
        hashtag_total += len(tweet['entities']['hashtags'])
        tweet_total += 1

    # A user without tweets has no hashtags to average over.
    if tweet_total == 0:
        return 0

    return hashtag_total / tweet_total * 100

In [None]:
def mentionsRatio(user,tweets):
    """Return the average number of user mentions per tweet, multiplied by 100.

    Args:
        user: Not used by this function.
        tweets: Iterable of tweet dicts; each must provide
            ``tweet['entities']['user_mentions']`` as a sized collection.

    Returns:
        ``total_mentions / number_of_tweets * 100``, or ``0`` when
        ``tweets`` is empty (the original raised ZeroDivisionError).
    """
    mention_total = 0
    tweet_total = 0

    # Single pass so that any iterable (list, cursor, generator) is accepted.
    for tweet in tweets:
        mention_total += len(tweet['entities']['user_mentions'])
        tweet_total += 1

    # A user without tweets has no mentions to average over.
    if tweet_total == 0:
        return 0

    return mention_total / tweet_total * 100

In [None]:
def maxTweetsHour(user,tweets):
    """Return the largest number of tweets posted within any 3600-second span.

    The previous implementation only recorded a count when a tweet fell
    outside the current window, so the final window was never counted
    (tweets all posted within one hour yielded 0), and it required the
    tweets to arrive in ascending time order. This version sorts the
    timestamps and slides a one-hour window over them.

    Args:
        user: Not used by this function.
        tweets: Iterable of tweet dicts whose ``created_at`` string follows
            the format ``'%a %b %d %H:%M:%S %z %Y'``.

    Returns:
        The maximum count of tweets whose timestamps lie within 3600
        seconds of each other (inclusive); ``0`` when ``tweets`` is empty.
    """
    stamps = sorted(
        datetime.strptime(tweet['created_at'], '%a %b %d %H:%M:%S %z %Y').timestamp()
        for tweet in tweets
    )

    best = 0
    window_start = 0
    for window_end, stamp in enumerate(stamps):
        # Shrink the window from the left until it spans at most one hour.
        while stamp - stamps[window_start] > 3600:
            window_start += 1
        best = max(best, window_end - window_start + 1)

    return best
    

In [None]:
def retweetCount(user,tweets):
    """Return the average ``retweet_count`` per tweet, multiplied by 100.

    Args:
        user: Not used by this function.
        tweets: Iterable of tweet dicts; each must provide a numeric
            ``retweet_count`` entry.

    Returns:
        ``sum_of_retweet_counts / number_of_tweets * 100``, or ``0`` when
        ``tweets`` is empty (the original raised ZeroDivisionError).
    """
    retweet_total = 0
    tweet_total = 0

    # Single pass so that any iterable (list, cursor, generator) is accepted.
    for tweet in tweets:
        retweet_total += tweet['retweet_count']
        tweet_total += 1

    # A user without tweets has no retweets to average over.
    if tweet_total == 0:
        return 0

    return retweet_total / tweet_total * 100

In [None]:
def accountPropertiesComponent(users):
    """Compute and persist the account-property features of each user.

    For every user, up to 300 of their tweets are read from
    ``db_test.tweets``, the eight feature functions are evaluated, and the
    resulting record is upserted into ``db_test.results``.

    Args:
        users: Iterable of user dicts; each must provide ``id_str`` plus
            the fields read by the individual feature functions.

    Returns:
        List with one feature dict per user, in input order.
    """
    collected = []
    for user in users:
        cursor = db_test.tweets.find({'id_user' : user['id_str']}).limit(300).rewind()
        user_tweets = list(cursor)

        # Features are evaluated in the order the keys are listed below.
        record = {
            "id_user": user['id_str'],
            "urlRatio": urlRatio(user, user_tweets),
            "friendFollowers": ffRatio(user),
            "verified": verified(user),
            "created": created(user),
            "hashtagRatio": hashtagRatio(user, user_tweets),
            "mentionsRatio": mentionsRatio(user, user_tweets),
            "MaxTweetsHour": maxTweetsHour(user, user_tweets),
            'retweetCountRatio': retweetCount(user, user_tweets),
        }

        selector = {'id_str': record['id_user']}
        db_test.results.update_one(selector, {"$set": record}, upsert=True)
        collected.append(record)
    return collected

## Entropy Account

In [None]:
#--------------------------------------------------------------------------------------
#    Title: <title of program/source code>
#    Author: enzo-santos
#    Date: 3 may 2020
#    Availability: https://gist.github.com/DustinAlandzes/a835909ffd15b9927820d175a48dee41#gistcomment-3285038
#
#--------------------------------------------------------------------------------------

def ApEn_new(U, m, r):
    """Return ``|phi(m + 1) - phi(m)|`` for the sequence ``U``.

    ``phi(k)`` is built from all length-``k`` windows of ``U``: for each
    window, the fraction of windows whose largest element-wise absolute
    difference to it is at most ``r`` is computed, and the logarithms of
    those fractions are averaged.

    Args:
        U: One-dimensional sequence of numbers.
        m: Window length used for the comparison.
        r: Tolerance below which two windows count as similar.

    Returns:
        The absolute difference between ``phi(m + 1)`` and ``phi(m)``.
    """
    series = np.array(U)
    length = series.shape[0]

    def _phi(width):
        window_count = length - width + 1.0
        windows = np.array(
            [series[start:start + width] for start in range(int(window_count))]
        )
        # Broadcasting (n, w) against (n, 1, w) yields every pairwise difference.
        pairwise_gap = np.absolute(windows - windows[:, np.newaxis]).max(axis=2)
        similar_fraction = np.sum(pairwise_gap <= r, axis=0) / window_count
        return np.log(similar_fraction).sum() / window_count

    return abs(_phi(m + 1) - _phi(m))

In [None]:
def meassureEntropyOne(user):
    """Compute the entropy feature of a user's tweet-time distribution.

    The user's tweet timestamps are read from ``db_test.tweets``, bucketed
    into 100 equal-width segments spanning the first to the last tweet
    (plus one extra slot for the last tweet itself), and the per-segment
    counts are passed to ``ApEn_new``.

    Compared with the original, two crashes are avoided:
    * all tweets sharing one timestamp (e.g. a single tweet) produced a
      zero segment width and a ZeroDivisionError; such tweets are now all
      counted in segment 0;
    * a user without tweets raised ValueError from ``max``/``min``; the
      histogram is now left at all zeros.

    Args:
        user: Dict providing ``id_str``.

    Returns:
        ``{"id_user": <id_str>, "entropy": <ApEn_new result>}``.
    """
    cursor = db_test.tweets.find({'id_user' : user['id_str']},{'created_at':1})
    stamps = [
        datetime.strptime(doc['created_at'],'%a %b %d %H:%M:%S %z %Y').timestamp()
        for doc in cursor
    ]

    segmentNumber = 100
    segmentList = [0]*(segmentNumber+1)

    if stamps:
        first = min(stamps)
        segmentSize = int(max(stamps) - first)/segmentNumber

        for stamp in stamps:
            if segmentSize > 0:
                # Clamp so float rounding can never index past the last slot.
                segment = min(int((stamp - first)/segmentSize), segmentNumber)
            else:
                # Zero time span: every tweet belongs to the first segment.
                segment = 0
            segmentList[segment] += 1

    return {"id_user": user['id_str'], "entropy": ApEn_new(segmentList, 7, 0.3 * np.std(segmentList))}

In [None]:
def meassureEntropyAll(users):
    """Compute and persist the entropy feature for every user.

    Each user's result from ``meassureEntropyOne`` is collected and then
    upserted into ``db_test.results`` before moving on to the next user.

    Args:
        users: Iterable of user dicts accepted by ``meassureEntropyOne``.

    Returns:
        List with one entropy dict per user, in input order.
    """
    collected = []
    for user in users:
        entry = meassureEntropyOne(user)
        collected.append(entry)
        selector = {'id_str': entry['id_user']}
        db_test.results.update_one(selector, {"$set": entry}, upsert=True)
    return collected

## Spam/Ham

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

In [None]:
def prepareTweetsTrain(users):
    """Build the labelled training frame from the tweets of ``users``.

    Args:
        users: Iterable of user dicts providing ``id_str`` and ``label``.

    Returns:
        DataFrame with columns ``label`` and ``message``: one row per tweet
        found in ``db_train.tweets``, carrying its author's label and the
        tweet text.
    """
    rows = [
        [user['label'], doc['text']]
        for user in users
        for doc in db_train.tweets.find({'id_user' : user['id_str']},{'text':1,})
    ]
    return pd.DataFrame(rows, columns =['label', 'message'])

In [None]:
def prepareTweetsTest(user):
    """Build the unlabelled frame of one user's tweets.

    Args:
        user: Dict providing ``id_str``.

    Returns:
        DataFrame with columns ``label`` and ``message``: one row per tweet
        found in ``db_test.tweets``; ``label`` is always the empty string.
    """
    cursor = db_test.tweets.find({'id_user' : user['id_str']},{'text':1,})
    rows = [['', doc['text']] for doc in cursor]
    return pd.DataFrame(rows, columns =['label', 'message'])

In [None]:
def spamDetectionOne(user,tweets_test,tweets_train,bow_transformer,tfidf_transformer,spam_detect_model):
    """Return the percentage of a user's tweets that the model labels "bot".

    Args:
        user: Dict providing ``id_str``.
        tweets_test: Table whose ``message`` column holds the user's tweets.
        tweets_train: Not used by this function.
        bow_transformer: Fitted object whose ``transform`` turns messages
            into count features.
        tfidf_transformer: Fitted object whose ``transform`` re-weights
            those count features.
        spam_detect_model: Fitted classifier exposing ``predict``.

    Returns:
        ``{"id_user": <id_str>, "spam": <percentage>}``. A user without
        tweets gets ``0.0``; the original divided by ``len(prediction)``,
        which is zero for such a user.
    """
    messages = tweets_test['message']

    # Nothing to classify: report 0% instead of failing on an empty batch.
    if len(messages) == 0:
        return {"id_user": user['id_str'], "spam": 0.0}

    bow = bow_transformer.transform(messages)
    m_tfidf = tfidf_transformer.transform(bow)
    prediction = spam_detect_model.predict(m_tfidf).tolist()

    return {"id_user": user['id_str'], "spam": 100*prediction.count("bot")/len(prediction)}

In [None]:
def spamDetectionAll(test_users, train_users):
    """Train the tweet classifier and score every test user with it.

    A bag-of-words / TF-IDF / multinomial naive Bayes pipeline is fitted on
    the tweets of ``train_users``. Each test user's tweets are then
    classified, and the per-user result is upserted into
    ``db_test.results``.

    Args:
        test_users: Iterable of user dicts to score.
        train_users: Iterable of labelled user dicts used for training.

    Returns:
        List with one ``spamDetectionOne`` result per test user, in input
        order.
    """
    training_frame = prepareTweetsTrain(train_users)
    training_messages = training_frame['message']

    # Fit each stage on the training tweets, then feed its output onward.
    vectorizer = CountVectorizer().fit(training_messages)
    training_counts = vectorizer.transform(training_messages)

    weighter = TfidfTransformer().fit(training_counts)
    training_weights = weighter.transform(training_counts)

    classifier = MultinomialNB().fit(training_weights, training_frame['label'])

    scored = []
    for candidate in test_users:
        candidate_frame = prepareTweetsTest(candidate)
        outcome = spamDetectionOne(
            candidate,
            candidate_frame,
            training_frame,
            vectorizer,
            weighter,
            classifier,
        )
        scored.append(outcome)
        db_test.results.update_one({'id_str': outcome['id_user']}, {'$set': outcome}, upsert=True)
    return scored

## Bot Classification

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing

In [None]:
def botClassifier(df_test,df_train,train_labels):
    """Train a random forest on the training features and label the test users.

    The original loop ran over ``range(len(predictions)-1)`` and therefore
    silently dropped the prediction of the last user; every prediction is
    now returned.

    Args:
        df_test: Feature table of the users to classify.
        df_train: Feature table used for training.
        train_labels: Labels matching the rows of ``df_train``.

    Returns:
        List of ``{'id_str': <user id>, 'label': <predicted label>}`` dicts.
        The ids are taken from the module-level ``test_users_results`` list,
        which is assumed to be in the same row order as ``df_test``.
    """
    rf = RandomForestClassifier(n_estimators = 1000, random_state = 1)
    rf.fit(df_train, train_labels)
    predictions = rf.predict(df_test)

    # Pair each stored result row with the prediction at the same position.
    return [
        {'id_str': row['id_user'], 'label': label}
        for row, label in zip(test_users_results, predictions)
    ]

## Auxiliary

In [None]:
from collections import defaultdict 
def dict_merger(d1,d2,d3, keyname):
    """Merge three collections of dicts into one dict per key value.

    Records are processed in the order ``d1``, ``d2``, ``d3``; records
    sharing the same ``record[keyname]`` are combined with ``dict.update``,
    so later values overwrite earlier ones for duplicate fields.

    Args:
        d1: Iterable of dicts, each containing ``keyname``.
        d2: Iterable of dicts, each containing ``keyname``.
        d3: Iterable of dicts, each containing ``keyname``.
        keyname: Name of the field used to group the records.

    Returns:
        A ``dict_values`` view of the merged records, in first-seen order.
    """
    merged = defaultdict(dict)

    for records in (d1, d2, d3):
        for record in records:
            group = record[keyname]
            merged[group].update(record)

    return merged.values()

In [None]:
def set_db(df):
    """Split a results table into its feature columns and its labels.

    The original read the labels from the module-level ``df_train`` instead
    of the ``df`` argument, so it returned the wrong labels (or failed) for
    any other frame; the labels now come from ``df`` itself.

    Args:
        df: DataFrame that may contain ``label`` and the identifier columns
            ``id_str``, ``id_user`` and ``_id``.

    Returns:
        Tuple ``(features, labels)``: ``features`` is ``df`` without the
        label and identifier columns; ``labels`` is a NumPy array of
        ``df['label']``, or ``0`` when ``df`` has no ``label`` column.
    """
    labels = 0
    if 'label' in df.columns:
        labels = np.array(df['label'])

    # errors='ignore' skips any of these columns that are absent.
    df = df.drop(columns=['label', 'id_str', 'id_user', '_id'], errors='ignore')

    return df, labels


# Execution
---

## MongoDB Connection

In [None]:
from pymongo import MongoClient
from pymongo import IndexModel, ASCENDING, DESCENDING

In [None]:
# NOTE(review): the credentials are hard-coded in the notebook; consider
# loading them from the environment or a config file instead.
username = 'user'
password = 'password'

# Connection string for the MongoDB server on 127.0.0.1; the URL names
# twitterEN as its database.
url = 'mongodb://'+username+':'+password+'@127.0.0.1/twitterEN'
# Two separate clients are created from the same URL, one per database handle.
client1 = MongoClient(url)
client2 = MongoClient(url)

# db_train reads from the twitterEN database, db_test from twitterESP.
db_train = client1.twitterEN
db_test = client2.twitterESP

## Prepare Training Data and Test Data

In [None]:
# Load every user document of both databases into memory.
train_users = list(db_train.users.find({}))
test_users = list(db_test.users.find({}))

## Get User Features

In [None]:
# Account-property features of every test user (also upserted into db_test.results).
user_property = accountPropertiesComponent(test_users)

In [None]:
# Entropy feature of every test user (also upserted into db_test.results).
user_entropy = meassureEntropyAll(test_users)

In [None]:
# Spam percentage of every test user, using a model trained on the training users' tweets.
user_spam = spamDetectionAll(test_users,train_users)

In [None]:
# NOTE(review): all_users is not referenced again in the visible code.
all_users = db_test.users.find({})
# Combine the three per-user feature lists into one record per 'id_user'.
user_attributes = dict_merger(user_entropy,user_spam,user_property, 'id_user')

## Get Bots Predictions

In [None]:
# Read the stored per-user feature records back from both databases.
train_users_results = list(db_train.results.find({}))
test_users_results = list(db_test.results.find({}))

In [None]:
# Turn the stored feature records into DataFrames (one row per record).
df_train = pd.DataFrame(train_users_results)
df_test = pd.DataFrame(test_users_results) # pd.DataFrame(user_attributes)

In [None]:
# Separate the feature columns from the labels and drop the identifier columns.
df_train, train_labels = set_db(df_train)
df_test, test_labels = set_db(df_test)

In [None]:
# Train on the training features and predict a label for each test user.
results = botClassifier(df_test,df_train,train_labels)

In [None]:
# Split the predictions into accounts labelled 'bot' and all the others.
list_bots, list_humans = [], []
for item in results:
    (list_bots if item['label'] == 'bot' else list_humans).append(item)

In [None]:
# Percentage of classified accounts labelled 'bot'.
# NOTE(review): raises ZeroDivisionError when results is empty.
len(list_bots)/len(results)*100