In [97]:
from dotenv import load_dotenv

load_dotenv()

True

In [98]:
import os
import tweepy

client = tweepy.Client(os.getenv("TWITTER_BEARER_TOKEN"))

In [99]:
from enum import Enum
from math import floor

MLP_TWITTER_ID = 217749896
JLM_TWITTER_ID = 80820758

FOLLOWERS_COUNT = 10 # max 1000

PERSONALITIES = (MLP_TWITTER_ID, JLM_TWITTER_ID)

followers = {
    personality: [user.id for user in client.get_users_followers(personality, max_results=FOLLOWERS_COUNT).data]
    for personality in PERSONALITIES
}

In [104]:
import itertools

TWEETS_COUNT = 10 # max 200
MINIMUM_TWEET_PER_USER = 10

def filter_and_flatten(l):
    return list(itertools.chain.from_iterable(
        filter(lambda e: e is not None and len(e) >= MINIMUM_TWEET_PER_USER, l)
    ))


def tweets_from_followers(followers):
    return list(
        map(
            lambda t: t.text,
            filter_and_flatten([
                client.get_users_tweets(id=follower, max_results=TWEETS_COUNT, exclude="retweets").data
                for follower in followers
            ])
        )
    )


tweets = {
    personality: tweets_from_followers(followers[personality])
    for personality in PERSONALITIES
}

In [145]:
import re
from nltk.tokenize import TweetTokenizer

tokenizer = TweetTokenizer(
    preserve_case=False,
    strip_handles=False,
    reduce_len=False,
    match_phone_numbers=False
)

def tokenize_tweet(tweet):
    return tokenizer.tokenize(
        re.sub(
            r'https:\/\/t.co\/[a-zA-Z]+', '',
            tweet.replace("'", " ")
        )
    )


tokenized_tweets = {
    personality: [tokenize_tweet(tweet) for tweet in tweets[personality]]
    for personality in PERSONALITIES
}

tokenized_tweets

{217749896: [['@syondzi',
   '@zemmoureric',
   'et',
   'l',
   'esclavage',
   'en',
   'afrique',
   'des',
   'noirs',
   'par',
   'les',
   'arabes',
   'qui',
   'continue',
   'toujours',
   'à',
   'l',
   'heure',
   'actuelle',
   '?'],
  ['@cassejrmie1',
   '@zemmoureric',
   'petain',
   'soutenait',
   'la',
   'même',
   'politique',
   'que',
   'meluche'],
  ['@usuldufutur', 'oh', 'le', 'contre', 'son', 'camp'],
  ['@allymcallyy',
   '@marchedescalier',
   '@kassius2022',
   '@ooc_rn',
   'c',
   'est',
   'ça',
   'l',
   'électeur',
   'haineux',
   'de',
   'la',
   'france'],
  ['@jesus_nazareen',
   '@patrioteavttout',
   '@marxfanaccount',
   'moi',
   'je',
   'dis',
   '20',
   'annuités',
   ',',
   'qui',
   'dit',
   'mieux',
   '?'],
  ['@abrahel27',
   '@calmdownnowbro',
   '@marxfanaccount',
   'moi',
   'je',
   'propose',
   'la',
   'retraite',
   'à',
   '35',
   'ans',
   'avec',
   '13',
   'annuités',
   '.',
   'ce',
   'serait',
   'une',
   'pis

In [206]:
import pandas as pd

data = pd.concat([
    pd.DataFrame.from_records([
        [tweet, personality]
        for tweet in tokenized_tweets[personality]
    ], columns=["tweet", "personality"])
    for personality in PERSONALITIES
])

In [210]:
train_set = data.sample(frac=0.8)
test_set = data.drop(train_set.index)

In [254]:
from collections import Counter

train_bow = {
    personality: Counter(
        list(itertools.chain.from_iterable(train_set[train_set["personality"] == personality]["tweet"]))
    )
    for personality in PERSONALITIES
}

vocabulary = set(itertools.chain.from_iterable([
    set(train_bow[personality].elements())
    for personality in PERSONALITIES
]))

In [255]:
from math import log

log_priors = {
    personality: log(len(train_set[train_set["personality"] == personality]) / len(train_set))
    for personality in PERSONALITIES
}

log_likelihood = {
    personality: {
        word: train_bow[personality][word] + 1 / (sum(train_bow[personality].values()) + len(vocabulary))
        for word in vocabulary
    }
    for personality in PERSONALITIES
}

In [256]:
from functools import reduce

def get_prob(prior, likelihood, tweet):
    return reduce(lambda acc, word: acc + likelihood[word] if word in likelihood else 0, tweet, 0.0)

def predict(tweet):
    class_sum = {
        personality: get_prob(log_priors[personality], log_likelihood[personality], tweet)
        for personality in PERSONALITIES
    }
    return class_sum


test_v = test_set.iloc[0]
predict(test_v["tweet"])

{217749896: 9.010033444816054, 80820758: 8.00494233937397}