In [96]:
import spacy

nlp = spacy.load("fr_core_news_lg")

In [None]:
from pathlib import Path

import pandas as pd

DATA_FOLDER = Path(".") / "data"
PARTIES = list(DATA_FOLDER.glob("*.csv"))

dataset = pd.concat([pd.read_csv(party) for party in PARTIES]).reset_index(drop=True)

# The decision was made to aggregate all tweets of users to obtain a large corpus
# per user instead of invidivual tweets, which don't embed much political signal.
dataset = (
    dataset
        .drop_duplicates(subset=["tweet", "user_id"], keep="last")
        .groupby(["user_id", "account"])
        .agg({'tweet': ' '.join})
        .rename(columns={"tweet": "tweets"})
        .reset_index()
)
dataset

In [None]:
import string
import re

from spacy.lang.fr.stop_words import STOP_WORDS

def tokenizer(tweet):
    text = ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweet).split()).lower()
    tokens = [token.lemma_ for token in nlp(text) if token.vector.any()]

    return [
        t for t in tokens
        if t not in STOP_WORDS
        and t not in string.punctuation
        and not t.isdigit() 
        and len(t) > 3
    ]

dataset["tweets"].head().apply(tokenizer)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(tokenizer=tokenizer, ngram_range=(1, 1))
X = vectorizer.fit_transform(dataset["tweets"])

vectorizer.get_feature_names_out()

In [None]:
df = pd.DataFrame(X[1].T.todense(), index=vectorizer.get_feature_names_out(), columns=["tfidf"]).sort_values(by=["tfidf"],ascending=False)
df[df["tfidf"] > 0.2]

In [101]:
from sklearn.model_selection import train_test_split

X = dataset["tweets"]
y = dataset["account"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [102]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression()

p = Pipeline([
    ('vectorizer', vectorizer),
    ('classifier', classifier)
])

p.fit(X_train, y_train)



In [104]:
from sklearn import metrics

predicted = p.predict(X_test)

print("accuracy:", metrics.accuracy_score(y_test, predicted))

accuracy: 0.5789473684210527
