In [39]:
import pandas as pd
from nltk.stem import WordNetLemmatizer
import nltk
import unicodedata
import re
from gensim.models import Word2Vec

In [11]:
relativePath = "../data/kawintiranon-stance-detection/" 
file = "biden_stance_train_public.csv"
path = relativePath + file

In [12]:
def load_dataset(file):
    relativePath = "../data/kawintiranon-stance-detection/"
    return pd.read_csv(relativePath+file)

In [30]:
lemmatizer = WordNetLemmatizer()

def clean_tweet(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore').lower()
    text = re.sub(r'#[^ ]+', '', text)  # removing hashtags
    text = re.sub(r'https.+', '', text)  # removing links
    text = re.sub(r'@[^ ]+', '', text)  # removing mentions
    text = re.sub(r'[^(a-zA-Z)\s]', '', text) # removing special characters
    return text

def filter_stopwords_and_lemmatize(text):
    stopwords = nltk.corpus.stopwords.words('english')
    words = text.split() 
    filtered_words = [word for word in words if word not in stopwords] 
    lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_words]
    return " ".join(lemmatized_words)

In [31]:
biden_train = load_dataset("biden_stance_train_public.csv")
trump_train = load_dataset("trump_stance_train_public.csv")

biden_test = load_dataset("biden_stance_test_public.csv")
trump_test = load_dataset("trump_stance_test_public.csv")

all_datasets = [biden_train, trump_train, biden_test, trump_test]

In [32]:
biden_train

Unnamed: 0,tweet_id,text,label
0,1311162417938919424,imma keep it real with y’all i don’t think the...,NONE
1,1299361296400429057,@USER guess the #cult45 #trumpdrunkmorons can ...,NONE
2,1239350672782483456,joe “let’s be reasonable guys not everybody ca...,NONE
3,1299022276571140099,pelosi says biden shouldn't debate trump: 'i w...,AGAINST
4,1295359993655578624,"@USER hey @USER, here’s what your own party th...",NONE
...,...,...,...
870,1296637794966745088,i’m excited to hear from hunter biden’s family...,NONE
871,1295317559802302466,@USER your still talking about trump? get over...,AGAINST
872,1292661159078723590,@USER we will...thank you for checking\n#biden...,FAVOR
873,1298457178773184512,"i thought you were not doing bad, but then the...",NONE


In [33]:
def stance_to_sentiment(stance):
    if (stance == "NONE"): return 0
    elif (stance == "AGAINST"): return -1
    elif (stance == "FAVOR"): return 1

In [34]:
for dataset in all_datasets:
    dataset["sentiment"] = dataset["label"].apply(stance_to_sentiment)
    dataset["cleaned_text"] = dataset["text"].apply(clean_tweet)
    dataset["lemmatized_text"] = dataset["cleaned_text"].apply(filter_stopwords_and_lemmatize)

In [36]:
biden_train[["text", "cleaned_text", "lemmatized_text"]]

Unnamed: 0,text,cleaned_text,lemmatized_text
0,imma keep it real with y’all i don’t think the...,imma keep it real with yall i dont think the ...,imma keep real yall dont think twitch
1,@USER guess the #cult45 #trumpdrunkmorons can ...,guess the can pick their poison unlike jim ...,guess pick poison unlike jim jones offered one...
2,joe “let’s be reasonable guys not everybody ca...,joe lets be reasonable guys not everybody can ...,joe let reasonable guy everybody get medicare ...
3,pelosi says biden shouldn't debate trump: 'i w...,pelosi says biden shouldnt debate trump i woul...,pelosi say biden shouldnt debate trump wouldnt...
4,"@USER hey @USER, here’s what your own party th...",hey heres what your own party thinks of you ...,hey here party think billboard paid world see ...
...,...,...,...
870,i’m excited to hear from hunter biden’s family...,im excited to hear from hunter bidens family a...,im excited hear hunter bidens family http
871,@USER your still talking about trump? get over...,your still talking about trump get over it so...,still talking trump get sonnyboy he going win
872,@USER we will...thank you for checking\n#biden...,we willthank you for checking\n,willthank checking
873,"i thought you were not doing bad, but then the...",i thought you were not doing bad but then the ...,thought bad lie came like


# Uczenie modelu dla word2vec

In [152]:
from sklearn import svm
import numpy as np
from sklearn.metrics import f1_score
from sklearn.linear_model import LinearRegression

In [37]:
model_path = "../models/word2vec.model"

In [41]:
w2v_model = Word2Vec.load(model_path)

In [43]:
svc = svm.SVC(kernel='linear', C=1, probability=True) 

In [133]:
vec_size = 100


In [134]:
def text_to_vector(text):
    tokens = text.split()
    vec = np.zeros(vec_size).reshape((1, vec_size))
    count = 0
    for word in tokens:
        try:
            vec += w2v_model.wv[word].reshape((1, vec_size))
            count += 1.
        except KeyError:  # handling the case where the token is not in vocabulary
            continue
    if count != 0:
        vec /= count
    return vec

In [136]:
def text_column_into_w2v_array(column):
    wordvec_arrays = np.zeros((len(column), vec_size)) 
    for i in range(len(column)):
        wordvec_arrays[i,:] = text_to_vector(column[i])
    return pd.DataFrame(wordvec_arrays)

In [137]:
biden_xtrain = text_column_into_w2v_array(biden_train["lemmatized_text"]).iloc[:875,:]

In [153]:
biden_ytrain = biden_train["sentiment"]

In [155]:
trump_xtrain = text_column_into_w2v_array(trump_train["lemmatized_text"]).iloc[:875,:]

In [154]:
trump_ytrain = trump_train["sentiment"]

In [163]:
biden_lr = LinearRegression().fit(biden_xtrain, biden_ytrain)
biden_lr.score(biden_xtrain, biden_ytrain)

0.1784152378467374

In [164]:
trump_lr = LinearRegression().fit(trump_xtrain, trump_ytrain)
trump_lr.score(trump_xtrain, trump_ytrain)

0.16081632975601856

# Sprawdzenie poprawności działania regressor

In [143]:
biden_xtest = text_column_into_w2v_array(biden_test['lemmatized_text'])

In [144]:
biden_ytest = biden_test["sentiment"]

In [156]:
trump_xtest = text_column_into_w2v_array(trump_test['lemmatized_text'])

In [157]:
trump_ytest = trump_test["sentiment"]

In [123]:
def number_to_class(number):
    if number >= 0.1:
        return 1
    elif number <= -0.1:
        return -1
    else: return 0

In [150]:
def calculate_accuracy_for_model(model, x, y):
    model_output = pd.DataFrame(model.predict(x))
    model_output["classified"] = output[0].apply(number_to_class)
    return (y == model_output["classified"]).sum()/len(y)

In [151]:
calculate_accuracy_for_model(biden_lr, biden_xtest, biden_ytest)

0.37066666666666664

In [165]:
calculate_accuracy_for_model(trump_lr, trump_xtest, trump_ytest)

0.29333333333333333

# Zapis modeli

In [167]:
from joblib import dump, load

In [None]:
dump(biden_lr, '../models/biden_lr.joblib') 

In [None]:
dump(trump_lr, '../models/trump_lr.joblib') 