## PassiveAggressiveClassifier

### Imports

In [1]:
import pandas as pd 
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score

import pickle

### Read data

In [2]:
FOLDER_PATH_2022 = '../../data/cleaned_data/cleaned_text_data_2022.csv'
FOLDER_PATH_2021 = '../../data/cleaned_data/clean_text_2021.csv'

In [8]:
df_2021 = pd.read_csv(FOLDER_PATH_2021)
df_2022 = pd.read_csv(FOLDER_PATH_2022)

In [9]:
df_2021.isna().sum()

text              0
label             0
uncleaned_text    0
dtype: int64

In [10]:
df_2022.isna().sum()

full_text          0
real_fake_grade    0
text_metadata      0
combined           0
dtype: int64

### Preprocessing

In [11]:
df_2021['real_fake_grade'] = df_2021['label']

df_2021.drop('label', axis=1, inplace=True)

### Models

In [16]:
def get_vectorized_data(data_to_vectorize):
    vectorizer = TfidfVectorizer(max_features=10000)
    return vectorizer.fit_transform(data_to_vectorize).toarray()

def train_model_and_show_results(data_name, column_name):
    X = get_vectorized_data(data_name[column_name])
 
    X_train, X_test, y_train, y_test = train_test_split(X, data_name['real_fake_grade'], test_size=0.25, random_state=42)

    pac = PassiveAggressiveClassifier(max_iter=50)

    pac.fit(X_train,y_train)
    pred = pac.predict(X_test)

    accuracy = accuracy_score(y_test,pred)
    print("Accuracy: %0.3f"%accuracy)

    return pac

##### Tweet

Dataset 2022

In [17]:
tweet_model_2022 = train_model_and_show_results(df_2022, "full_text")

Accuracy: 0.735


Dataset 2021

In [18]:
tweet_model_2021 = train_model_and_show_results(df_2021, "text")

Accuracy: 0.789




##### Tekst metadata

In [23]:
tekst_metadata_model = train_model_and_show_results("text_metadata")

Accuracy: 0.700


##### Combined

In [24]:
combined_model = train_model_and_show_results("combined")

Accuracy: 0.792


#### Save data

In [25]:
pickle.dump(combined_model, open("./results/pac/pac-combined.pickle", 'wb'))
pickle.dump(tweet_model_2022, open("./results/pac/pac-tweets.pickle", 'wb'))
pickle.dump(tekst_metadata_model, open("./results/pac/pac-text-metadata.pickle", 'wb'))