# SVM

### Import libraries

In [68]:
from time import time
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
import pickle

import warnings
warnings.filterwarnings('ignore')

In [69]:
# Relative paths to the cleaned tweet datasets. Despite the FOLDER_ prefix,
# each constant points at a single CSV file (they are passed to pd.read_csv).
FOLDER_PATH_2022 = '../../data/cleaned_data/cleaned_text_data_2022.csv'
FOLDER_PATH_2021 = '../../data/cleaned_data/clean_text_2021.csv'

### Load data

In [70]:
# Read both cleaned datasets into DataFrames in one pass.
df_2022, df_2021 = (
    pd.read_csv(csv_path) for csv_path in (FOLDER_PATH_2022, FOLDER_PATH_2021)
)

# Display (rows, columns) of each frame as a quick sanity check.
df_2022.shape, df_2021.shape

((7903, 4), (13521, 3))

### Preprocessing

In [71]:
# Give the 2021 labels the same column name as the 2022 data so both frames
# can go through the same pipeline.
# The guard plus the non-inplace assign/drop makes this cell safe to re-run:
# the earlier copy-then-drop(inplace=True) version raised KeyError on a second
# run because 'label' had already been removed.
if 'label' in df_2021.columns:
    df_2021 = df_2021.assign(real_fake_grade=df_2021['label']).drop(columns='label')

### Model

In [74]:
def retrieve_data(column_name: str, data_name: pd.DataFrame,
                  label_column: str = 'real_fake_grade',
                  test_size: float = 0.25, random_state: int = 42):
    """Split one text column into train/test sets and TF-IDF vectorize it.

    The split is stratified on the label column. The vectorizer is fitted on
    the training texts only and then applied to the test texts.

    Args:
        column_name: Name of the column holding the raw text.
        data_name: DataFrame containing ``column_name`` and ``label_column``.
        label_column: Name of the target column.
        test_size: Fraction of rows held out for testing.
        random_state: Seed for the shuffle, so the split is reproducible.

    Returns:
        ``(X_train, y_train), (X_test, y_test)`` where the ``X`` parts are the
        TF-IDF matrices and the ``y`` parts are the matching label Series.

    Raises:
        KeyError: If ``column_name`` or ``label_column`` is not a column of
            ``data_name``.

    NOTE(review): the fitted vectorizer is local to this function and is not
    returned, so a model trained on these features cannot transform new raw
    text later. Consider persisting the vectorizer alongside the model.
    """
    # Check both required columns up front so the error names what is missing.
    missing = [col for col in (column_name, label_column)
               if col not in data_name.columns]
    if missing:
        raise KeyError(f"{', '.join(missing)} not in dataframe")

    # Build the feature (X) and target (y) series.
    texts = data_name[column_name]
    labels = data_name[label_column]

    X_train, X_test, y_train, y_test = train_test_split(
        texts, labels,
        test_size=test_size, random_state=random_state, stratify=labels,
    )

    # Fit on the training texts only; the test texts are transformed with the
    # vocabulary and weights learned from the training split.
    vectorizer = TfidfVectorizer()
    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)

    return (X_train, y_train), (X_test, y_test)

def get_best_params(train_data, test_data):
    """Grid-search SVC hyperparameters and return the best combination.

    Runs a 7-fold cross-validated ``GridSearchCV`` (``n_jobs=-1``) over a
    fixed grid of ``probability``, ``C``, ``gamma`` and ``kernel`` values.

    Args:
        train_data: Feature matrix, passed to ``fit`` as ``X``.
        test_data: Target values, passed to ``fit`` as ``y``. Despite the
            name these are the labels that belong to ``train_data``.

    Returns:
        The ``best_params_`` dict of the fitted search.
    """
    param_grid = dict(
        probability=[True, False],
        C=[0.01, 0.1, 1, 10, 100, 1000],
        gamma=[1, 'auto', 'scale'],
        kernel=['rbf', 'linear', 'poly', 'sigmoid'],
    )

    search = GridSearchCV(SVC(), param_grid, cv=7, n_jobs=-1)

    # fit() returns the fitted search object itself.
    return search.fit(train_data, test_data).best_params_
    

#### Tweet

##### Dataset 2022

In [75]:
# Vectorized train/test split of the raw tweet text ('full_text') from the 2022 data.
(X_train, y_train), (X_test, y_test) = retrieve_data('full_text', df_2022)

In [76]:
# Sanity check: (features, labels) shapes of the train and test splits.
(X_train.shape, y_train.shape), (X_test.shape, y_test.shape)

(((5927, 14679), (5927,)), ((1976, 14679), (1976,)))

In [21]:
# Grid search on the tweet-text training split; the reported best combination
# is hard-coded into the SVC constructed in the next cell.
# NOTE(review): the variable is called user_info_* but holds the parameters
# for the tweet-text features.
user_info_best_params = get_best_params(X_train, y_train)
user_info_best_params

{'C': 10, 'gamma': 1, 'kernel': 'rbf', 'probability': True}

In [77]:
# Hyperparameters are the best combination reported by the grid search above.
svc_tweet_2022 = SVC(kernel='rbf', C=10, gamma=1, probability=True)

# Wall-clock the training step.
start = time()
svc_tweet_2022.fit(X_train, y_train)
end = time()
svc_time = end - start

In [78]:
# Time inference on the held-out split.
start = time()
# Keep the predictions in their own variable. Assigning them to
# `svc_tweet_2022` replaced the fitted model with an array, so re-running this
# cell failed and the later pickle step saved predictions instead of the model.
pred = svc_tweet_2022.predict(X_test)
end = time()

svc_tweet_2022_test_time = end - start
svc_tweet_2022_acc = accuracy_score(y_test, pred)

In [79]:
# Report timings (seconds) and accuracy for the 2022 tweet-text model.
print(
    f"Train time: {svc_time}",
    f"Test time: {svc_tweet_2022_test_time}",
    f"Accuracy: {svc_tweet_2022_acc}",
    sep="\n",
)

Train time: 42.79423785209656
Test time: 1.9230480194091797
Accuracy: 0.763663967611336


##### Dataset 2021

In [80]:
# Vectorized train/test split of the tweet text ('text') from the 2021 data.
# This overwrites the 2022 split variables.
(X_train, y_train), (X_test, y_test) = retrieve_data('text', df_2021)

In [81]:
# NOTE(review): C=25 is not one of the values in the grid used for the 2022
# data, and no grid search on the 2021 data is visible in this notebook.
# Confirm where this value came from.
svc_tweet_2021 = SVC(kernel='rbf', C=25, gamma=1, probability=True)

# Wall-clock the training step.
start = time()
svc_tweet_2021.fit(X_train, y_train)
end = time()
svc_tweet_time_2021 = end - start

In [82]:
# Time inference on the held-out 2021 split, then score the predictions.
start = time()
pred = svc_tweet_2021.predict(X_test)
end = time()

tweet_acc_score_2021 = accuracy_score(y_test, pred)
tweet_2021_test_time = end - start

In [83]:
# Report timings (seconds) and accuracy for the 2021 tweet-text model.
print(
    f"Train time: {svc_tweet_time_2021}",
    f"Test time: {tweet_2021_test_time}",
    f"Accuracy: {tweet_acc_score_2021}",
    sep="\n",
)

Train time: 91.1470558643341
Test time: 3.3451499938964844
Accuracy: 0.8402839396628217


**Insights**

- De dataset van 2021 behaalt een hogere accuracy (0,84 tegenover 0,76), waarschijnlijk doordat deze bijna twee keer zoveel data bevat (13.521 tegenover 7.903 rijen).

#### Text metadata

In [84]:
# Vectorized train/test split of the 'text_metadata' column from the 2022 data.
(X_train, y_train), (X_test, y_test) = retrieve_data('text_metadata', df_2022)

In [29]:
# Grid search on the text-metadata training split; the reported best
# combination is hard-coded into the SVC constructed in the next cell.
user_info_best_params = get_best_params(X_train, y_train)
user_info_best_params

{'C': 10, 'gamma': 'scale', 'kernel': 'rbf', 'probability': True}

In [85]:
# Hyperparameters are the best combination reported by the grid search above.
svc_text_metadata = SVC(kernel='rbf', C=10, gamma='scale', probability=True)

# Wall-clock the training step.
start = time()
svc_text_metadata.fit(X_train, y_train)
end = time()
text_metadata_time = end - start

In [86]:
# Time inference on the held-out text-metadata split, then score the predictions.
start = time()
pred = svc_text_metadata.predict(X_test)
end = time()

text_metadata_acc = accuracy_score(y_test, pred)
svc_test_metadata_test_time = end - start

In [87]:
# Report timings (seconds) and accuracy for the text-metadata model.
print(
    f"Train time: {text_metadata_time}",
    f"Test time: {svc_test_metadata_test_time}",
    f"Accuracy: {text_metadata_acc}",
    sep="\n",
)

Train time: 14.951473236083984
Test time: 0.7032508850097656
Accuracy: 0.7252024291497976


#### Combined

In [88]:
# Vectorized train/test split of the 'combined' column from the 2022 data.
(X_train, y_train), (X_test, y_test) = retrieve_data('combined', df_2022)

In [34]:
# Grid search on the combined-feature training split; the reported best
# combination is hard-coded into the SVC constructed in the next cell.
combined_best_params = get_best_params(X_train, y_train)
combined_best_params

{'C': 1, 'gamma': 1, 'kernel': 'linear', 'probability': True}

In [89]:
# Hyperparameters are the best combination reported by the grid search above.
# gamma is kept to mirror that result; scikit-learn documents it as the
# coefficient for the 'rbf', 'poly' and 'sigmoid' kernels only.
svc_combined = SVC(kernel='linear', C=1, gamma=1, probability=True)

# Wall-clock the training step.
start = time()
svc_combined.fit(X_train, y_train)
end = time()
# (sic) the reporting cell reads this exact, misspelled name.
conbimed_time = end - start

In [90]:
# Time inference on the held-out combined split, then score the predictions.
start = time()
pred = svc_combined.predict(X_test)
end = time()

svc_combined_acc = accuracy_score(y_test, pred)
svc_combined_test_time = end - start

In [92]:
# Report timings (seconds) and accuracy for the combined-feature model.
print(
    f"Train time: {conbimed_time}",
    f"Test time: {svc_combined_test_time}",
    f"Accuracy: {svc_combined_acc}",
    sep="\n",
)

Train time: 35.44808387756348
Test time: 1.6490418910980225
Accuracy: 0.8061740890688259


### Resultaten

| Model         | Accuracy | Training time (s) | Test time (s) |
|---------------|----------|-------------------|---------------|
| Tweet 2021    | 0.84     | 91                | 3.3           |
| Tweet 2022    | 0.76     | 43                | 1.9           |
| Text metadata | 0.73     | 15                | 0.7           |
| Combined      | 0.81     | 35                | 1.6           |

### Conclusie

De dataset van 2021 behaalt de hoogste accuracy (84%), maar wordt niet gebruikt omdat deze geen metadata bevat. Het gecombineerde model behaalt op de dataset van 2022 een accuracy van 81%, hoger dan de modellen op alleen de tweettekst (76%) of alleen de tekstmetadata (73%).

### Save model

In [67]:
# Persist the fitted models, one pickle file per model.
models_to_save = {
    "./results/svm/svm_model_combined.pickle": svc_combined,
    "./results/svm/svm_model_text_metadata.pickle": svc_text_metadata,
    "./results/svm/svm_model_tweet_2022.pickle": svc_tweet_2022,
}

for model_path, model in models_to_save.items():
    # Fail loudly instead of silently pickling something that is not a model
    # (for example an array of predictions stored under a model's name).
    if not hasattr(model, 'predict'):
        raise TypeError(
            f"Object for {model_path} is a {type(model).__name__}, not a fitted model"
        )
    # The `with` block closes the file handle even if pickling raises; the
    # previous bare open(...) calls never closed their handles.
    with open(model_path, 'wb') as model_file:
        pickle.dump(model, model_file)