In [58]:
import pandas as pd

In [59]:
# Load the pre-processed tweets into a two-column frame (tweet text, label).
# The first line of the CSV is discarded (presumably a header row -- TODO
# confirm against the file) and the columns are named explicitly. header=0
# does that at parse time, so the discarded line is never read as a data row
# and cannot influence dtype inference for the label column.
url = "./datasets/pre_processed_dataset.csv"
names = ["tweet", "label"]
data = pd.read_csv(url, names=names, header=0)
# Shuffle the rows. The fixed seed keeps the row order identical across
# kernel restarts, so downstream results can be reproduced.
data = data.sample(frac=1, random_state=42)

In [60]:
# Separate the model input (tweet text) from the target (label).
X, y = data['tweet'], data['label']

In [61]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for evaluation. random_state pins the split so the
# metrics computed further down can be reproduced after a kernel restart.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)

In [62]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build TF-IDF feature vectors from the tweet text.
#   min_df=5       -> ignore terms that occur in fewer than 5 documents
#   max_df=0.8     -> ignore terms that occur in more than 80% of documents
#   sublinear_tf   -> use 1 + log(tf) instead of the raw term frequency
vectorizer = TfidfVectorizer(
    use_idf=True,
    sublinear_tf=True,
    max_df=0.8,
    min_df=5,
)

# The vocabulary and IDF weights are learned from the training split only;
# the test split is merely projected onto them.
train_vectors = vectorizer.fit_transform(x_train)
test_vectors = vectorizer.transform(x_test)

# Model 1: Linear SVC version

In [81]:
import time
from sklearn import svm
from sklearn.metrics import classification_report


In [82]:
# Support vector classifier with a linear kernel and regularization
# parameter C=10, trained on the TF-IDF vectors of the training split.
classifier_linear = svm.SVC(C=10, kernel='linear')
classifier_linear.fit(train_vectors, y_train)

# Model 2: Gaussian RBF Kernel version

In [90]:
# Support vector classifier with a Gaussian RBF kernel (C=1, gamma=2**-5,
# i.e. 0.03125), trained on the same TF-IDF vectors as the linear model.
classifier_kernel = svm.SVC(gamma=2 ** -5, C=1, kernel='rbf')
classifier_kernel.fit(train_vectors, y_train)

# Performance metrics

In [91]:

# Predict labels for the held-out test vectors with both trained models
# (linear kernel first, then RBF kernel).
prediction_linear, prediction_kernel = (
    model.predict(test_vectors)
    for model in (classifier_linear, classifier_kernel)
)


### 1. Recall score

In [92]:
from sklearn.metrics import recall_score

# Recall on the held-out split, averaged over classes weighted by support.
# NOTE: for single-label classification, support-weighted recall is
# numerically the same as plain accuracy.
recall_linear = recall_score(y_test, prediction_linear, average='weighted')
print('recall score linear SVC : ', recall_linear)
recall_kernel = recall_score(y_test, prediction_kernel, average='weighted')
print('recall score kernel SVC : ', recall_kernel)

recall score linear SVC :  0.5384615384615384
recall score kernel SVC :  0.46153846153846156


### 2. F1 score

In [93]:
from sklearn.metrics import f1_score

# F1 on the held-out split with macro averaging: the unweighted mean of the
# per-class F1 scores, so every class counts equally regardless of support.
f1_linear = f1_score(y_test, prediction_linear, average='macro')
print('F1 score linear SVC : ', f1_linear)
f1_kernel = f1_score(y_test, prediction_kernel, average='macro')
print('F1 score kernel SVC : ', f1_kernel)

F1 score linear SVC :  0.5086419753086419
F1 score kernel SVC :  0.21052631578947367
