In [2]:
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.kernel_approximation import Nystroem
from sklearn.kernel_approximation import RBFSampler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import matplotlib.pyplot as plt
import pickle

In [3]:
# Read dictionary pkl file
with open('pickle-sorties/clean_train_dictionary.pkl/clean_train_dictionary.pkl', 'rb') as fp:
    clean_train_dictionary = pickle.load(fp)

with open('pickle-sorties/clean_test_dictionary.pkl', 'rb') as fp:
    clean_test_dictionary = pickle.load(fp)

In [4]:
combined_train_data = [data for key, data in clean_train_dictionary.items()]
combined_test_data = [data for key, data in clean_test_dictionary.items()]

# Dividing texts and labels 
train_texts, train_labels = [], []
test_texts, test_labels = [], []

for data in combined_train_data:
    texts, labels = zip(*data)
    train_texts.extend(texts)
    train_labels.extend(labels)

for data in combined_test_data:
    texts, labels = zip(*data)
    test_texts.extend(texts)
    test_labels.extend(labels)

## Kernel Approximation Nystroem

### Vectorization with CountVectorizer

In [7]:
from sklearn import svm

# Creating word embeddings using CountVectorizer
vectorizer = CountVectorizer()

X_train_vectorized = vectorizer.fit_transform(train_texts)

# Mapping with Nystroem
feature_map_nystroem = Nystroem(gamma=0.5, random_state=69, n_components=1000)

X_train_transformed = feature_map_nystroem.fit_transform(X_train_vectorized)

# Training SGD classifier with our transformed data
clf = SGDClassifier(max_iter=10000)
clf.fit(X_train_transformed, train_labels)

# Transforming and mapping data
X_test_vectorized = vectorizer.transform(test_texts)
X_test_transformed = feature_map_nystroem.transform(X_test_vectorized)

# Predictions
prediction = clf.predict(X_test_transformed)

#### Metrics CountVectorizer

In [8]:
accuracy = accuracy_score(test_labels, prediction)
print(f"Accuracy: {accuracy}")
precision = precision_score(test_labels, prediction, average='weighted')
print(f"Precision: {precision}")
recall = recall_score(test_labels, prediction, average='weighted')
print(f"Recall: {recall}")
f1 = f1_score(test_labels, prediction, average='weighted')
print(f"F1 Score: {f1}")

Accuracy: 0.2888372693250742
Precision: 0.7986688780415281
Recall: 0.2888372693250742
F1 Score: 0.13975047187095907


###  Vectorization with TfidfVectorizer

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Creating word embeddings using TfidfVectorizer
vectorizer = TfidfVectorizer()

# Mapping with Nystroem
X_train_vectorized = vectorizer.fit_transform(train_texts)

# Training SGD classifier with our transformed data
feature_map_nystroem = Nystroem(gamma=0.5, random_state=69, n_components=1000)

X_train_transformed = feature_map_nystroem.fit_transform(X_train_vectorized)

# Training SGD classifier with our transformed data
clf = SGDClassifier(max_iter=10000)
clf.fit(X_train_transformed, train_labels)

# Transforming and mapping data
X_test_vectorized = vectorizer.transform(test_texts)
X_test_transformed = feature_map_nystroem.transform(X_test_vectorized)

# Predictions
prediction = clf.predict(X_test_transformed)

#### Metrics TfidVectorizer

In [10]:
accuracy = accuracy_score(test_labels, prediction)
print(f"Accuracy: {accuracy}")
precision = precision_score(test_labels, prediction, average='weighted')
print(f"Precision: {precision}")
recall = recall_score(test_labels, prediction, average='weighted')
print(f"Recall: {recall}")
f1 = f1_score(test_labels, prediction, average='weighted')
print(f"F1 Score: {f1}")

Accuracy: 0.4158988256549232
Precision: 0.44966332100699735
Recall: 0.4158988256549232
F1 Score: 0.3745857941638245
