In [1]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.kernel_approximation import RBFSampler, Nystroem
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

In [2]:
# Read dictionary pkl file
with open('pickle-sorties/dirty_train_dictionary.pkl/dirty_train_dictionary.pkl', 'rb') as fp:
    dirty_train_dictionary = pickle.load(fp)

with open('pickle-sorties/dirty_test_dictionary.pkl/dirty_test_dictionary.pkl', 'rb') as fp:
    dirty_test_dictionary = pickle.load(fp)

In [3]:
combined_train_data = [data for key, data in dirty_train_dictionary.items()]
combined_test_data = [data for key, data in dirty_test_dictionary.items()]

# Dividing texts and labels 
train_texts, train_labels = [], []
test_texts, test_labels = [], []

for data in combined_train_data:
    texts, labels = zip(*data)
    train_texts.extend(texts)
    train_labels.extend(labels)

for data in combined_test_data:
    texts, labels = zip(*data)
    test_texts.extend(texts)
    test_labels.extend(labels)

## Kernel Approximation RBF

### Vectorization with CountVectorizer
The best scores we've found with this classifier had n_components, but with this corpus it crashes our machine every time we try.  
So we chose a smaller n_components (the bigger one that doesn't crash the machine) to have the predictions.

In [4]:
# Creating word embeddings using CountVectorizer
# Vectorisation des textes (conversion en représentation numérique)
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_texts)
X_test = vectorizer.transform(test_texts)

# Applying kernel approximation RBF
rbf_feature = RBFSampler(gamma=0.5, random_state=69, n_components=1000)
X_train_features = rbf_feature.fit_transform(X_train)
X_test_features = rbf_feature.transform(X_test)

# Training SGD classifier on training set
clf = SGDClassifier(max_iter=10000)
clf.fit(X_train_features, train_labels)

prediction = clf.predict(X_test_features)

#### Metrics CountVectorizer

In [5]:
accuracy = accuracy_score(test_labels, prediction)
print(f"Accuracy: {accuracy}")
precision = precision_score(test_labels, prediction, average='weighted')
print(f"Precision: {precision}")
recall = recall_score(test_labels, prediction, average='weighted')
print(f"Recall: {recall}")
f1 = f1_score(test_labels, prediction, average='weighted')
print(f"F1 Score: {f1}")

Accuracy: 0.2436443412053168
Precision: 0.2713569241152814
Recall: 0.2436443412053168
F1 Score: 0.23937625849717695


### Vectorization with CountVectorizer

In [6]:
# Creating word embeddings using TfidfVectorizer
vectorizer = TfidfVectorizer()
X_train_transformed = vectorizer.fit_transform(train_texts)
X_test_transformed = vectorizer.transform(test_texts)

# Applying kernel approximation RBF
rbf_feature = RBFSampler(gamma=0.5, random_state=69, n_components=1000)
X_train_features = rbf_feature.fit_transform(X_train_transformed)
X_test_features = rbf_feature.transform(X_test_transformed)

# Training SGD classifier on training set
clf = SGDClassifier(max_iter=10000)
clf.fit(X_train_features, train_labels)

predictions = clf.predict(X_test_features)

#### Metrics TfidVectorizer

In [7]:
accuracy = accuracy_score(test_labels, prediction)
print(f"Accuracy: {accuracy}")
precision = precision_score(test_labels, prediction, average='weighted')
print(f"Precision: {precision}")
recall = recall_score(test_labels, prediction, average='weighted')
print(f"Recall: {recall}")
f1 = f1_score(test_labels, prediction, average='weighted')
print(f"F1 Score: {f1}")

Accuracy: 0.2436443412053168
Precision: 0.2713569241152814
Recall: 0.2436443412053168
F1 Score: 0.23937625849717695


## Kernel Approximation Nystroem

### Vectorization with CountVectorizer

In [8]:
from sklearn import svm

# Creating word embeddings using CountVectorizer
vectorizer = CountVectorizer()

X_train_vectorized = vectorizer.fit_transform(train_texts)

# Mapping with Nystroem
feature_map_nystroem = Nystroem(gamma=0.5, random_state=69, n_components=1000)

X_train_transformed = feature_map_nystroem.fit_transform(X_train_vectorized)

# Training SGD classifier with our transformed data
clf = SGDClassifier(max_iter=10000)
clf.fit(X_train_transformed, train_labels)

# Transforming and mapping data
X_test_vectorized = vectorizer.transform(test_texts)
X_test_transformed = feature_map_nystroem.transform(X_test_vectorized)

#### Metrics CountVectorizer

In [9]:
accuracy = accuracy_score(test_labels, prediction)
print(f"Accuracy: {accuracy}")
precision = precision_score(test_labels, prediction, average='weighted')
print(f"Precision: {precision}")
recall = recall_score(test_labels, prediction, average='weighted')
print(f"Recall: {recall}")
f1 = f1_score(test_labels, prediction, average='weighted')
print(f"F1 Score: {f1}")

Accuracy: 0.2436443412053168
Precision: 0.2713569241152814
Recall: 0.2436443412053168
F1 Score: 0.23937625849717695
