In [2]:
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.kernel_approximation import RBFSampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import pickle

In [3]:
# Read dictionary pkl file
with open('pickle-sorties/clean_train_dictionary.pkl/clean_train_dictionary.pkl', 'rb') as fp:
    clean_train_dictionary = pickle.load(fp)

with open('pickle-sorties/clean_test_dictionary.pkl', 'rb') as fp:
    clean_test_dictionary = pickle.load(fp)

In [4]:
combined_train_data = [data for key, data in clean_train_dictionary.items()]
combined_test_data = [data for key, data in clean_test_dictionary.items()]

# Dividing texts and labels 
train_texts, train_labels = [], []
test_texts, test_labels = [], []

for data in combined_train_data:
    texts, labels = zip(*data)
    train_texts.extend(texts)
    train_labels.extend(labels)

for data in combined_test_data:
    texts, labels = zip(*data)
    test_texts.extend(texts)
    test_labels.extend(labels)

## Kernel Approximation RBF

### Vectorization with CountVectorizer

In [6]:
# Creating word embeddings using CountVectorizer
# Vectorisation des textes (conversion en représentation numérique)
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_texts)
X_test = vectorizer.transform(test_texts)

# Applying kernel approximation RBF
rbf_feature = RBFSampler(gamma=0.5, random_state=69, n_components=1000)
X_train_features = rbf_feature.fit_transform(X_train)
X_test_features = rbf_feature.transform(X_test)

# Training SGD classifier on training set
clf = SGDClassifier(max_iter=10000)
clf.fit(X_train_features, train_labels)

prediction = clf.predict(X_test_features)

#### Metrics CountVectorizer

In [7]:
accuracy = accuracy_score(test_labels, prediction)
print(f"Accuracy: {accuracy}")
precision = precision_score(test_labels, prediction, average='weighted')
print(f"Precision: {precision}")
recall = recall_score(test_labels, prediction, average='weighted')
print(f"Recall: {recall}")
f1 = f1_score(test_labels, prediction, average='weighted')
print(f"F1 Score: {f1}")

Accuracy: 0.3005549103110079
Precision: 0.2647651219985524
Recall: 0.3005549103110079
F1 Score: 0.26863014605217383


### Vectorization with TfidVectorizer

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Creating word embeddings using TfidfVectorizer
vectorizer = TfidfVectorizer()
X_train_transformed = vectorizer.fit_transform(train_texts)
X_test_transformed = vectorizer.transform(test_texts)

# Applying kernel approximation RBF
rbf_feature = RBFSampler(gamma=0.5, random_state=69, n_components=1000)
X_train_features = rbf_feature.fit_transform(X_train_transformed)
X_test_features = rbf_feature.transform(X_test_transformed)

# Training SGD classifier on training set
clf = SGDClassifier(max_iter=10000)
clf.fit(X_train_features, train_labels)

predictions = clf.predict(X_test_features)

#### Metrics TfidVectorizer

In [10]:
accuracy = accuracy_score(test_labels, prediction)
print(f"Accuracy: {accuracy}")
precision = precision_score(test_labels, prediction, average='weighted')
print(f"Precision: {precision}")
recall = recall_score(test_labels, prediction, average='weighted')
print(f"Recall: {recall}")
f1 = f1_score(test_labels, prediction, average='weighted')
print(f"F1 Score: {f1}")

Accuracy: 0.3005549103110079
Precision: 0.2647651219985524
Recall: 0.3005549103110079
F1 Score: 0.26863014605217383
