# **VECTORISATION DES PHRASES**

In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from collections import Counter
import json
import joblib

In [2]:
def tokenize(sentence, n=4):
  """Split a sentence into overlapping character n-grams.

  Leading and trailing whitespace is stripped first. Every window of ``n``
  consecutive characters then becomes one token, with the characters of the
  window joined by single spaces (``"abcd"`` -> ``"a b c d"``). Whitespace
  inside the sentence is kept and counts as a character.

  Args:
    sentence: Input string.
    n: Window length, in characters.

  Returns:
    The list of tokens in order of appearance; it is empty when the stripped
    sentence has fewer than ``n`` characters.
  """
  chars = sentence.strip()
  ngrams = []
  for start in range(len(chars) - n + 1):
    window = chars[start:start + n]
    ngrams.append(' '.join(window))
  return ngrams

In [3]:
def build_vocabulary(corpus, K=5000, n=4):
    """Build the token -> index vocabulary from the most frequent n-grams.

    Every sentence of ``corpus`` is tokenized with ``tokenize(sentence, n)``
    and all tokens are counted; the ``K`` most common ones are kept.

    Args:
        corpus: Iterable of sentences (strings).
        K: Maximum number of tokens to keep.
        n: n-gram size forwarded to ``tokenize``.

    Returns:
        A dict mapping each kept token to its rank (0 = most frequent) in
        ``Counter.most_common(K)``.
    """
    ngram_counts = Counter(
        token
        for sentence in corpus
        for token in tokenize(sentence, n)
    )
    return {
        token: rank
        for rank, (token, _count) in enumerate(ngram_counts.most_common(K))
    }

In [4]:
def compute_tf(tokens, vocab):
    """Compute the term-frequency vector of one tokenized sentence.

    Tokens absent from ``vocab`` are ignored. The counts are divided by the
    number of in-vocabulary tokens, so a non-zero result sums to 1.

    Args:
        tokens: Iterable of tokens for a single sentence.
        vocab: Dict mapping token -> column index.

    Returns:
        A float array of length ``len(vocab)``; it is all zeros when no token
        of the sentence belongs to the vocabulary.
    """
    counts = np.zeros(len(vocab))
    for token in tokens:
        column = vocab.get(token)
        if column is not None:
            counts[column] += 1

    total = counts.sum()
    # Skip the normalisation when nothing matched, to avoid dividing by zero.
    return counts / total if total > 0 else counts

In [5]:
def compute_tdf(corpus, vocab, n=4):
    """Compute smoothed inverse document frequencies for the vocabulary.

    For each vocabulary token the number of sentences containing it (``df``)
    is counted, then the weight is ``log((N + 1) / (df + 1)) + 1`` with ``N``
    the number of sentences in ``corpus``.

    Args:
        corpus: Sequence of sentences (strings); ``len(corpus)`` must work.
        vocab: Dict mapping token -> column index.
        n: n-gram size forwarded to ``tokenize``.

    Returns:
        A float array of length ``len(vocab)`` holding one IDF weight per
        vocabulary entry.
    """
    doc_freq = np.zeros(len(vocab))
    for sentence in corpus:
        # Deduplicate so a sentence counts at most once per token.
        for token in set(tokenize(sentence, n)):
            column = vocab.get(token)
            if column is not None:
                doc_freq[column] += 1

    n_docs = len(corpus)
    return np.log((n_docs + 1) / (doc_freq + 1)) + 1

In [6]:
def vectorize(corpus, K=5000, n=4, vocab=None, idf=None):
    """Turn sentences into TF-IDF vectors over n-gram tokens.

    The function has two modes, selected by ``vocab``:

    * Fit mode (``vocab`` is None): the vocabulary and the IDF weights are
      built from ``corpus`` (any ``idf`` argument is ignored) and the result
      is ``(vocab, idf, X)``.
    * Transform mode (``vocab`` given): the supplied ``vocab`` and ``idf``
      are reused and the result is the 1-tuple ``(X,)``.

    Args:
        corpus: Sequence of sentences (strings).
        K: Vocabulary size used in fit mode.
        n: n-gram size forwarded to the tokenizer.
        vocab: Optional dict mapping token -> column index.
        idf: IDF weights matching ``vocab``; required in transform mode.

    Returns:
        A tuple as described above, where ``X`` is a list with one TF-IDF
        vector per sentence of ``corpus``.

    Raises:
        ValueError: If ``vocab`` is given without ``idf``.
    """
    fit_mode = vocab is None
    if fit_mode:
        vocab = build_vocabulary(corpus, K=K, n=n)
        idf = compute_tdf(corpus, vocab, n)
    elif idf is None:
        # Fail early with a clear message instead of a TypeError from `tf * None`.
        raise ValueError("vectorize: `idf` must be provided together with `vocab`")

    X = [
        compute_tf(tokenize(sentence, n=n), vocab) * idf
        for sentence in corpus
    ]

    if fit_mode:
        return vocab, idf, X
    return (X,)

In [7]:
def vectorize_y(Y):
    """Encode labels as integer class ids.

    Ids are assigned in order of first appearance in ``Y``. This makes the
    mapping reproducible from one run to the next; iterating a ``set`` of
    strings does not, because string hashing is randomised per process.

    Args:
        Y: Iterable of hashable labels (iterated twice, so pass a list or
           similar re-iterable container).

    Returns:
        A tuple ``(codes, mapping)`` where ``codes`` is an integer array with
        one id per element of ``Y`` and ``mapping`` is the dict label -> id.
    """
    # dict.fromkeys drops duplicates while preserving first-seen order.
    Y_set = {label: index for index, label in enumerate(dict.fromkeys(Y))}
    # An explicit integer dtype keeps an empty input from becoming a float array.
    Y_final = np.array([Y_set[label] for label in Y], dtype=int)
    return Y_final, Y_set

In [8]:
def save_X(X_vect, file="/content/drive/MyDrive/new_dataX.npy"):
    """Store the feature matrix with ``np.save`` and print a confirmation.

    Args:
        X_vect: Array-like of features to store.
        file: Destination path. The default points under
            ``/content/drive/MyDrive`` (presumably a Colab Drive mount).
    """
    np.save(file, arr=X_vect)
    print("data X sauvegardée")

In [9]:
def save_vocab(vocab, file="/content/drive/MyDrive/vocab.json"):
    """Write the token vocabulary as JSON and print a confirmation.

    Args:
        vocab: JSON-serialisable dict (token -> index).
        file: Destination path. The default points under
            ``/content/drive/MyDrive`` (presumably a Colab Drive mount).
    """
    with open(file, "w") as handle:
        json.dump(vocab, handle)
    print("Vocabulaire X sauvegardée")

In [10]:
def save_idf(idf_vect, file="/content/drive/MyDrive/idfVect.npy"):
    """Store the IDF weight vector with ``np.save`` and print a confirmation.

    NOTE(review): the printed message mentions "Y" although this helper
    saves the IDF vector; it is kept unchanged here.

    Args:
        idf_vect: Array-like of IDF weights to store.
        file: Destination path. The default points under
            ``/content/drive/MyDrive`` (presumably a Colab Drive mount).
    """
    np.save(file, arr=idf_vect)
    print("idf vecteur Y sauvegardée")

In [11]:
def save_Y(Y, file="/content/drive/MyDrive/dataY.npy"):
    """Store the encoded labels with ``np.save`` and print a confirmation.

    Args:
        Y: Array-like of labels to store.
        file: Destination path. The default points under
            ``/content/drive/MyDrive`` (presumably a Colab Drive mount).
    """
    np.save(file, arr=Y)
    print("data Y sauvegardée")

In [12]:
def save_vocabY(Y, file="/content/drive/MyDrive/vocabY.json"):
    """Write the label mapping as JSON and print a confirmation.

    Args:
        Y: JSON-serialisable object; the notebook passes the label -> id dict
            returned by ``vectorize_y``.
        file: Destination path. The default points under
            ``/content/drive/MyDrive`` (presumably a Colab Drive mount).
    """
    with open(file, "w") as handle:
        json.dump(Y, handle)
    print("Vocabulaire Y sauvegardée")

In [13]:
# Hyper-parameters of the feature pipeline, named so they are easy to find and tune.
VOCAB_SIZE = 5000      # number of most frequent n-grams kept in the vocabulary
NGRAM_SIZE = 4         # size of each n-gram
SVD_COMPONENTS = 600   # number of dimensions kept by the SVD
SEED = 42              # fixed random_state for the SVD

# Load the dataset and use the 'Text' column as the list of sentences.
df = pd.read_csv("dataset.csv")
corpus = df['Text'].tolist()

# Fit the vocabulary and IDF weights on the corpus and build the TF-IDF rows.
vocab, idf, X = vectorize(corpus, K=VOCAB_SIZE, n=NGRAM_SIZE)

# Reduce the TF-IDF vectors with a truncated SVD.
svd = TruncatedSVD(n_components=SVD_COMPONENTS, random_state=SEED)
X_reduit = svd.fit_transform(X)

print("Matrice réduite (shape) :", X_reduit.shape)



Matrice réduite (shape) : (22000, 600)


In [14]:
# Persist the fitted SVD model so the same projection can be reloaded later.
# NOTE(review): the extension is ".plk" (probably meant ".pkl"); the cell that
# calls joblib.load uses the same file name, so rename both together if changed.
joblib.dump(svd, "svdmodel2.plk")
print("Terminer")

Terminer


In [15]:
# Write the reduced features, the n-gram vocabulary and the IDF weights to
# relative paths, overriding the helpers' default destinations.
save_X(X_reduit, file="Xfit.npy")
save_vocab(vocab, file="vocabFit.json")
save_idf(idf, file="idfFit.npy")

data X sauvegardée
Vocabulaire X sauvegardée
idf vecteur Y sauvegardée


In [16]:
# Encode the 'language' column as integer class ids, then save both the
# encoded labels and the label -> id mapping.
corpusY = df['language'].tolist()
vectY, vocabY = vectorize_y(corpusY)
save_Y(vectY, file="Yfitsq.npy")
save_vocabY(vocabY, file="vocabYFitsq.json")

{'Swedish': 0, 'Pushto': 1, 'Latin': 2, 'Korean': 3, 'Turkish': 4, 'Urdu': 5, 'Arabic': 6, 'Thai': 7, 'Dutch': 8, 'French': 9, 'English': 10, 'Persian': 11, 'Spanish': 12, 'Russian': 13, 'Portugese': 14, 'Estonian': 15, 'Chinese': 16, 'Indonesian': 17, 'Japanese': 18, 'Tamil': 19, 'Romanian': 20, 'Hindi': 21}
data Y sauvegardée
Vocabulaire Y sauvegardée


In [31]:
# One example sentence, wrapped in a list because vectorize expects a corpus.
sentence = ["Bonjour la famille j espere que vous allez bien, je vous ai envoye le dernier paiement , il est co"]
print(len(sentence[0]))

# Reload the artefacts saved by the fitting cells: IDF weights, vocabulary, SVD model.
idf = np.load("idfFit.npy")
json_path = "vocabFit.json"
with open(json_path, 'r') as vocab_file:
    vocabX = json.load(vocab_file)
svd = joblib.load("svdmodel2.plk")

# With a vocabulary supplied, vectorize returns a 1-tuple holding the TF-IDF rows.
sentVect = vectorize(sentence, K=5000, n=4, vocab=vocabX, idf=idf)[0]

# Project the TF-IDF row with the reloaded SVD model.
sent = svd.transform(sentVect)
print(sent.shape)

save_X(sent, file="test.npy")

23
(1, 600)
data X sauvegardée
