## Presentation
Share the process of building your models with an audience of fellow data scientists in a 10-minute presentation. Go into the details of how you constructed your networks (layer density, activation functions, optimizers) and how those choices led to your best-performing networks.
Remember to add a link to your notebook and the link to your respective GitHub repository at the end of your slides.
Add a link to your notebook in the presentation, and copy your final slides into the dedicated folder.


In [1]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Load the tab-separated SMS spam corpus into a DataFrame.
# Column names are supplied explicitly, so pandas treats every line of the
# file as data: "cat" holds the label and "sms" holds the message text.
df = pd.read_csv(
    "SMSSpamCollection.csv",
    sep="\t",
    names=["cat", "sms"],
)
df

Unnamed: 0,cat,sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [3]:
# Class balance: number of messages carrying each label.
df.loc[:, "cat"].value_counts()

ham     4825
spam     747
Name: cat, dtype: int64

In [4]:



# Optional: discard any rows that contain missing values.
df.dropna(inplace=True)

# Normalise the SMS text to lowercase.
df["sms"] = df["sms"].str.lower()

# Special characters and punctuation are left in place; the regex-based
# stripping step below is commented out.
# df['sms'] = df['sms'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x))

# Split into training and test sets: 20% of the messages are held out, and
# the fixed random_state makes the split reproducible.
X, y = df["sms"], df["cat"]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# From here on, X_train / y_train hold the lowercased training messages and
# their labels; X_test / y_test hold the held-out messages and their labels.


In [5]:
# Display the training messages produced by the train/test split.
X_train

1978    reply to win £100 weekly! where will the 2006 ...
3989    hello. sort of out in town already. that . so ...
3935     how come guoyang go n tell her? then u told her?
4078    hey sathya till now we dint meet not even a si...
4086    orange brings you ringtones from all time char...
                              ...                        
3772    hi, wlcome back, did wonder if you got eaten b...
5191                               sorry, i'll call later
5226        prabha..i'm soryda..realy..frm heart i'm sory
5390                           nt joking seriously i told
860               did he just say somebody is named tampa
Name: sms, Length: 4457, dtype: object

In [6]:
# Tiny two-document toy corpus; the second document repeats the word "it"
# once more than the first.
X_train2 = [
    "hello hello I test it",
    "hello hello I test it it",
]

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer


# Build two word-level bag-of-words views of the training messages:
#   - raw term counts over the full vocabulary, and
#   - TF-IDF weights with the vocabulary capped at 1000 terms (max_features).
countvectorizer = CountVectorizer(analyzer="word")
tfidfvectorizer = TfidfVectorizer(analyzer="word", max_features=1000)

# Convert the documents into document-term matrices.
# Both vectorizers are fitted on the training split only.
count_wm = countvectorizer.fit_transform(X_train)
tfidf_wm = tfidfvectorizer.fit_transform(X_train)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is the supported replacement. Wrapping the result
# in list() keeps the token variables plain Python lists, as before.
count_tokens = list(countvectorizer.get_feature_names_out())
tfidf_tokens = list(tfidfvectorizer.get_feature_names_out())

# Dense DataFrames for inspection: one row per training message, one column
# per token. Note that .toarray() materialises the whole matrix in memory.
df_countvect = pd.DataFrame(data=count_wm.toarray(), columns=count_tokens)
df_tfidfvect = pd.DataFrame(data=tfidf_wm.toarray(), columns=tfidf_tokens)



In [8]:
df_tfidfvect

Unnamed: 0,000,03,04,0800,08000839402,08000930705,10,100,1000,10p,...,yes,yesterday,yet,yo,you,your,yours,yourself,yr,yup
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.30469,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.123465,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.086011,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4452,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.134984,0.0,0.0,0.0,0.0,0.0
4453,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
4454,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
4455,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0


In [9]:
# Number of terms in the fitted TF-IDF vocabulary; reused by later cells as
# the vocabulary size.
vocab_size = tfidfvectorizer.get_feature_names_out().shape[0]
vocab_size

1000

In [10]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Create a Keras tokenizer and fit it on the training messages only;
# num_words caps the vocabulary used when texts are converted to sequences.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(X_train)

# Turn each message into a sequence of integer word indices.
X_train_sequences, X_test_sequences = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# Pad the sequences so they all share one common length.
# NOTE(review): 1000 happens to equal vocab_size but is an independent
# setting; confirm it is the intended sequence length.
max_sequence_length = 1000  # replace this value with the desired length
X_train_padded, X_test_padded = (
    pad_sequences(sequences, maxlen=max_sequence_length)
    for sequences in (X_train_sequences, X_test_sequences)
)


In [11]:
# Number of training messages.
X_train.shape[0]

4457

In [12]:
# Number of padded training sequences; should match the number of training messages.
X_train_padded.shape[0]

4457

In [13]:
# Size of each learned word vector; passed as output_dim to the Embedding layer.
embedding_dim = 50

In [14]:
from sklearn.preprocessing import LabelEncoder

# Create a label encoder and learn the label-to-integer mapping from the
# training labels only.
label_encoder = LabelEncoder()
label_encoder.fit(y_train)

# Encode both splits with that same mapping so the integers agree between
# training and test.
y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

In [15]:
# Display the integer-encoded training labels.
y_train_encoded

array([1, 0, 0, ..., 0, 0, 0])

In [16]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# Seed TensorFlow's global random generator so that weight initialisation
# (and other seed-dependent operations) are repeatable from run to run.
# 42 matches the random_state used for the train/test split.
tf.random.set_seed(42)

# Create a sequential model: Embedding -> LSTM -> Dropout -> sigmoid output.
model = Sequential()

# Embedding layer mapping each word index to a dense vector
# (this step would not be needed if TF-IDF vectors were used as input).
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_sequence_length))

# Recurrent layer (LSTM; a GRU could be used instead) over the sequence,
# followed by dropout for regularisation.
model.add(LSTM(units=300))
model.add(Dropout(0.2))

# Single sigmoid unit for the binary classification output.
model.add(Dense(1, activation='sigmoid'))

# Compile the model with binary cross-entropy loss and accuracy tracking.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model. The returned History object is kept so the per-epoch
# metrics can be inspected or plotted afterwards instead of being discarded.
# NOTE(review): the test split doubles as validation data here, so the
# val_* metrics are not an independent estimate if they guide model tuning.
history = model.fit(
    X_train_padded,
    y_train_encoded,
    epochs=10,
    batch_size=64,
    validation_data=(X_test_padded, y_test_encoded),
)


2023-10-03 14:10:01.294974: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2023-10-03 14:10:01.295060: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (raph-XPS-13-9370): /proc/driver/nvidia/version does not exist
2023-10-03 14:10:01.296851: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fee30454a00>