In [4]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))


# Lab | Natural Language Processing
### SMS: SPAM or HAM

### Let's prepare the environment

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

- Read Data for the Fraudulent Email Kaggle Challenge
- Reduce the training set to speead up development. 

In [3]:
## Read Data for the Fraudulent Email Kaggle Challenge
import pandas as pd
data = pd.read_csv("kg_train.csv") # avec underscore

# Reduce the training set to speed up development. 
# Modify for final system
data = data.head(1000)
print(data.shape)
data.fillna("",inplace=True)

(1000, 2)


### Let's divide the training and test set into two partitions

In [4]:
# Your code
from sklearn.model_selection import train_test_split

# X = colonnes de features, y = colonne cible
X = data.iloc[:, :-1]  # toutes sauf la dernière
y = data.iloc[:, -1]   # dernière colonne

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(X_train.shape, X_test.shape)



(800, 1) (200, 1)


## Data Preprocessing

In [5]:
import string
from nltk.corpus import stopwords
print(string.punctuation)
print(stopwords.words("english")[100:110])
from nltk.stem.snowball import SnowballStemmer
snowball = SnowballStemmer('english')

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
['needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on']


## Now, we have to clean the html code removing words

- First we remove inline JavaScript/CSS
- Then we remove html comments. This has to be done before removing regular tags since comments can contain '>' characters
- Next we can remove the remaining tags

In [6]:
# Your code
import re

def clean_html(raw_html):
    # 1. Supprimer JavaScript/CSS
    no_js_css = re.sub(r"(?is)<(script|style).*?>.*?(</\1>)", "", raw_html)

    # 2. Supprimer les commentaires HTML
    no_comments = re.sub(r"<!--(.*?)-->", "", no_js_css)

    # 3. Supprimer toutes les balises HTML
    clean_text = re.sub(r"<.*?>", "", no_comments)

    return clean_text.strip()

# Exemple d'utilisation
html_code = """
<html>
<head><style>body {color: red;}</style></head>
<body>
<!-- Ceci est un commentaire -->
<p>Hello <b>World</b>!</p>
<script>alert('test');</script>
</body>
</html>
"""

print(clean_html(html_code))


Hello World!


- Remove all the special characters
    
- Remove numbers
    
- Remove all single characters
 
- Remove single characters from the start

- Substitute multiple spaces with single space

- Remove prefixed 'b'

- Convert to Lowercase

In [7]:
# Your code
import re

def clean_text(text):
    text = re.sub(r'\W', ' ', text)          # Remove special characters
    text = re.sub(r'\d', '', text)            # Remove numbers
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)  # Remove single characters
    text = re.sub(r'^[a-zA-Z]\s+', '', text)  # Remove single char from start
    text = re.sub(r'\s+', ' ', text)          # Replace multiple spaces
    text = re.sub(r'^b\s+', '', text)         # Remove prefix b
    text = text.lower()                       # Lowercase
    return text.strip()

# Exemple
sample = "Hello!!! This is b Example 123, with some numbers 45 and special @#$ chars."
print(clean_text(sample))


hello this is example with some numbers and special chars


## Now let's work on removing stopwords
Remove the stopwords.

In [8]:
# Your code
from nltk.corpus import stopwords

# Charger la liste des stopwords anglais
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return " ".join(filtered_words)

# Exemple
sample_text = "This is an example showing off stopwords removal."
print(remove_stopwords(sample_text))


example showing stopwords removal.


## Tame Your Text with Lemmatization
Break sentences into words, then use lemmatization to reduce them to their base form (e.g., "running" becomes "run"). See how this creates cleaner data for analysis!

In [9]:
# Your code
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk

# Télécharger les ressources nécessaires
nltk.download('punkt')      # Tokenizer
nltk.download('wordnet')    # WordNet data
nltk.download('omw-1.4')    # WordNet mapping (obligatoire dans certaines versions)

# Initialiser le lemmatizer
lemmatizer = WordNetLemmatizer()

# Exemple de texte
sample_text = "running runs ran easily fairly"

# Tokenisation en mots
words = nltk.word_tokenize(sample_text)

# Appliquer la lemmatisation
lemmatized_words = [lemmatizer.lemmatize(word) for word in words]

print("Texte original :", sample_text)
print("Après lemmatisation")




[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\katyd\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\katyd\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\katyd\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Texte original : running runs ran easily fairly
Après lemmatisation


## Bag Of Words
Let's get the 10 top words in ham and spam messages (**EXPLORATORY DATA ANALYSIS**)

In [15]:
# Your code
# Exemple : suppose que ton DataFrame s'appelle `data` avec colonnes 'label' et 'message'
# label = 'ham' ou 'spam'

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

def top_words(df, label, n=10, language='english'):
    # Nettoyage de base
    df = df.copy()
    df['label'] = df['label'].astype(str).str.strip().str.lower()
    df['text']  = df['text'].astype(str).fillna('').str.strip()

    # Filtrer sur le label
    subset = df.loc[df['label'] == label, 'text']
    # Retirer lignes vides
    subset = subset[subset.str.len() > 0]

    if subset.empty:
        return pd.DataFrame({'word': [], 'count': []})  # évite le crash

    # Vectorizer (choisis la langue de stopwords selon tes données)
    stop_lang = language  # 'english' ou 
    vectorizer = CountVectorizer(
        stop_words=stop_lang,
        lowercase=True,
        token_pattern=r'(?u)\b\w\w+\b',  # mots de 2+ caractères
        min_df=1
    )

    bow = vectorizer.fit_transform(subset.tolist())
    if bow.shape[1] == 0:
        return pd.DataFrame({'word': [], 'count': []})

    counts = bow.sum(axis=0).A1
    words = vectorizer.get_feature_names_out()
    freq = pd.DataFrame({'word': words, 'count': counts}).sort_values('count', ascending=False)
    return freq.head(n)


# Top 10 mots pour ham et spam
print("Top Ham Words:")
print(top_words(data, 'ham', 10, language='english'))   
print("\nTop Spam Words:")
print(top_words(data, 'spam', 10, language='english'))






Top Ham Words:
Empty DataFrame
Columns: [word, count]
Index: []

Top Spam Words:
Empty DataFrame
Columns: [word, count]
Index: []


## Extra features

In [22]:
# We add to the original dataframe two additional indicators (money symbols and suspicious words).
# Séparation train / validation
import re, unicodedata, pandas as pd
from sklearn.model_selection import train_test_split

# --- 1) Preprocess "text" into "preprocessed text" ---
def preprocess_text(x):
    if pd.isna(x):
        return ""
    t = str(x).lower()
    t = re.sub(r"http\S+|www\.\S+", " ", t)      # URLs
    t = re.sub(r"\S+@\S+", " ", t)               # emails
    t = re.sub(r"[@#]\w+", " ", t)               # @mentions / #hashtags
    t = re.sub(r"\d+", " ", t)                   # numbers
    # Remove accents (comment out next 2 lines if you want to keep accents)
    t = unicodedata.normalize("NFKD", t)
    t = t.encode("ascii", "ignore").decode("ascii")
    t = re.sub(r"[^\w\s]", " ", t)               # punctuation/special chars
    t = re.sub(r"\s+", " ", t).strip()           # collapse whitespace
    return t

data["preprocessed text"] = data["text"].map(preprocess_text)

# --- 2) Train/val split ---
data_train, data_val = train_test_split(data, test_size=0.2, random_state=42)

# --- 3) Your feature creation (now safe) ---
for df in [data_train, data_val]:
    s = df["preprocessed text"]
    df["money_mark"]       = s.str.contains(money_symbol_list, case=False, regex=True).astype(int)
    df["suspicious_words"] = s.str.contains(suspicious_words,   case=False, regex=True).astype(int)
    df["text_len"]         = s.str.len()

# Quick sanity check
# print(data_train[["preprocessed text","money_mark","suspicious_words","text_len"]].head())




## How would work the Bag of Words with Count Vectorizer concept?

In [23]:
# Your code
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

X_train, X_val, y_train, y_val = train_test_split(
    data["preprocessed text"], data["label"], test_size=0.2, random_state=42
)

# Vectorize + classify in one pipeline
model = make_pipeline(
    CountVectorizer(
        lowercase=True,
        stop_words="english",
        ngram_range=(1,2),      # unigrams + bigrams
        min_df=2                # ignore very rare terms (optional)
    ),
    LogisticRegression(max_iter=1000)
)

model.fit(X_train, y_train)
print("Validation accuracy:", model.score(X_val, y_val))


Validation accuracy: 0.96


## TF-IDF

- Load the vectorizer

- Vectorize all dataset

- print the shape of the vetorized dataset

In [24]:
# Your code
from sklearn.feature_extraction.text import TfidfVectorizer

# Example: assuming you have a pandas DataFrame `data` with a column 'preprocessed text'
corpus = data["preprocessed text"]  # replace with your text column

# 1. Load the vectorizer
vectorizer = TfidfVectorizer(lowercase=True, stop_words="english")

# 2. Vectorize the whole dataset
X_tfidf = vectorizer.fit_transform(corpus)

# 3. Print the shape
print("TF-IDF matrix shape:", X_tfidf.shape)


TF-IDF matrix shape: (1000, 30348)


## And the Train a Classifier?

In [25]:
# Your code
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Use your text column
TEXT_COL = "preprocessed text"  # change to "text" if that's what you have

X_train, X_val, y_train, y_val = train_test_split(
    data[TEXT_COL], data["label"],
    test_size=0.2, random_state=42, stratify=data["label"]
)

# TF-IDF + Logistic Regression classifier
clf = make_pipeline(
    TfidfVectorizer(
        lowercase=True,
        stop_words="english",
        ngram_range=(1,2),   # unigrams + bigrams
        max_features=50000,  # cap vocab size (optional)
    ),
    LogisticRegression(max_iter=1000, n_jobs=-1)  # strong baseline
)

clf.fit(X_train, y_train)

pred = clf.predict(X_val)
print("Validation accuracy:", accuracy_score(y_val, pred))
print(classification_report(y_val, pred))


Validation accuracy: 0.99
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       112
           1       1.00      0.98      0.99        88

    accuracy                           0.99       200
   macro avg       0.99      0.99      0.99       200
weighted avg       0.99      0.99      0.99       200



### Extra Task - Implement a SPAM/HAM classifier

https://www.kaggle.com/t/b384e34013d54d238490103bc3c360ce

The classifier can not be changed!!! It must be the MultinimialNB with default parameters!

Your task is to **find the most relevant features**.

For example, you can test the following options and check which of them performs better:
- Using "Bag of Words" only
- Using "TF-IDF" only
- Bag of Words + extra flags (money_mark, suspicious_words, text_len)
- TF-IDF + extra flags


You can work with teams of two persons (recommended).

In [None]:
# Your code