In [42]:
import pandas as pd

# Load the SMS test set. The file is decoded as ISO-8859-1 (Latin-1).
df = pd.read_csv(
    filepath_or_buffer='SMS_test.csv',
    encoding='iso-8859-1',
)

# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,S. No.,Message_body,Label
0,1,"UpgrdCentre Orange customer, you may now claim...",Spam
1,2,"Loan for any purpose £500 - £75,000. Homeowner...",Spam
2,3,Congrats! Nokia 3650 video camera phone is you...,Spam
3,4,URGENT! Your Mobile number has been awarded wi...,Spam
4,5,Someone has contacted our dating service and e...,Spam
...,...,...,...
120,121,7 wonders in My WORLD 7th You 6th Ur style 5th...,Non-Spam
121,122,Try to do something dear. You read something f...,Non-Spam
122,123,Sun ah... Thk mayb can if dun have anythin on....,Non-Spam
123,124,"SYMPTOMS when U are in love: ""1.U like listeni...",Non-Spam


In [43]:
# Convert everything to lowercase
def text_lowercase(text):
    """Return a lowercase copy of ``text``.

    Parameters:
        text: the string to normalise (anything exposing ``.lower()``).

    Returns:
        The result of ``text.lower()``.
    """
    lowered = text.lower()
    return lowered

# Lowercase every message body into its own column, then preview the first row.
lowercased_messages = df['Message_body'].apply(text_lowercase)
df['Case Normalization'] = lowercased_messages
df.loc[0, 'Case Normalization']

"upgrdcentre orange customer, you may now claim your free camera phone upgrade for your loyalty. call now on 0207 153 9153. offer ends 26th july. t&c's apply. opt-out available"

In [44]:
# Remove digits
import re
def remove_numbers(text):
    """Return ``text`` with every run of digits deleted.

    Nothing is inserted in place of the digits, so the whitespace that
    surrounded a number is left as-is.
    """
    digit_runs = re.compile(r'\d+')
    return digit_runs.sub('', text)

# Strip digits from the lowercased text into a new column, then preview the first row.
without_numbers = df['Case Normalization'].apply(remove_numbers)
df['Noise Removal'] = without_numbers
df.loc[0, 'Noise Removal']

"upgrdcentre orange customer, you may now claim your free camera phone upgrade for your loyalty. call now on   . offer ends th july. t&c's apply. opt-out available"

In [45]:
# Remove punctuation
def remove_punctuation(text):
    """Return ``text`` with all punctuation characters deleted.

    Letters, digits and whitespace are kept; everything else is removed
    without any replacement character.

    Parameters:
        text: the string to clean.

    Returns:
        The cleaned string.
    """
    # The regex word class includes the underscore, so a plain
    # "not word, not space" class would leave "_" in the text.
    # The extra alternative removes it as well.
    result = re.sub(r'[^\w\s]|_', '', text)
    return result
# Strip punctuation from the 'Noise Removal' column in place, then preview the first row.
without_punctuation = df['Noise Removal'].apply(remove_punctuation)
df['Noise Removal'] = without_punctuation
df.loc[0, 'Noise Removal']

'upgrdcentre orange customer you may now claim your free camera phone upgrade for your loyalty call now on    offer ends th july tcs apply optout available'

In [46]:
# Remove extra whitespace
def remove_whitespace(text):
    """Return ``text`` with each run of whitespace reduced to one space.

    Leading and trailing whitespace is dropped as well.
    """
    # split() without a separator cuts on any whitespace run and yields no
    # empty pieces, so repeated, leading and trailing whitespace disappear.
    words = text.split()
    # Re-assemble the words with exactly one space between neighbours.
    return " ".join(words)

# Collapse whitespace in the 'Noise Removal' column in place, then preview the first row.
whitespace_collapsed = df['Noise Removal'].apply(remove_whitespace)
df['Noise Removal'] = whitespace_collapsed
df.loc[0, 'Noise Removal']

'upgrdcentre orange customer you may now claim your free camera phone upgrade for your loyalty call now on offer ends th july tcs apply optout available'

In [47]:
# Tokenization
# Fetch the 'punkt' NLTK package before tokenizing.
# NOTE(review): presumably word_tokenize needs this resource at call time — confirm
# for the installed NLTK version.
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

def tokenization(text):
    """Split ``text`` into tokens with NLTK's ``word_tokenize``.

    Returns:
        Whatever ``word_tokenize`` returns for ``text`` (a list of token
        strings in the notebook's preview output).
    """
    tokens = word_tokenize(text)
    return tokens

# Tokenize the cleaned text into a new column, then preview the first row's tokens.
token_lists = df['Noise Removal'].apply(tokenization)
df['Tokenization'] = token_lists
df.loc[0, 'Tokenization']

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\marwane.boukili\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['upgrdcentre',
 'orange',
 'customer',
 'you',
 'may',
 'now',
 'claim',
 'your',
 'free',
 'camera',
 'phone',
 'upgrade',
 'for',
 'your',
 'loyalty',
 'call',
 'now',
 'on',
 'offer',
 'ends',
 'th',
 'july',
 'tcs',
 'apply',
 'optout',
 'available']

In [48]:
# Remove stopwords
# Fetch the 'stopwords' NLTK package, then load the English list.
nltk.download('stopwords')
from nltk.corpus import stopwords
# Stored as a set so the membership check in remove_stopwords is a hash lookup.
# NOTE(review): punctuation is stripped before tokenization, so a contraction such
# as "don't" reaches the filter as "dont" — verify that such tokens are still
# caught by this list.
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    """Return the tokens of ``text`` that are not in ``stop_words``.

    Parameters:
        text: an iterable of tokens.

    Returns:
        A new list with the surviving tokens in their original order.
    """
    kept = []
    for token in text:
        if token in stop_words:
            continue
        kept.append(token)
    return kept

# Filter stopwords out of each token list into a new column, then preview the first row.
filtered_tokens = df['Tokenization'].apply(remove_stopwords)
df['Stopwords'] = filtered_tokens
df.loc[0, 'Stopwords']

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\marwane.boukili\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['upgrdcentre',
 'orange',
 'customer',
 'may',
 'claim',
 'free',
 'camera',
 'phone',
 'upgrade',
 'loyalty',
 'call',
 'offer',
 'ends',
 'th',
 'july',
 'tcs',
 'apply',
 'optout',
 'available']

In [49]:
# Stemming
from nltk.stem import PorterStemmer
# NOTE(review): word_tokenize was already imported in the tokenization cell and is
# not used by the stemming code in this cell.
from nltk.tokenize import word_tokenize
# Single shared stemmer instance used by stemming().
stemmer = PorterStemmer()

def stemming(text):
    """Stem every token in ``text`` with the module-level ``stemmer``.

    Parameters:
        text: an iterable of tokens.

    Returns:
        A new list holding ``stemmer.stem(token)`` for each token, in order.
    """
    return list(map(stemmer.stem, text))

# Stem the stopword-filtered tokens into a new column, then preview the first row.
stemmed_tokens = df['Stopwords'].apply(stemming)
df['Stemming'] = stemmed_tokens
df.loc[0, 'Stemming']

['upgrdcentr',
 'orang',
 'custom',
 'may',
 'claim',
 'free',
 'camera',
 'phone',
 'upgrad',
 'loyalti',
 'call',
 'offer',
 'end',
 'th',
 'juli',
 'tc',
 'appli',
 'optout',
 'avail']

In [50]:
# Lemmatization
# Fetch the 'wordnet' NLTK package before building the lemmatizer.
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
# NOTE(review): word_tokenize was already imported in the tokenization cell and is
# not used by the lemmatization code in this cell.
from nltk.tokenize import word_tokenize
# Single shared lemmatizer instance used by lemmatization().
lemmatizer = WordNetLemmatizer()

def lemmatization(text):
    """Lemmatize every token in ``text`` with the module-level ``lemmatizer``.

    Each token is passed to ``lemmatizer.lemmatize`` with no part-of-speech
    argument.
    NOTE(review): presumably that default treats tokens as nouns, which would
    leave verb forms unchanged — confirm against the NLTK documentation.

    Parameters:
        text: an iterable of tokens.

    Returns:
        A new list of lemmas, in the same order as the input tokens.
    """
    lemmas = []
    for token in text:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

# Lemmatize the stopword-filtered tokens into a new column, then preview the first row.
lemmatized_tokens = df['Stopwords'].apply(lemmatization)
df['Lemmatization'] = lemmatized_tokens
df.loc[0, 'Lemmatization']

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\marwane.boukili\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


['upgrdcentre',
 'orange',
 'customer',
 'may',
 'claim',
 'free',
 'camera',
 'phone',
 'upgrade',
 'loyalty',
 'call',
 'offer',
 'end',
 'th',
 'july',
 'tc',
 'apply',
 'optout',
 'available']