In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# Lab | Natural Language Processing
### SMS: SPAM or HAM

### Let's prepare the environment

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

- Read Data for the Fraudulent Email Kaggle Challenge
- Reduce the training set to speead up development.

In [3]:
data = pd.read_csv("kg_train.csv",encoding='latin-1')

data = data.head(1000)
print(data.shape)
data.fillna("",inplace=True)

(1000, 2)


### Let's divide the training and test set into two partitions

In [8]:
from sklearn.model_selection import train_test_split

data_train, data_val = train_test_split(data, test_size=0.2, random_state=42)

print(data_train.shape)
print(data_val.shape)


(800, 2)
(200, 2)


In [6]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

## Data Preprocessing

In [7]:
import string
from nltk.corpus import stopwords
print(string.punctuation)
print(stopwords.words("english")[100:110])
from nltk.stem.snowball import SnowballStemmer
snowball = SnowballStemmer('english')

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
['here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each']


## Now, we have to clean the html code removing words

- First we remove inline JavaScript/CSS
- Then we remove html comments. This has to be done before removing regular tags since comments can contain '>' characters
- Next we can remove the remaining tags

In [12]:
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [13]:
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

def clean_text(text):

    text = re.sub(r'<.*?>', '', text)

    text = text.translate(str.maketrans('', '', string.punctuation))

    text = re.sub(r'\d+', '', text)

    text = text.lower()

    stop_words = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word not in stop_words])

    lemmatizer = WordNetLemmatizer()
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])
    return text

data_train['clean_text'] = data_train['text'].apply(clean_text)
data_val['clean_text'] = data_val['text'].apply(clean_text)


- Remove all the special characters
    
- Remove numbers
    
- Remove all single characters

- Remove single characters from the start

- Substitute multiple spaces with single space

- Remove prefixed 'b'

- Convert to Lowercase

In [None]:
# Your code

## Now let's work on removing stopwords
Remove the stopwords.

In [None]:
# Your code

## Tame Your Text with Lemmatization
Break sentences into words, then use lemmatization to reduce them to their base form (e.g., "running" becomes "run"). See how this creates cleaner data for analysis!

In [None]:
# Your code

## Bag Of Words
Let's get the 10 top words in ham and spam messages (**EXPLORATORY DATA ANALYSIS**)

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(max_features=10)
spam_words = vectorizer.fit_transform(data_train[data_train['label'] == 1]['clean_text']).toarray()
ham_words = vectorizer.fit_transform(data_train[data_train['label'] == 0]['clean_text']).toarray()

print("Top spam words:", vectorizer.get_feature_names_out())

Top spam words: ['call' 'mr' 'one' 'percent' 'pm' 'president' 'secretary' 'state' 'time'
 'would']


## Extra features

In [16]:
money_symbol_list = "|".join(["euro", "dollar", "pound", "€", "$"])
suspicious_words = "|".join(["free", "cheap", "sex", "money", "account", "bank",
                             "fund", "transfer", "transaction", "win", "deposit", "password"])

data_train['money_mark'] = data_train['clean_text'].str.contains(money_symbol_list).astype(int)
data_train['suspicious_words'] = data_train['clean_text'].str.contains(suspicious_words).astype(int)
data_train['text_len'] = data_train['clean_text'].apply(len)

data_val['money_mark'] = data_val['clean_text'].str.contains(money_symbol_list).astype(int)
data_val['suspicious_words'] = data_val['clean_text'].str.contains(suspicious_words).astype(int)
data_val['text_len'] = data_val['clean_text'].apply(len)

data_train.head()

Unnamed: 0,text,label,clean_text,money_mark,suspicious_words,text_len
29,"----------- REGARDS, MR NELSON SMITH.KINDLY RE...",1,regard mr nelson smithkindly reply private ema...,1,0,75
535,I have not been able to reach oscar this am. W...,0,able reach oscar supposed send pdb u receive,1,0,44
695,; Huma Abedin B6I'm checking with Pat on the 5...,0,huma abedin bim checking pat k work jack jake ...,1,0,81
557,I can have it announced here on Monday - can't...,0,announced monday cant today,1,0,27
836,BANK OF AFRICAAGENCE SAN PEDRO14 BP 1210 S...,1,bank africaagence san pedro bp san pedro cote ...,1,1,1067


## How would you create a Bag of Words with the CountVectorizer method?

In [None]:
# Your code

## TD-IDF

- Load the vectorizer

- Vectorize all dataset

- print the shape of the vetorized dataset

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(max_features=500)
X_train = tfidf_vectorizer.fit_transform(data_train['clean_text'])
X_val = tfidf_vectorizer.transform(data_val['clean_text'])

print("TF-IDF shape (train):", X_train.shape)
print("TF-IDF shape (validation):", X_val.shape)


TF-IDF shape (train): (800, 500)
TF-IDF shape (validation): (200, 500)


### Extra Task (optional) - Implement a SPAM/HAM classifier

https://www.kaggle.com/t/b384e34013d54d238490103bc3c360ce

Use a MultinimialNB with default parameters.

Your task is to find the **best feature representation**.

You can work with teams of two persons (recommended).

In [None]:
# Your code