# 13517031 - Karina Iswaras


In [1]:
import pandas as pd

# read csv file into 'df' dataframe
df_train = pd.read_csv('./data/training_data.csv', sep = ",")
df_val = pd.read_csv('./data/val_data.csv', sep = ",")
df_test = pd.read_csv('./data/testing_data.csv', sep = ",")

# print some data rows
df_train.head()

Unnamed: 0,type,text
0,ham,"Babe, I'm back ... Come back to me ..."
1,ham,S:)no competition for him.
2,ham,Yup having my lunch buffet now.. U eat already?
3,ham,"Storming msg: Wen u lift d phne, u say HELLO D..."
4,ham,Mark works tomorrow. He gets out at 5. His wor...


## Preprocessing

Preprocessing dilakukan terhadap semua dataset, yakni training, validation, dan testing. Prepocessing yang dilakukan berupa : 
1. Memberikan label numeric terhadap kelas spam dan ham, dengan spam = 0 dan ham = 1.
2. Tokenization yakni untuk memecah kalimat text menjadi kata (token). Tokenization dilakukan dengan bantuan library nltk.
3. Normalization, yang dilakukan ialah mengubah semua huruf menjadi lowercase karena perbedaan huruf besar dan kecil dianggap tidak berpengaruh dalam spam classification.
4. Membuang tanda baca (punctuation) termasuk emoji yang dibuat dari simbol-simbol, karena tidak berpengaruh terhadap penentuan kelas. Hal ini dilakukan dengan bantuan list punctuation dari string.
5. Remove stopwords, dikarenakan stopwords tidak diperlukan untuk spam classification dan seharusnya memiliki nilai tfidf yang rendah. Remove stopwords dibantu dengan corpus stopwords bahasa inggris dari nltk.
6. Lemmatization, agar kata dengan kata dasar yang sama dapat dihitung menjadi sebuah kata yang sama, hal ini akan mempengaruhi nilai dari tfidf yang akan digunakan pada tahap feature extraction. Hal ini dilakukan dengan bantuan WordNetLemmatizer yang dimiliki nltk.

In [2]:
# Put numberic label

df_train['label'] = df_train['type'].map({'ham':0, 'spam':1})
df_val['label'] = df_val['type'].map({'ham':0, 'spam':1})
df_test['label'] = df_test['type'].map({'ham':0, 'spam':1})
df_train.head()

Unnamed: 0,type,text,label
0,ham,"Babe, I'm back ... Come back to me ...",0
1,ham,S:)no competition for him.,0
2,ham,Yup having my lunch buffet now.. U eat already?,0
3,ham,"Storming msg: Wen u lift d phne, u say HELLO D...",0
4,ham,Mark works tomorrow. He gets out at 5. His wor...,0


In [3]:
# tokenize
import nltk

df_train['token'] = df_train.apply(lambda row : nltk.word_tokenize(row['text']), axis=1)
df_val['token'] = df_val.apply(lambda row : nltk.word_tokenize(row['text']), axis=1)
df_test['token'] = df_test.apply(lambda row : nltk.word_tokenize(row['text']), axis=1)
df_train.head()

Unnamed: 0,type,text,label,token
0,ham,"Babe, I'm back ... Come back to me ...",0,"[Babe, ,, I, 'm, back, ..., Come, back, to, me..."
1,ham,S:)no competition for him.,0,"[S, :, ), no, competition, for, him, .]"
2,ham,Yup having my lunch buffet now.. U eat already?,0,"[Yup, having, my, lunch, buffet, now, .., U, e..."
3,ham,"Storming msg: Wen u lift d phne, u say HELLO D...",0,"[Storming, msg, :, Wen, u, lift, d, phne, ,, u..."
4,ham,Mark works tomorrow. He gets out at 5. His wor...,0,"[Mark, works, tomorrow, ., He, gets, out, at, ..."


In [4]:
# normalization, make token lowercase

df_train['processed_tokens'] = df_train.apply(lambda row: [w.lower() for w in row['token']], axis=1)
df_val['processed_tokens'] = df_val.apply(lambda row: [w.lower() for w in row['token']], axis=1)
df_test['processed_tokens'] = df_test.apply(lambda row: [w.lower() for w in row['token']], axis=1)
df_train.head()

Unnamed: 0,type,text,label,token,processed_tokens
0,ham,"Babe, I'm back ... Come back to me ...",0,"[Babe, ,, I, 'm, back, ..., Come, back, to, me...","[babe, ,, i, 'm, back, ..., come, back, to, me..."
1,ham,S:)no competition for him.,0,"[S, :, ), no, competition, for, him, .]","[s, :, ), no, competition, for, him, .]"
2,ham,Yup having my lunch buffet now.. U eat already?,0,"[Yup, having, my, lunch, buffet, now, .., U, e...","[yup, having, my, lunch, buffet, now, .., u, e..."
3,ham,"Storming msg: Wen u lift d phne, u say HELLO D...",0,"[Storming, msg, :, Wen, u, lift, d, phne, ,, u...","[storming, msg, :, wen, u, lift, d, phne, ,, u..."
4,ham,Mark works tomorrow. He gets out at 5. His wor...,0,"[Mark, works, tomorrow, ., He, gets, out, at, ...","[mark, works, tomorrow, ., he, gets, out, at, ..."


In [5]:
# remove punctuation
import string

df_train['processed_tokens'] = df_train.apply(lambda row: [w for w in row['processed_tokens'] if w not in string.punctuation], axis=1)
df_val['processed_tokens'] = df_val.apply(lambda row: [w for w in row['processed_tokens'] if w not in string.punctuation], axis=1)
df_test['processed_tokens'] = df_test.apply(lambda row: [w for w in row['processed_tokens'] if w not in string.punctuation], axis=1)
df_train.head()

Unnamed: 0,type,text,label,token,processed_tokens
0,ham,"Babe, I'm back ... Come back to me ...",0,"[Babe, ,, I, 'm, back, ..., Come, back, to, me...","[babe, i, 'm, back, ..., come, back, to, me, ...]"
1,ham,S:)no competition for him.,0,"[S, :, ), no, competition, for, him, .]","[s, no, competition, for, him]"
2,ham,Yup having my lunch buffet now.. U eat already?,0,"[Yup, having, my, lunch, buffet, now, .., U, e...","[yup, having, my, lunch, buffet, now, .., u, e..."
3,ham,"Storming msg: Wen u lift d phne, u say HELLO D...",0,"[Storming, msg, :, Wen, u, lift, d, phne, ,, u...","[storming, msg, wen, u, lift, d, phne, u, say,..."
4,ham,Mark works tomorrow. He gets out at 5. His wor...,0,"[Mark, works, tomorrow, ., He, gets, out, at, ...","[mark, works, tomorrow, he, gets, out, at, 5, ..."


In [6]:
# remove stopwords
from nltk.corpus import stopwords

stop_words = set(stopwords.words("english"))
df_train['processed_tokens'] = df_train.apply(lambda row: [w for w in row['processed_tokens'] if w not in stop_words], axis=1)
df_val['processed_tokens'] = df_val.apply(lambda row: [w for w in row['processed_tokens'] if w not in stop_words], axis=1)
df_test['processed_tokens'] = df_test.apply(lambda row: [w for w in row['processed_tokens'] if w not in stop_words], axis=1)
df_train.head()

Unnamed: 0,type,text,label,token,processed_tokens
0,ham,"Babe, I'm back ... Come back to me ...",0,"[Babe, ,, I, 'm, back, ..., Come, back, to, me...","[babe, 'm, back, ..., come, back, ...]"
1,ham,S:)no competition for him.,0,"[S, :, ), no, competition, for, him, .]",[competition]
2,ham,Yup having my lunch buffet now.. U eat already?,0,"[Yup, having, my, lunch, buffet, now, .., U, e...","[yup, lunch, buffet, .., u, eat, already]"
3,ham,"Storming msg: Wen u lift d phne, u say HELLO D...",0,"[Storming, msg, :, Wen, u, lift, d, phne, ,, u...","[storming, msg, wen, u, lift, phne, u, say, he..."
4,ham,Mark works tomorrow. He gets out at 5. His wor...,0,"[Mark, works, tomorrow, ., He, gets, out, at, ...","[mark, works, tomorrow, gets, 5, work, house, ..."


In [7]:
# lemmatization
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
df_train['processed_tokens'] = df_train.apply(lambda row : [lemmatizer.lemmatize(w) for w in row['processed_tokens']], axis=1)
df_val['processed_tokens'] = df_val.apply(lambda row : [lemmatizer.lemmatize(w) for w in row['processed_tokens']], axis=1)
df_test['processed_tokens'] = df_test.apply(lambda row : [lemmatizer.lemmatize(w) for w in row['processed_tokens']], axis=1)
df_train.head()

Unnamed: 0,type,text,label,token,processed_tokens
0,ham,"Babe, I'm back ... Come back to me ...",0,"[Babe, ,, I, 'm, back, ..., Come, back, to, me...","[babe, 'm, back, ..., come, back, ...]"
1,ham,S:)no competition for him.,0,"[S, :, ), no, competition, for, him, .]",[competition]
2,ham,Yup having my lunch buffet now.. U eat already?,0,"[Yup, having, my, lunch, buffet, now, .., U, e...","[yup, lunch, buffet, .., u, eat, already]"
3,ham,"Storming msg: Wen u lift d phne, u say HELLO D...",0,"[Storming, msg, :, Wen, u, lift, d, phne, ,, u...","[storming, msg, wen, u, lift, phne, u, say, he..."
4,ham,Mark works tomorrow. He gets out at 5. His wor...,0,"[Mark, works, tomorrow, ., He, gets, out, at, ...","[mark, work, tomorrow, get, 5, work, house, me..."


## Feature Extraction

Feature extraction yang dilakukan ialaha TF-IDF (Term Frequency — Inverse Document Frequency). TF-IDF dilakukan dengan bantuan library sklearn.

In [8]:
# Feature extraction dengan TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

# Calculate TF-IDF from training data and apply it to the data
tfidf = TfidfVectorizer(tokenizer=lambda x: x, preprocessor=lambda x: x, lowercase=False)
train_data = tfidf.fit_transform(df_train['processed_tokens'])
train_label = df_train['label']

# Apply TF-IDF to validation and testing data
val_data = tfidf.transform(df_val['processed_tokens'])
val_label = df_val['label']
test_data = tfidf.transform(df_test['processed_tokens'])
test_label = df_test['label']

## Classification

Implementasi model klasifikasi dengan algoritma Multinomial Naive Bayes, K Nearest Neighbor, dan Multilayer Perceptron. Ketiganya dilakukan dengan bantuan library sklearn.

In [9]:
# Naive Bayes Model Implementation
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix

# Make and train the model
NBclassifier = MultinomialNB()
NBclassifier.fit(train_data, train_label)
# Predict validation data
NBprediction = NBclassifier.predict(val_data)
print('Accuracy on training data for NB model : ', accuracy_score(val_label, NBprediction))

Accuracy on training data for NB model :  0.9660678642714571


In [10]:
# KNN Model Implementation
from sklearn.neighbors import KNeighborsClassifier

# Make and train the model
KNclassifier = KNeighborsClassifier(n_neighbors=1)
KNclassifier.fit(train_data, train_label)
# Predict validation data
KNprediction = KNclassifier.predict(val_data)
print('Accuracy on training data for KNN model : ', accuracy_score(val_label, KNprediction))

Accuracy on training data for KNN model :  0.9560878243512974


In [11]:
# MLP Model Implementation
from sklearn.neural_network import MLPClassifier

# Make and train the model
MLPclassifier = MLPClassifier(random_state=1, max_iter=60)
MLPclassifier.fit(train_data, train_label)
# Predict validation data
MLPprediction = MLPclassifier.predict(val_data)
print('Accuracy on training data for MLP model : ', accuracy_score(val_label, MLPprediction))

Accuracy on training data for MLP model :  0.9920159680638723


### Accuracy on data test

Test ketiga model dengan testing data.

In [12]:
# Predict testing data with Naive Bayes Model
NBpredictTest = NBclassifier.predict(test_data)
print('Accuracy on testing data for NB model : ', accuracy_score(test_label, NBpredictTest))
confusion_matrix(test_label, NBpredictTest)

Accuracy on testing data for NB model :  0.9712230215827338


array([[484,   0],
       [ 16,  56]], dtype=int64)

In [13]:
# Predict testing data with KNN Model
KNpredictTest = KNclassifier.predict(test_data)
print('Accuracy on testing data for KNN model : ', accuracy_score(test_label, KNpredictTest))
confusion_matrix(test_label, KNpredictTest)

Accuracy on testing data for KNN model :  0.9514388489208633


array([[484,   0],
       [ 27,  45]], dtype=int64)

In [16]:
# Predict testing data with MLP Model
MLPpredictTest = MLPclassifier.predict(test_data)
print('Accuracy on testing data for MLP model : ', accuracy_score(test_label, MLPpredictTest))
confusion_matrix(test_label, MLPpredictTest)

Accuracy on testing data for MLP model :  0.9838129496402878


array([[484,   0],
       [  9,  63]], dtype=int64)