In [3]:
import pandas as pd
import numpy as np
import nltk
import seaborn as sns

In [47]:
# read data 
data = pd.read_csv("../dataset/spam.csv", encoding="latin1")

In [26]:
# head
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [48]:
# drop na
data.drop(data.iloc[:,[2,3,4]], inplace=True, axis=1)

In [28]:
data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [29]:
# shape control

In [30]:
data.shape

(5572, 2)

In [14]:
# missing values control

In [31]:
data.isna().sum()

v1    0
v2    0
dtype: int64

In [18]:
# column rename

In [49]:
data.rename(columns={"v1":"spam/ham", "v2":"sms"}, inplace=True)

In [33]:
data.head()

Unnamed: 0,spam/ham,sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [50]:
# str lower method
data.sms = data.sms.str.lower()

In [51]:
# ham spam encoding
data = pd.get_dummies(data=data, columns=["spam/ham"], drop_first=True, dtype=int)

In [36]:
data.head()

Unnamed: 0,sms,spam/ham_spam
0,"go until jurong point, crazy.. available only ...",0
1,ok lar... joking wif u oni...,0
2,free entry in 2 a wkly comp to win fa cup fina...,1
3,u dun say so early hor... u c already then say...,0
4,"nah i don't think he goes to usf, he lives aro...",0


In [22]:
# clean punctition

In [52]:
data.sms.replace("[^a-zA-Z ]", "", regex=True, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data.sms.replace("[^a-zA-Z ]", "", regex=True, inplace=True)


In [54]:
# object to list
data.sms = data.sms.str.split()

In [55]:
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

In [58]:
# apply function
ps = PorterStemmer()
def apply_function(x):
    d = [ps.stem(kelime) for kelime in x if kelime not in set(stopwords.words("english"))]
    d = " ".join(d)
    return d

In [60]:
data.sms = data.sms.apply(apply_function)

In [61]:
data.head()

Unnamed: 0,sms,spam/ham_spam
0,go jurong point crazi avail bugi n great world...,0
1,ok lar joke wif u oni,0
2,free entri wkli comp win fa cup final tkt st m...,1
3,u dun say earli hor u c alreadi say,0
4,nah dont think goe usf live around though,0


In [63]:
# count vectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [81]:
one = len(data[data["spam/ham_spam"]==1])
zero = len(data[data["spam/ham_spam"]==0])
zero, one

(4825, 747)

In [84]:
# base prediction
one/(zero+one), zero/(zero+one)

(0.13406317300789664, 0.8659368269921034)

In [86]:
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(data.sms).toarray()
y = np.array(data["spam/ham_spam"])

In [87]:
X.shape, y.shape

((5572, 1500), (5572,))

In [132]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_shape = train_test_split(X, y, train_size=0.75, random_state=42)
X_train.shape, y_train.shape

((4179, 1500), (4179,))

In [130]:
X_train = X_train.reshape(-1,1)

In [133]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(max_depth=10, n_estimators=150, n_jobs=4)
rf.fit(X_train, y_train)

In [134]:
from sklearn.metrics import accuracy_score, confusion_matrix
predctions = rf.predict(X_test)
accuracy_score(y_shape, predctions), confusion_matrix(y_shape, predctions)

(0.9217516152189519,
 array([[1202,    0],
        [ 109,   82]]))