In [18]:
import string

import numpy as np
import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer #this will take the individual tokens and vectorize them by counting them cuz we can't take text and feed it into a model
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier


In [19]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [36]:
df = pd.read_csv('spam_ham_dataset.csv')
df['text'] = df['text'].apply(lambda x: x.replace('\r\n',' '))
df

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291 thi...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001 ( see at...",0
2,3624,ham,"Subject: neon retreat ho ho ho , we ' re aroun...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs this deal is to b...,0
...,...,...,...,...
5166,1518,ham,Subject: put the 10 on the ft the transport vo...,0
5167,404,ham,Subject: 3 / 4 / 2000 and following noms hpl c...,0
5168,2933,ham,Subject: calpine daily gas nomination > > juli...,0
5169,1409,ham,Subject: industrial worksheets for august 2000...,0


In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5171 entries, 0 to 5170
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  5171 non-null   int64 
 1   label       5171 non-null   object
 2   text        5171 non-null   object
 3   label_num   5171 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 161.7+ KB


In [22]:
stemmer = PorterStemmer()
"stemmer.stem('sophistication') #to reduce the number of terms by reducing them down to the word stem"

corpus = []

stopwords_set = set(stopwords.words('english'))

for i in range(len(df)):
  text = df['text'].iloc[i].lower()
  text = text.translate(str.maketrans('','',string.punctuation)).split()
  text = [stemmer.stem(word) for word in text if word not in stopwords_set]
  text = ' '.join(text)
  corpus.append(text)

In [24]:
df.text.iloc[0]

"Subject: enron methanol ; meter # : 988291 this is a follow up to the note i gave you on monday , 4 / 3 / 00 { preliminary flow data provided by daren } . please override pop ' s daily volume { presently zero } to reflect daily activity you can obtain from gas control . this change is needed asap for economics purposes ."

In [25]:
corpus[0]

'subject enron methanol meter 988291 follow note gave monday 4 3 00 preliminari flow data provid daren pleas overrid pop daili volum present zero reflect daili activ obtain ga control chang need asap econom purpos'

In [26]:
vectorizer = CountVectorizer()

x = vectorizer.fit_transform(corpus).toarray()
y = df.label_num

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [28]:
clf = RandomForestClassifier(n_jobs=-1) #to instruct it to use all the CPU cores to be faster
clf.fit(x_train, y_train)

In [29]:
clf.score(x_test, y_test)

0.978743961352657

In [38]:
email_to_classify = df.text.values[15]
email_to_classify

"Subject: underpriced issue with high return on equity stock report . dont sieep on this stock ! this is a hot one ! company : gaming transactions inc . stock symbol : ggts currentiy trading at : o . 30 rating : strong purchase near - term target : 0 . 45 long - term target : 1 . oo breaking news for ggts : gaming transactions inc . ( ggts ) , a | eading provider of online gaming porta | management is pleased to announce that it has launched its proprietary gaming portal ( k e n o . com ) furthermore , the company has begun an intensive marketing campaign to support the | aunch and establish itself as the | eader in the online gaming industry . ( k e n o . c o m ) is an oniine games destination where people piay popular casino style games to win real money . the foundation of the site is an online version of keno . the game of keno uses 80 balls numbered 1 thru 8 o . every game , the house draws 20 bails at random and displays their numbers on screens ( called keno boards ) located on 

In [33]:
email_text = email_to_classify.lower().translate(str.maketrans('','',string.punctuation)).split()
email_text = [stemmer.stem(word) for word in text if word not in stopwords_set]
email_text = ' '.join(email_text)

email_corpus = [email_text]

x_email = vectorizer.transform(email_corpus)

In [34]:
clf.predict(x_email)

array([1])

In [39]:
df.label_num.iloc[15]

1