# **Importing Libraries**

In [1]:
import pandas as pd

# **Loading file**

In [2]:
data = pd.read_csv('/content/spam.csv',encoding='Windows-1252')

In [3]:
pd.set_option('display.max_colwidth', None)
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives around here though",,,


In [4]:
data = data[['v1','v2']]
data.head(10)

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"
5,spam,"FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, å£1.50 to rcv"
6,ham,Even my brother is not like to speak with me. They treat me like aids patent.
7,ham,As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune
8,spam,WINNER!! As a valued network customer you have been selected to receivea å£900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.
9,spam,Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with camera for Free! Call The Mobile Update Co FREE on 08002986030


In [5]:
data.v1.value_counts()

ham     4825
spam     747
Name: v1, dtype: int64

# **Labeling spam & ham to 0 & 1**

In [7]:
def custom_encoder(data):
  data.replace(to_replace=["ham"], value=1, inplace=True)
  data.replace(to_replace=["spam"], value=0, inplace=True)
  return data

In [8]:
data.v1 = custom_encoder(data.v1)

In [9]:
data.head(5)

Unnamed: 0,v1,v2
0,1,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives around here though"


In [10]:
data.v1.value_counts()

1    4825
0     747
Name: v1, dtype: int64

# **Splitting data into test and train**

In [30]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split (data['v2'], data['v1'] , test_size=0.2)

In [31]:
X_train.head()

4409                                                                            You also didnt get na hi hi hi hi hi
5551                                                             Wen did you get so spiritual and deep. That's great
2933                                                                 Yup n her fren lor. I'm meeting my fren at 730.
2229    haha but no money leh... Later got to go for tuition... Haha and looking for empty slots for driving lessons
4219                                   U free on sat rite? U wan 2 watch infernal affairs wif me n darren n mayb xy?
Name: v2, dtype: object

In [32]:
X_test.head()

2679                 New Tones This week include: 1)McFly-All Ab.., 2) Sara Jorge-Shock.. 3) Will Smith-Switch.. To order follow instructions on next message
345                                                                                                                         Gudnite....tc...practice going on
1508                                                                                                 Sounds like something that someone testing me would sayy
502                                                                                                                                   Check with nuerologist.
5035    You won't believe it but it's true. It's Incredible Txts! Reply G now to learn truly amazing things that will blow your mind. From O2FWD only 18p/txt
Name: v2, dtype: object

In [70]:
y_test.head()

2679    0
345     1
1508    1
502     1
5035    0
Name: v1, dtype: int64

# **Data Pre-processing**

In [25]:
import string
def remove_punctuation(v2):
    punctuationfree="".join([i for i in v2 if i not in string.punctuation])
    return punctuationfree

In [26]:
import nltk
nltk.download('punkt')
def tokenization(v2):
    words = nltk.word_tokenize(v2)
    return words

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [19]:
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')
def remove_stopwords(v2):
    output= [i for i in v2 if i not in stopwords]
    return output

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [21]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')
wordnet_lemmatizer = WordNetLemmatizer()
def lemmatizer(v2):
  lemm_text = [wordnet_lemmatizer.lemmatize(word) for word in v2]
  return lemm_text

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [33]:
def preprocess(X_train):
  corpus = []
  for item in X_train:
    new_item = remove_punctuation(item)
    new_item = new_item.lower()
    new_item = tokenization(new_item)
    new_item = remove_stopwords(new_item)
    new_item = lemmatizer(new_item)
    corpus.append(' '.join(str(x) for x in new_item))
  return corpus

In [39]:
X_train=pd.DataFrame(X_train)

In [40]:
corpus = preprocess(X_train.v2)

In [41]:
corpus[0:10]

['also didnt get na hi hi hi hi hi',
 'wen get spiritual deep thats great',
 'yup n fren lor im meeting fren 730',
 'haha money leh later got go tuition haha looking empty slot driving lesson',
 'u free sat rite u wan 2 watch infernal affair wif n darren n mayb xy',
 'ur tonexs subscription renewed charged å£450 choose 10 polys month wwwclubzedcouk billing msg',
 'ree entry 2 weekly comp chance win ipod txt pod 80182 get entry std txt rate tc apply 08452810073 detail 18',
 'stop story ive told ive returned he saying order',
 'dont cancer mom making big deal regular checkup aka pap smear',
 'oh gei happend tron maybe ill dl 3d']

In [44]:
X_train.head(5)

Unnamed: 0,v2
4409,You also didnt get na hi hi hi hi hi
5551,Wen did you get so spiritual and deep. That's great
2933,Yup n her fren lor. I'm meeting my fren at 730.
2229,haha but no money leh... Later got to go for tuition... Haha and looking for empty slots for driving lessons
4219,U free on sat rite? U wan 2 watch infernal affairs wif me n darren n mayb xy?


# **Bag-of-words**

In [48]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(ngram_range=(1,2))
traindata = cv.fit_transform(corpus)
X = traindata
y = y_train

In [49]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators = 100)
clf.fit(X, y)

RandomForestClassifier()

In [50]:
from sklearn import metrics
y_pred = clf.predict(X) 
metrics.accuracy_score(y_pred, y)

1.0

# **Testing**

In [51]:
X_test.head()

2679                 New Tones This week include: 1)McFly-All Ab.., 2) Sara Jorge-Shock.. 3) Will Smith-Switch.. To order follow instructions on next message
345                                                                                                                         Gudnite....tc...practice going on
1508                                                                                                 Sounds like something that someone testing me would sayy
502                                                                                                                                   Check with nuerologist.
5035    You won't believe it but it's true. It's Incredible Txts! Reply G now to learn truly amazing things that will blow your mind. From O2FWD only 18p/txt
Name: v2, dtype: object

In [52]:
y_test.head()

2679    0
345     1
1508    1
502     1
5035    0
Name: v1, dtype: int64

In [54]:
test_corpus = preprocess(X_test)

In [55]:
test_corpus[0:5]

['new tone week include 1mcflyall ab 2 sara jorgeshock 3 smithswitch order follow instruction next message',
 'gudnitetcpractice going',
 'sound like something someone testing would sayy',
 'check nuerologist',
 'wont believe true incredible txts reply g learn truly amazing thing blow mind o2fwd 18ptxt']

In [56]:
testdata = cv.transform(test_corpus)

In [58]:
predictions = clf.predict(testdata)

In [59]:
metrics.accuracy_score(y_test,predictions)

0.9730941704035875

In [68]:
def find_sentiment(input):
  input = cv.transform(preprocess(input))
  prediction = clf.predict(input)
  if prediction == 0: 
    print('Spam mail')
  if prediction == 1:
    print('Ham mail')

In [69]:
input = ["Sounds like something that someone testing me would sayy"]
find_sentiment(input)

Ham mail
