In [1]:
import pandas as pd
import numpy as np


In [12]:
data = pd.read_csv('/content/spam.csv', encoding='latin-1', sep=',')

In [13]:
data.columns

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

In [15]:
data.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'],inplace=True,axis=1)

In [90]:
data.to_csv('spam_ham.csv')

In [19]:
data.rename(columns={'v1':'label','v2':'message'},inplace=True)

In [20]:
data.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [26]:
import nltk
import re
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [34]:
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [94]:
corpus = []
for i in range (0,len(data)):
  review = re.sub('[^a-zA-Z]',' ',data['message'][i])
  review = review.lower()
  review = review.split()
  review = [lemmatizer.lemmatize(word) for word in review if word not in stopwords.words('english')]
  review = ' '.join(review)
  corpus.append(review)

In [93]:
corpus

['go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat',
 'ok lar joking wif u oni',
 'free entry in a wkly comp to win fa cup final tkts st may text fa to to receive entry question std txt rate t c s apply over s',
 'u dun say so early hor u c already then say',
 'nah i don t think he go to usf he life around here though',
 'freemsg hey there darling it s been week s now and no word back i d like some fun you up for it still tb ok xxx std chgs to send to rcv',
 'even my brother is not like to speak with me they treat me like aid patent',
 'a per your request melle melle oru minnaminunginte nurungu vettam ha been set a your callertune for all caller press to copy your friend callertune',
 'winner a a valued network customer you have been selected to receivea prize reward to claim call claim code kl valid hour only',
 'had your mobile month or more u r entitled to update to the latest colour mobile with camera for free call the mobile up

In [39]:
#Bag of Words
from sklearn.feature_extraction.text import CountVectorizer

In [40]:
cv =CountVectorizer(max_features=2500,ngram_range=(1,2))

In [43]:
X=cv.fit_transform(corpus).toarray()

In [45]:
X.shape

(5572, 2500)

In [46]:
cv.vocabulary_

{'go': np.int64(811),
 'point': np.int64(1614),
 'crazy': np.int64(447),
 'available': np.int64(112),
 'bugis': np.int64(227),
 'great': np.int64(856),
 'world': np.int64(2436),
 'la': np.int64(1096),
 'cine': np.int64(345),
 'got': np.int64(847),
 'wat': np.int64(2336),
 'ok': np.int64(1477),
 'lar': np.int64(1108),
 'joking': np.int64(1053),
 'wif': np.int64(2399),
 'oni': np.int64(1493),
 'free': np.int64(718),
 'entry': np.int64(616),
 'wkly': np.int64(2422),
 'comp': np.int64(401),
 'win': np.int64(2404),
 'cup': np.int64(455),
 'final': np.int64(683),
 'st': np.int64(1961),
 'may': np.int64(1267),
 'text': np.int64(2090),
 'receive': np.int64(1696),
 'question': np.int64(1662),
 'std': np.int64(1979),
 'txt': np.int64(2200),
 'rate': np.int64(1675),
 'apply': np.int64(77),
 'free entry': np.int64(723),
 'entry wkly': np.int64(618),
 'std txt': np.int64(1980),
 'txt rate': np.int64(2204),
 'rate apply': np.int64(1676),
 'dun': np.int64(571),
 'say': np.int64(1796),
 'early': np.in

In [52]:
from sklearn.preprocessing import LabelEncoder
lc = LabelEncoder()

In [53]:
y=lc.fit_transform(data['label'])

In [55]:
y

array([0, 0, 1, ..., 0, 0, 0])

In [56]:
from sklearn.model_selection import train_test_split

In [58]:
X_train, X_test, y_train, y_test = train_test_split(
...     X, y, test_size=0.2)

In [59]:
from sklearn.naive_bayes import MultinomialNB

In [60]:
model =MultinomialNB()

In [61]:
model.fit(X_train,y_train)

In [62]:
y_pred = model.predict(X_test)

In [65]:
from sklearn.metrics import accuracy_score,classification_report

In [64]:
print(accuracy_score(y_pred,y_test))

0.9838565022421525


In [67]:
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       968
           1       0.94      0.93      0.94       147

    accuracy                           0.98      1115
   macro avg       0.97      0.96      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [68]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [69]:
tfidf = TfidfVectorizer(max_features=2500,ngram_range=(1,2))

In [70]:
X=tfidf.fit_transform(corpus).toarray()

In [71]:
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [72]:
y = pd.get_dummies(data['label'],dtype='int')

In [82]:
y = y.values

In [83]:
y

array([0, 0, 1, ..., 0, 0, 0])

In [84]:
X_train, X_test, y_train, y_test = train_test_split(
...     X, y, test_size=0.2)

In [85]:
model2 = MultinomialNB()

In [86]:
model2.fit(X_train,y_train)

In [87]:
y_pred2=model2.predict(X_test)

In [88]:
print(classification_report(y_pred2,y_test))

              precision    recall  f1-score   support

           0       1.00      0.97      0.98       974
           1       0.82      0.99      0.90       141

    accuracy                           0.97      1115
   macro avg       0.91      0.98      0.94      1115
weighted avg       0.98      0.97      0.97      1115



In [89]:
print(accuracy_score(y_pred2,y_test))

0.9713004484304932
