**Import Libraries**

In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

**Creating DataFrame**

In [18]:
df = pd.read_csv("sms.tsv",sep="\t",names=["label","message"])
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


**preprocess pipeline**

In [21]:
def preprocess(message):
    message = message.lower()
    message = message.translate(str.maketrans("","",string.punctuation))
    
    tokens = word_tokenize(message)
    stop_words = set(stopwords.words("english"))
    tokens = [w for w in tokens if w not in stop_words]
    
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(w) for w in tokens]
    return " ".join(tokens)

In [22]:
df["cleaned"] = df["message"].apply(preprocess)
df.head()

Unnamed: 0,label,message,cleaned
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah dont think go usf life around though


**Text to Numbers**

In [26]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['cleaned'])
y = df["label"]
X.shape

(5572, 8903)

**Train Test Split**

In [27]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=42)
X_train.shape,X_test.shape

((4179, 8903), (1393, 8903))

In [30]:
model = MultinomialNB()
model.fit(X_train,y_train)
"Model Created"

'Model Created'

In [31]:
model.predict(X_test)

array(['ham', 'ham', 'ham', ..., 'ham', 'ham', 'ham'], dtype='<U4')

**Accuarcy**

In [33]:
predictions = model.predict(X_test)
"Accuracy",accuracy_score(predictions,y_test)

('Accuracy', 0.968413496051687)

**With own message**

In [36]:
message = "Congratulations You are selected for Nasa"
message = preprocess(message)
message_vector = vectorizer.transform([message])
model.predict(message_vector)

array(['spam'], dtype='<U4')

In [37]:
message = "This is me, just wanted to make sure that you are okay"
message = preprocess(message)
message_vector = vectorizer.transform([message])
model.predict(message_vector)

array(['ham'], dtype='<U4')