## Importing libraries

In [3]:
import numpy as np
import pandas as pd
import re
import nltk
# importing python files
import contractions
import nlp_tools

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


## Load the data

In [4]:
#Load the data:
df = pd.read_csv('spam.csv', encoding='latin-1')
df = df[['v1', 'v2']]
df.columns = ['label', 'message']

df.head()


Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
txt=df['message'][4]
txt

"Nah I don't think he goes to usf, he lives around here though"

In [6]:
nlp_tools.lemmatization_sentence(txt)

"nah i do n't think he go to usf he live around here though"

## Cleaning-

In [7]:
print('Before cleaning:-',"\n",txt)
contractions.expand_contraction(txt)

Before cleaning:- 
 Nah I don't think he goes to usf, he lives around here though


'nah i do not think he goes to usf, he lives around here though'

In [8]:
df["clean_message"]=df['message'].apply(contractions.expand_contraction)

In [9]:
df["clean_message"].head(10).to_frame()

Unnamed: 0,clean_message
0,"go until jurong point, crazy.. available only ..."
1,ok lar... joking wif u oni...
2,free entry in 2 a wkly comp to win fa cup fina...
3,u dun say so early hor... u c already then say...
4,"nah i do not think he goes to usf, he lives ar..."
5,freemsg hey there darling it is been 3 week's ...
6,even my brother is not like to speak with me. ...
7,as per your request 'melle melle (oru minnamin...
8,winner!! as a valued network customer you have...
9,had your mobile 11 months or more? u r entitle...


## Tokenizaton-

In [10]:
df["clean_message"]=df['message'].apply(nlp_tools.lemmatization_sentence)

In [11]:
df.head(10)

Unnamed: 0,label,message,clean_message
0,ham,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy .. available only ...
1,ham,Ok lar... Joking wif u oni...,ok lar ... joke wif u oni ...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...
3,ham,U dun say so early hor... U c already then say...,u dun say so early hor ... u c already then sa...
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah i do n't think he go to usf he live around...
5,spam,FreeMsg Hey there darling it's been 3 week's n...,freemsg hey there darling it 's be 3 week 's n...
6,ham,Even my brother is not like to speak with me. ...,even my brother be not like to speak with me t...
7,ham,As per your request 'Melle Melle (Oru Minnamin...,a per your request 'melle melle oru minnaminun...
8,spam,WINNER!! As a valued network customer you have...,winner a a value network customer you have be ...
9,spam,Had your mobile 11 months or more? U R entitle...,have your mobile 11 month or more u r entitle ...


## Bag of words-

In [12]:
message_list=df['clean_message'].tolist()
type(message_list)

list

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X = cv.fit_transform(df['clean_message'])

In [14]:
X=cv.transform(message_list).toarray()

X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [15]:
type(X)

numpy.ndarray

In [16]:
len(cv.get_feature_names_out())

7725

In [17]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
y = encoder.fit_transform(df['label'])


## Train/Test Split:

In [18]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [19]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
model = MultinomialNB()


## Model Fiting-

In [20]:
model.fit(X_train, y_train)

In [21]:
y_pred = model.predict(X_test)

In [22]:
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred) * 100
print(f'Accuracy: {accuracy:.2f}%')



              precision    recall  f1-score   support

           0       0.99      0.99      0.99       965
           1       0.93      0.93      0.93       150

    accuracy                           0.98      1115
   macro avg       0.96      0.96      0.96      1115
weighted avg       0.98      0.98      0.98      1115

Accuracy: 98.03%


In [23]:
# Function to predict new messages
def predict_message(message):
    # Preprocess the message
    clean_message = contractions.expand_contraction(message)
    clean_message = nlp_tools.lemmatization_sentence(clean_message)
    
    # Vectorize the message
    message_vector = cv.transform([clean_message])
    
    # Predict using the trained model
    prediction = model.predict(message_vector)
    
    # Convert prediction to label
    label = encoder.inverse_transform(prediction)[0]
    
    return label



In [24]:

# Test the model with user input
user_input = input("Enter a message to classify (or 'exit' to quit): ")
if user_input.lower() != 'exit':
    result = predict_message(user_input)
    print(f'The message is classified as: {result}')
else:
    print("Exited")

The message is classified as: spam
