# Libraries

In [1]:
import re
import nltk
from nltk.corpus import stopwords
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score , classification_report

# data

In [2]:
# Toy corpus for the classifier: each row is [label, message text],
# where the label is either 'spam' or 'ham'.
data = [
    ['spam', "Had your mobile 11 months or more?"],
    ['ham', "I'm gonna to be home soon will connect later"],
    ['spam', "Congratulations! Pranav you have won $1000000"],
    ['ham', "Hey, are we still meeting tomorrow?"],
    ['spam', "You have won a free cruise! Call now."],
    ['ham', "Sure, I'll bring the documents."],
]


# Extracting data

In [3]:
# Split the rows of `data` into two parallel lists:
# the first field of each row is the label, the second is the message text.
lables = [label for label, *_rest in data]
message = [text for _label, text, *_rest in data]

# Preprocess the text

In [4]:
# Cache for the English stop-word set so the corpus is read once instead of
# once per message.
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them at most once.

    If the stopwords corpus is not installed, `stopwords.words` raises
    LookupError; the corpus is then fetched with `nltk.download('stopwords')`
    and the load is retried, so the notebook also runs on a fresh environment.
    """
    if 'english' not in _STOP_WORDS_CACHE:
        try:
            words = stopwords.words('english')
        except LookupError:
            nltk.download('stopwords', quiet=True)
            words = stopwords.words('english')
        _STOP_WORDS_CACHE['english'] = frozenset(words)
    return _STOP_WORDS_CACHE['english']


def preprosess_text(text):
    """Normalise one message for bag-of-words vectorisation.

    Steps:
      1. lower-case the text;
      2. replace every character that is not a-z or whitespace with a space
         (this drops digits, punctuation and symbols such as '$');
      3. split on whitespace and drop English stop words.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The remaining words joined by single spaces (may be empty).
    """
    text = text.lower()
    text = re.sub(r'[^a-z\s]', ' ', text)
    stop_words = _english_stop_words()
    return ' '.join(word for word in text.split() if word not in stop_words)

In [5]:
# Run every raw message through the same cleaning function, then show the result.
preprocesssed_messages = list(map(preprosess_text, message))
print(preprocesssed_messages)

['mobile months', 'gonna home soon connect later', 'congratulations pranav', 'hey still meeting tomorrow', 'free cruise call', 'sure bring documents']


# Vectorization

In [6]:
# Bag-of-words encoding: learn the vocabulary from the cleaned messages and
# count each term per message. `x` is printed as (row, column) count entries.
# NOTE(review): the vocabulary is fitted on all messages before the train/test
# split further down, so words from the test rows leak into the feature space;
# consider fitting the vectorizer on the training split only.
vectorize = CountVectorizer()
x = vectorize.fit_transform(preprocesssed_messages)
print(x)

  (0, 12)	1
  (0, 13)	1
  (1, 7)	1
  (1, 9)	1
  (1, 15)	1
  (1, 3)	1
  (1, 10)	1
  (2, 2)	1
  (2, 14)	1
  (3, 8)	1
  (3, 16)	1
  (3, 11)	1
  (3, 18)	1
  (4, 6)	1
  (4, 4)	1
  (4, 1)	1
  (5, 17)	1
  (5, 0)	1
  (5, 5)	1


In [7]:
# Target vector: the 'spam'/'ham' labels, in the same order as the rows of `x`.
y = lables
print(y)

['spam', 'ham', 'spam', 'ham', 'spam', 'ham']


# Splitting data into training and testing sets

In [8]:
# Hold out a test set (test_size=0.2), stratified on the labels so both classes
# are represented in each split; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [77]:
# Labels of the rows that ended up in the training split.
print(y_train)

['ham', 'spam', 'ham', 'spam']


In [9]:
# Term counts of the training rows, shown as (row, column) entries.
print(X_train)


  (0, 8)	1
  (0, 16)	1
  (0, 11)	1
  (0, 18)	1
  (1, 6)	1
  (1, 4)	1
  (1, 1)	1
  (2, 7)	1
  (2, 9)	1
  (2, 15)	1
  (2, 3)	1
  (2, 10)	1
  (3, 2)	1
  (3, 14)	1


In [11]:
# Labels of the held-out test rows.
print(y_test)

['spam', 'ham']


In [12]:
# Term counts of the held-out test rows, shown as (row, column) entries.
print(X_test)


  (0, 12)	1
  (0, 13)	1
  (1, 17)	1
  (1, 0)	1
  (1, 5)	1


# Creating a LogisticRegression model  

In [13]:
# Logistic regression classifier, constructed with no arguments (library defaults).
model = LogisticRegression()

In [15]:
# Fit the classifier on the training term counts and their labels.
model.fit(X_train,y_train)

In [16]:
# Predict labels for the held-out rows and report the fraction predicted correctly.
y_pred = model.predict(X_test)
print(
    "Accuracy : ",
    accuracy_score(y_true=y_test, y_pred=y_pred),
)

Accuracy :  0.5


# New message

In [19]:
# Classify one unseen message, reusing the fitted vectorizer and model.
new_sms = "gonna"
new_sms_preprocessed = preprosess_text(new_sms)
new_vectorized = vectorize.transform([new_sms_preprocessed])

# Only predict when the encoded row has at least one stored entry, i.e. the
# message shares a word with the fitted vocabulary; otherwise say so.
if new_vectorized.nnz > 0:
    prediction = model.predict(new_vectorized)
    print(f"Prediction for new SMS: {prediction[0]}")
else:
    print("No matching words found in the training vocabulary. Unable to classify.")

Prediction for new SMS: spam
