<a href="https://colab.research.google.com/github/KATAMKAVYA/Text-classification/blob/main/First_task.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import requests
import zipfile
import io
import string
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Download the UCI "SMS Spam Collection" archive and unpack it into the
# current working directory.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip'
# A timeout stops the script hanging forever if the server never answers.
response = requests.get(url, timeout=30)
# Fail here with a clear HTTP error instead of letting an error page reach
# ZipFile and surface as a confusing BadZipFile exception.
response.raise_for_status()
# The context manager closes the archive handle once extraction is done.
with zipfile.ZipFile(io.BytesIO(response.content)) as zip_file:
    zip_file.extractall()

# The extracted file is read as tab-separated text; column names are supplied
# here, so the first line of the file is treated as data.
data = pd.read_csv('SMSSpamCollection', sep='\t', names=["label", "message"])

# Fetch the NLTK stop-word corpus that preprocess_text() relies on.
nltk.download('stopwords')

# Encode the textual class as an integer: ham -> 0, spam -> 1.
data['label'] = data['label'].map({'ham': 0, 'spam': 1})

# Lazily-built frozenset of English stop words; filled on first use so the
# corpus is read once instead of once per word.
_ENGLISH_STOPWORDS = None

# Translation table that deletes every character in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def _english_stopwords():
    """Return NLTK's English stop-word list as a cached frozenset."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def preprocess_text(text):
    """Normalise a message for vectorisation.

    The text is lower-cased, every character in ``string.punctuation`` is
    removed, the result is split on whitespace, and tokens that appear in
    NLTK's English stop-word list are dropped.

    Args:
        text: The raw message string.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # Punctuation is stripped before the stop-word filter, so a contraction
    # such as "you've" becomes "youve" and is compared in that form.
    text = text.lower().translate(_PUNCTUATION_TABLE)
    stop_words = _english_stopwords()
    return ' '.join(word for word in text.split() if word not in stop_words)

# Clean every message in place (lower-case, strip punctuation, drop stop words).
data['message'] = data['message'].apply(preprocess_text)

y = data['label']

# Split the cleaned text *before* vectorising. Fitting TF-IDF on the full
# corpus would let the held-out messages shape the vocabulary and the IDF
# weights, leaking test information into training and inflating the metrics.
messages_train, messages_test, y_train, y_test = train_test_split(
    data['message'], y, test_size=0.2, random_state=42
)

# Learn vocabulary and IDF weights from the training messages only, then
# project the held-out messages with those fitted statistics.
tfidf_vectorizer = TfidfVectorizer()
X_train = tfidf_vectorizer.fit_transform(messages_train)
X_test = tfidf_vectorizer.transform(messages_test)

# Whole-corpus feature matrix, kept so the name `X` stays available to any
# code that refers to it; it is not used for fitting.
X = tfidf_vectorizer.transform(data['message'])

# Train a multinomial Naive Bayes classifier on the training features.
model = MultinomialNB()
model.fit(X_train, y_train)

# Predict labels for the held-out features.
y_pred = model.predict(X_test)

# Score the predictions against the true held-out labels.
accuracy, precision, recall, f1 = (
    accuracy_score(y_test, y_pred),
    precision_score(y_test, y_pred),
    recall_score(y_test, y_pred),
    f1_score(y_test, y_pred),
)

# Emit the four scores, one per line.
report_lines = [
    f'Accuracy: {accuracy}',
    f'Precision: {precision}',
    f'Recall: {recall}',
    f'F1 Score: {f1}',
]
print('\n'.join(report_lines))

def predict_message(message):
    """Classify a single raw message with the trained model.

    Args:
        message: The raw message string.

    Returns:
        A 2-tuple ``(cleaned_text, label)`` where ``cleaned_text`` is the
        output of ``preprocess_text`` for the message and ``label`` is
        ``'spam'`` when the model predicts class 1, otherwise ``'ham'``.
    """
    cleaned = preprocess_text(message)
    # The vectorizer expects an iterable of documents, hence the 1-item list.
    features = tfidf_vectorizer.transform([cleaned])
    predicted_class = model.predict(features)[0]
    if predicted_class == 1:
        label = 'spam'
    else:
        label = 'ham'
    return cleaned, label

# Hand-written examples that are expected to look like spam.
spam_messages = [
    "Congratulations! You've won a $1,000 Walmart gift card. Go to http://bit.ly/123456 to claim now.",
    "URGENT! You have been selected to receive a prize. Reply 'CLAIM' to this message to claim your reward.",
]

# Hand-written examples that are expected to look like ordinary messages.
ham_messages = [
    "Hi, how are you? Let's meet up for lunch tomorrow.",
    "Hey, I've attached the document you requested. Please review it when you get a chance.",
    "Reminder: Our meeting is scheduled for 2:00 PM today. See you there!",
]

# Run each group through the classifier and print the cleaned text next to
# the predicted label, with a heading before each group.
for heading, examples in (
    ("Predictions for Spam Messages:", spam_messages),
    ("\nPredictions for Ham Messages:", ham_messages),
):
    print(heading)
    for raw_text in examples:
        message, label = predict_message(raw_text)
        print(f'Message: {message} - Label: {label}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Accuracy: 0.9704035874439462
Precision: 1.0
Recall: 0.7785234899328859
F1 Score: 0.8754716981132076
Predictions for Spam Messages:
Message: congratulations youve 1000 walmart gift card go httpbitly123456 claim - Label: spam
Message: urgent selected receive prize reply claim message claim reward - Label: spam

Predictions for Ham Messages:
Message: hi lets meet lunch tomorrow - Label: ham
Message: hey ive attached document requested please review get chance - Label: ham
Message: reminder meeting scheduled 200 pm today see - Label: ham
