<a href="https://colab.research.google.com/github/MorrisLesinko/Machine_Learning_Projects/blob/main/NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# SMS Spam Classification: TF-IDF Features + Multinomial Naive Bayes

In [None]:
import nltk

# Fetch the NLTK data this script relies on: the English stop-word list and
# the WordNet corpus that backs WordNetLemmatizer.
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)

# Imports for text preprocessing, feature extraction, modelling and evaluation
import csv
import re

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Load the dataset: one record per line, "<label>\t<raw message text>".
# QUOTE_NONE makes pandas split on tabs only. The messages are free text, and
# under the default quoting rules a stray double quote opens a "quoted field"
# that swallows the following lines, silently merging several messages into
# one row.
dataset = pd.read_csv(
    "/content/SMSSpamCollection",
    sep='\t',
    names=["labels", "message"],
    quoting=csv.QUOTE_NONE,
)

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# Preparing the corpus: for every message, replace each non-letter character
# with a space, lowercase, split into words, drop English stop words,
# lemmatize what is left and join the words back into one string.
stop_words = set(stopwords.words('english'))
non_letters = re.compile(r'[^a-zA-Z]')  # compiled once, reused for every message
corpus = []
for message in dataset['message']:
    words = non_letters.sub(' ', message).lower().split()
    review = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    corpus.append(' '.join(review))

# Label encoding: 1 for 'spam', 0 for everything else ('ham')
y = (dataset['labels'] == 'spam').astype(int).to_numpy()

# Splitting the cleaned messages into the Training set and Test set.
# The split is done BEFORE vectorization so that the TF-IDF vocabulary and
# IDF weights are learned from training messages only; fitting the vectorizer
# on the full corpus would leak test-set statistics into the features and
# make the reported accuracy optimistic.
corpus_train, corpus_test, y_train, y_test = train_test_split(
    corpus, y, test_size=0.20, random_state=0
)

# Vectorization: fit the TF-IDF model on the training text, then apply the
# already-fitted model to the test text. The matrices are left sparse
# (no .toarray()): MultinomialNB accepts sparse input, and a dense copy of a
# mostly-zero matrix only costs memory.
vectorizer = TfidfVectorizer(max_features=2500)
X_train = vectorizer.fit_transform(corpus_train)
X_test = vectorizer.transform(corpus_test)

# Fit a Multinomial Naive Bayes classifier on the training features
# (fit() returns the estimator itself, so construction and fitting chain).
classifier = MultinomialNB().fit(X_train, y_train)

# Predict labels for the held-out test features
y_pred = classifier.predict(X_test)

# Evaluate: confusion matrix (rows = true labels, columns = predictions),
# then the overall fraction of correct predictions.
cm = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
print(cm)
print("Accuracy:", accuracy)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


[[954   1]
 [ 23 137]]
Accuracy: 0.97847533632287
