<a href="https://colab.research.google.com/github/Mouneeshsaravanan/BHARATH-INTERN-/blob/main/Project_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout

# Download NLTK resources
# (the stopwords corpus is read by stopwords.words('english') further down)
nltk.download('stopwords')

# Load external dataset for training and testing with explicit encoding
# Later code reads the 'label' and 'text' columns of this frame.
external_data_path = '/content/spam.csv'
external_data = pd.read_csv(external_data_path, encoding='latin-1')

# Shared text-preprocessing resources; both are read by preprocess_text()
# Stop words are kept in a set so the per-word membership test is cheap.
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

def preprocess_text(text):
    """Normalize a raw message into a space-separated string of stemmed words.

    Every character that is not an ASCII letter is replaced by a space, the
    result is lower-cased and split on whitespace, English stop words are
    dropped, and each remaining word is stemmed with the module-level
    ``ps`` stemmer.

    Args:
        text: The raw message. Non-string values (for example ``None`` or the
            float NaN that pandas uses for a missing CSV cell) are treated as
            an empty message instead of raising ``TypeError`` in ``re.sub``.

    Returns:
        str: The cleaned message; ``''`` when nothing survives the filtering.
    """
    if not isinstance(text, str):
        return ''
    # Digits, punctuation and non-ASCII characters all become word separators.
    letters_only = re.sub(r'[^a-zA-Z]', ' ', text)
    words = letters_only.lower().split()
    stems = [ps.stem(word) for word in words if word not in stop_words]
    return ' '.join(stems)

# Convert string labels to integers (1 for 'spam', 0 for every other value)
external_data['label'] = (external_data['label'] == 'spam').astype(int)

# Clean the training text with the same preprocess_text() that is applied to
# the prediction dataset, so the tokenizer vocabulary is built from the same
# kind of tokens (stemmed, stop words removed) it is asked to encode later.
external_data['processed_text'] = external_data['text'].apply(preprocess_text)

# Tokenization
# NOTE: the vocabulary is fitted on the full dataset, before the train/test
# split performed further down.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(external_data['processed_text'])
X = tokenizer.texts_to_sequences(external_data['processed_text'])

# Padding sequences
maxlen = 100  # maximum sequence length
X = pad_sequences(X, padding='post', maxlen=maxlen)

# Define CNN model: embedding -> 1D convolution -> global max pooling ->
# dense hidden layer with dropout -> single sigmoid output for spam / not spam
model = Sequential([
    # The embedding table size is taken from the tokenizer so the two cannot
    # drift apart if the vocabulary limit is changed in one place only.
    Embedding(input_dim=tokenizer.num_words, output_dim=50, input_length=maxlen),
    Conv1D(128, 5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Hold out 20% of the samples; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    external_data['label'],
    test_size=0.2,
    random_state=42,
)

# Fit for five epochs, reporting metrics on the held-out split after each one
model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=5,
    validation_data=(X_test, y_test),
)

# Score the trained model on the held-out split and report its accuracy
loss, accuracy = model.evaluate(x=X_test, y=y_test)
print("External Dataset Model Evaluation:")
print("Accuracy:", accuracy)

# Read the messages that are to be classified
new_dataset_path = '/content/prediction spam msg.csv'
new_dataset = pd.read_csv(new_dataset_path)

# Clean every message, then encode and pad it with the already-fitted
# tokenizer and the same maximum length used for the training sequences
new_dataset['processed_text'] = new_dataset['text'].apply(preprocess_text)
X_new = pad_sequences(
    tokenizer.texts_to_sequences(new_dataset['processed_text']),
    padding='post',
    maxlen=maxlen,
)

# Define a function to convert label from 1/0 to "spam"/"not spam"
def label_to_text(label):
    """Return the display string for a binary class label.

    Yields "spam" when ``label`` compares equal to 1 and "not spam" for any
    other value.
    """
    if label == 1:
        return "spam"
    return "not spam"

# Predict using the trained model
predictions = model.predict(X_new)
# Threshold the sigmoid outputs at 0.5 to obtain 0/1 class labels
predictions_binary = (predictions > 0.5).astype(int)
# Iterating predictions_binary directly would hand label_to_text() a
# 1-element array per message (the model ends in Dense(1)); flatten first so
# it receives a plain scalar and does not depend on array truthiness.
predictions_text = [label_to_text(pred) for pred in predictions_binary.ravel()]

# Display predictions for the new dataset
new_dataset['predicted_label'] = predictions_text
print("\nPredictions for New Dataset:")
print(new_dataset[['text', 'predicted_label']])


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
External Dataset Model Evaluation:
Accuracy: 0.9874439239501953

Predictions for New Dataset:
                              text predicted_label
0        Claim your prize now!                spam
1   Meeting rescheduled to 3 PM.          not spam
