In [1]:
# Specify the encoding to avoid UnicodeDecodeError
import numpy as np
import pandas as pd
# Load the raw SMS dataset. The explicit encoding is needed because the default
# decoding raised UnicodeDecodeError on this file.
df = pd.read_csv(
    'spam.csv',
    encoding='ISO-8859-1',
)

In [2]:
df.shape

(5572, 5)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [5]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder() 

In [6]:
# Give the two populated columns descriptive names: v1 holds the ham/spam
# label and v2 holds the message body.
column_names = {'v1': 'target', 'v2': 'text'}
df = df.rename(columns=column_names)
df.head()

Unnamed: 0,target,text,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [7]:
# Replace the string labels with integer codes; in the frame displayed
# afterwards, ham appears as 0 and spam as 1.
target_codes = encoder.fit_transform(df['target'])
df['target'] = target_codes

In [8]:
# Drop the three sparsely populated 'Unnamed' columns (50, 12 and 6 non-null
# values out of 5572 rows).
# errors='ignore' keeps this cell re-runnable: once the columns are gone, a
# second execution is a no-op instead of raising KeyError. The result is
# rebound rather than mutated in place so the step reads as a pure transform.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')

In [9]:
df.head()

Unnamed: 0,target,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [10]:
# missing values
df.isnull().sum()

target    0
text      0
dtype: int64

In [11]:
# check for duplicate values
df.duplicated().sum()

403

In [12]:
# Keep only the first occurrence of every repeated row. The surviving rows
# keep their original index labels (the index is not reset).
is_repeat = df.duplicated(keep='first')
df = df[~is_repeat]

In [13]:
df.duplicated().sum()

0

In [14]:
df.shape

(5169, 2)

In [15]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import string

# Fetch the NLTK resources used below: the 'punkt' tokenizer models and the
# 'stopwords' word lists.
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

# Built once at definition time. The original evaluated
# stopwords.words('english') for every token, producing a new list and doing a
# linear membership scan each time; a frozenset gives constant-time lookups.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))

# A single stemmer instance is reused across calls instead of constructing a
# new one for every message.
_STEMMER = PorterStemmer()


def transform_text(text):
    """Normalize a raw message into a space-separated string of stemmed tokens.

    Steps, in order:
      1. lowercase the text;
      2. split it into tokens with ``word_tokenize``;
      3. keep only tokens for which ``str.isalnum()`` is true (this is what
         removes punctuation and tokens such as ``'m`` or ``n't``);
      4. drop English stopwords;
      5. reduce each remaining token to its Porter stem.

    Args:
        text: The message to clean; must be a ``str``.

    Returns:
        The surviving stems joined by single spaces. An empty string is
        returned when no token survives the filters.
    """
    tokens = word_tokenize(text.lower())

    # One pass over the tokens: filter, then stem the ones that are kept.
    return " ".join(
        _STEMMER.stem(token)
        for token in tokens
        if token.isalnum() and token not in _ENGLISH_STOPWORDS
    )




[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\91915\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\91915\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
# Example usage
result = transform_text("I'm gonna be home soon and I don't want to talk about this stuff anymore tonight, k? I've cried enough today.")
print(result)

gon na home soon want talk stuff anymor tonight k cri enough today


In [17]:
# Show the raw message stored under index label 10, for comparison with the
# cleaned example above. A single .loc lookup replaces the chained indexing.
df.loc[10, 'text']

"I'm gonna be home soon and i don't want to talk about this stuff anymore tonight, k? I've cried enough today."

In [18]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
ps.stem('loving')

'love'

In [19]:
# Run every message through transform_text and store the cleaned result
# alongside the raw text.
df['transformed_text'] = df['text'].map(transform_text)

In [20]:
df.head()

Unnamed: 0,target,text,transformed_text
0,0,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...
1,0,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entri 2 wkli comp win fa cup final tkt 21...
3,0,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah think goe usf live around though


In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5169 entries, 0 to 5571
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   target            5169 non-null   int32 
 1   text              5169 non-null   object
 2   transformed_text  5169 non-null   object
dtypes: int32(1), object(2)
memory usage: 270.4+ KB


In [22]:
# Set parameters for tokenization and padding
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Conv1D, MaxPooling1D, LSTM, Dense, Embedding, Dropout
# Hyperparameters shared by the tokenizer, the sequence padding and the
# Embedding layer defined further down.
vocab_size = 5000  # Passed to Tokenizer as num_words and to Embedding as input_dim
max_length = 100   # Maximum length of each input sequence

# Create the tokenizer only; it is fitted on the transformed text column in a
# later cell.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer

<keras.src.preprocessing.text.Tokenizer at 0x20bf7527390>

In [23]:
# Build the tokenizer's word index from the cleaned messages.
# NOTE(review): this fits on the full corpus before the train/test split, so
# the vocabulary also reflects messages that end up in the test set; confirm
# that is acceptable for the reported accuracy.
tokenizer.fit_on_texts(df['transformed_text'])

In [24]:
tokenizer

<keras.src.preprocessing.text.Tokenizer at 0x20bf7527390>

In [25]:
# Turn each cleaned message into a list of integer word ids, then pad every
# list to exactly max_length entries.
sequences = tokenizer.texts_to_sequences(df['transformed_text'])
X = pad_sequences(sequences, maxlen=max_length)

# Binary labels as a NumPy array, row-aligned with X.
y = df['target'].values

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [96]:
# Step 3: Build the Model
# The layers are listed in the order the data flows through them.
model = Sequential([
    # Embedding layer: maps each word id to a 128-dimensional vector
    Embedding(input_dim=vocab_size, output_dim=128, input_length=max_length),
    # CNN layer: 64 filters over windows of 2 consecutive tokens
    Conv1D(filters=64, kernel_size=2, activation='relu'),
    # First LSTM layer: returns the full sequence so it can feed the second LSTM
    LSTM(88, return_sequences=True),
    # Dropout for regularization
    Dropout(0.3),
    # Second LSTM layer: returns only its final output
    LSTM(44),
    # Output layer: sigmoid activation for binary classification
    Dense(1, activation='sigmoid'),
])

# Compile the model
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)


In [98]:
# Step 4: Train the Model
# NOTE(review): the held-out test split is also passed as validation data, so
# there is no separate validation set; keep that in mind if these validation
# metrics are ever used for tuning or early stopping.
history = model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=2,
    validation_data=(X_test, y_test),
)

Epoch 1/2
Epoch 2/2


In [99]:
# Step 5: Evaluate the Model
# evaluate() yields the loss followed by the accuracy metric set at compile time.
test_metrics = model.evaluate(X_test, y_test)
loss, accuracy = test_metrics
print(f'Test Accuracy: {accuracy * 100:.2f}%')

Test Accuracy: 98.26%


In [100]:
import pickle

In [106]:
# Persist the trained model. The context manager flushes and closes the file
# deterministically; the original inline open() left the handle to be closed
# by garbage collection.
# NOTE(review): a pickled Keras model is tied to the TensorFlow/Keras version
# that wrote it (2.14.0 here) and must only be unpickled from trusted files;
# consider Keras' own model.save() format for long-lived artifacts.
with open('model2.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [107]:
import tensorflow
print(tensorflow.__version__)

2.14.0


In [108]:
# Persist the fitted tokenizer so inference can reuse the same word index.
# The context manager flushes and closes the file deterministically; the
# original inline open() left the handle to be closed by garbage collection.
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)