In [None]:
# Import necessary libraries

from operator import index

import pandas as pd

!pip install langdetect
import langdetect as detect

import numpy as np

import os

import nltk
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

from google.colab import drive

# Download necessary data for natural language processing tasks

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('omw-1.4')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [None]:
# Mount Google Drive to access data

drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
# Read in spam data from CSV file

df = pd.read_csv('/content/drive/MyDrive/Training/spam.csv',
                 sep=',', header=0, on_bad_lines='skip', encoding = "ISO-8859-1")

In [None]:
# Drop any "Unnamed" columns

unnamed_cols = df.columns[df.columns.str.contains("Unnamed")]
df.drop(columns=unnamed_cols, inplace=True)

In [None]:
# Define regular expression variables to remove from SMS messages

reg_vars = ['http\S+', 'www\S+', 'https\S+', '\W\s+', '\d+', '\t+', '\d+', '\-+', '\\+', '\/+', '\"+', '\#+', '\++', '\@+', '\$+', '\%+', '\^+', '\&+', '\*+', '\(+', '\)+', '\[+', '\]+', '\{+', '\}+', '\|+', '\;+', '\:+', '\<+', '\>+', '\?+', '\,+', '\.+', '\=+', '\_+', '\~+', '\`+', '\s+']

df.replace(reg_vars, ' ', regex=True, inplace=True)

df.drop_duplicates(inplace=True)

df.replace('', np.nan, inplace=True)

df.dropna(inplace=True)

In [None]:
# Remove rows with non-ASCII characters from the dataframe

df = df[df['v2'].map(lambda x: x.isascii())]

In [None]:
# Drop non-English rows from the dataframe

for i in range(len(df)):
    try:
        ['v2'][i] = detect.detect(df['v2'][i])
        if df['v2'][i] != 'en':
            df.drop(i, inplace=True, index=False)
    except:
        pass

In [None]:
# Convert all the text data into lowercase

df['v2'] = df['v2'].astype(str).str.lower()

In [None]:
# Retrieve a list of English stop words and assign it to a var

stopwords = nltk.corpus.stopwords.words("english")

In [None]:
# Tokenize the SMS messages in the dataframe

df['TokenSMS'] = df.apply(lambda column: nltk.word_tokenize(column['v2']), axis=1)

In [None]:
df['TokenSMS'].head(2)

0    [go, until, jurong, point, crazy, available, o...
1                       [ok, lar, joking, wif, u, oni]
Name: TokenSMS, dtype: object

In [None]:
# Create a column containing the Tokenized words without the stopwords

df['StopTokenSMS'] = df['TokenSMS'].apply(lambda x: [item for item in x if item not in stopwords])

In [None]:
# Create a coolumn containting the StopTokenSMS text with words less than 2 characters

df['LengthTokenSMS'] = df['StopTokenSMS'].apply(lambda x: ' '.join([item for item in x if len(item)>2]))

In [None]:
# Instantiate and the assigne a variable to the WordNetLemmatizer class

wordnet_lem = WordNetLemmatizer()

In [None]:
# Create a new column which contains the lemmatized words

df['LemTokenSMS'] = df['LengthTokenSMS'].apply(wordnet_lem.lemmatize)

In [None]:
# Clean all dataframes again

reg_vars = ['http\S+', 'www\S+', 'https\S+', '\W\s+', '\d+', '\t+', '\d+', '\-+', '\\+', '\/+', '\"+', '\#+', '\++', '\@+', '\$+', '\%+', '\^+', '\&+', '\*+', '\(+', '\)+', '\[+', '\]+', '\{+', '\}+', '\|+', '\;+', '\:+', '\<+', '\>+', '\?+', '\,+', '\.+', '\=+', '\_+', '\~+', '\`+', '\s+']

df.replace(reg_vars, ' ', regex=True, inplace=True)

df.replace('', np.nan, inplace=True)

df.dropna(inplace=True)

In [None]:
# Initialize a CountVectorizer object

cv = CountVectorizer()

In [None]:
# fit_transform the data to a numpy array

x = cv.fit_transform(df['LemTokenSMS']).toarray()

In [None]:
x.shape

(4672, 6869)

In [None]:
# Replace 'spam' with 1 and 'ham' with 0
df['v1'] = df['v1'].replace({'spam': 1, 'ham': 0})

In [None]:
# Store the labels in  y

y = df['v1'].values

In [None]:
y.shape

(4672,)

In [None]:
# convert y to to int type

y = y.astype('int')

In [None]:
# Split the data into a training set and testing set

x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [None]:
x_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
y_train.astype

<function ndarray.astype>

In [None]:
# Initialize a MultinomialNB objec

mnb=MultinomialNB()

In [None]:
# Training the classifier and making predictions on the test data

mnb.fit(x_train,y_train)
y_pred=mnb.predict(x_test)
print(accuracy_score(y_test,y_pred))

0.9689839572192513


In [None]:
import pickle

# Save the model to a file
with open('model.pkl', 'wb') as file:
    pickle.dump(mnb, file)

# Save the model to a file
with open('cv.pkl', 'wb') as file:
    pickle.dump(cv, file)

In [None]:
from google.colab import files

files.download('model.pkl')
files.download('cv.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Create a preprocessing function to process new text

def clean_dataframe(df):
    reg_vars = ['http\S+', 'www\S+', 'https\S+', '\W\s+', '\d+', '\t+', '\d+', '\-+', '\\+', '\/+', '\"+', '\#+', '\++', '\@+', '\$+', '\%+', '\^+', '\&+', '\*+', '\(+', '\)+', '\[+', '\]+', '\{+', '\}+', '\|+', '\;+', '\:+', '\<+', '\>+', '\?+', '\,+', '\.+', '\=+', '\_+', '\~+', '\`+', '\s+']
    df['text'].replace(reg_vars, ' ', regex=True, inplace=True)
    df['text'] = df['text'].astype(str).str.lower()
    df['text'] = df.apply(lambda column: nltk.word_tokenize(column['text']), axis=1)
    stopwords = nltk.corpus.stopwords.words('english')
    df['text'] = df['text'].apply(lambda x: [item for item in x if item not in stopwords])
    df['text'] = df['text'].apply(lambda x: ' '.join([item for item in x if len(item)>2]))
    df['text'] = df['text'].apply(wordnet_lem.lemmatize)


In [None]:
data = [{"text": "Urgent dont miss news dun say so early hor... U c already then say lucky man"}]

ndf = pd.DataFrame(data)

clean_dataframe(ndf)

In [None]:
newtext = cv.transform(ndf['text']).toarray()
prediction = mnb.predict(newtext)
print(prediction)

[0]
