In [1]:
! gdown --id 1N7rk - kfnDFIGMeX0ROVTjKh71gcgx -7R

usage: gdown [-h] [-V] [-O OUTPUT] [-q] [--fuzzy] [--id] [--proxy PROXY] [--speed SPEED]
             [--no-cookies] [--no-check-certificate] [--continue] [--folder] [--remaining-ok]
             [--format FORMAT] [--user-agent USER_AGENT]
             url_or_id
gdown: error: unrecognized arguments: - kfnDFIGMeX0ROVTjKh71gcgx -7R


# **1) Import the required libraries.**

In [22]:
import string
import nltk
nltk.download('stopwords')
nltk.download('punkt')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# **2) Reading the data file, then splitting the data into features and labels. After creating a dataframe, the data from each column is saved into corresponding "messages" and "labels" variables.**

In [5]:
# Location of the ham/spam message CSV.
datapath = '/content/sample_data/2cls_spam_text_cls (1).csv'
df = pd.read_csv(datapath)

# Pull each column out as a plain Python list: string category labels and raw message texts.
labels = df['Category'].tolist()
messages = df['Message'].tolist()

df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# **3) Preprocessing Data**

In [35]:
def lowercase(text):
  """Return `text` with all cased characters converted to lower case."""
  lowered = text.lower()
  return lowered
def remove_punctuation(text):
  """Return `text` with every character in `string.punctuation` deleted."""
  # The third argument of maketrans lists the characters to drop.
  deletion_table = str.maketrans('', '', string.punctuation)
  return text.translate(deletion_table)
def remove_stopwords(tokens, stop_words=None):
  """Drop stop words from a token list, preserving the order of the rest.

  Args:
    tokens: iterable of token strings.
    stop_words: optional iterable of words to remove. When omitted (None),
      NLTK's English stop-word list is used, as before. Passing a list here
      lets callers supply a custom vocabulary or reuse one loaded earlier.

  Returns:
    A new list containing the tokens that are not stop words.
  """
  if stop_words is None:
    stop_words = nltk.corpus.stopwords.words('english')
  # A set gives constant-time membership checks inside the filter below.
  stop_words = set(stop_words)
  return [token for token in tokens if token not in stop_words]
def stemming(tokens):
  """Return the Porter stem of each token, in the original order."""
  porter = nltk.stem.PorterStemmer()
  return list(map(porter.stem, tokens))
def tokenize(text):
  """Split `text` into word tokens with NLTK's `word_tokenize`."""
  word_tokens = nltk.word_tokenize(text)
  return word_tokens
def preprocess_text(text):
  """Normalise one raw message into a space-separated string of stemmed tokens.

  Steps, in order: lower-case, strip punctuation, tokenize, remove stop words, stem.
  """
  # NOTE(review): punctuation is stripped before stop-word removal, so
  # apostrophes are already gone when stop words are matched - confirm intended.
  cleaned = remove_punctuation(lowercase(text))
  words = tokenize(cleaned)
  kept = remove_stopwords(words)
  stems = stemming(kept)
  return ' '.join(stems)
def preprocess_data(messages):
  """Run `preprocess_text` on every message and return the results as a list."""
  return [preprocess_text(raw_message) for raw_message in messages]
# Normalise the whole corpus once; the dictionary and feature vectors are built from this list.
preprocessed_messages = preprocess_data(messages)


# **4) Create a dictionary containing every unique token that appears in the preprocessed messages (each token is stored once, mapped to its occurrence count).**

In [36]:
def create_dictionary(messages):
  """Count how often each whitespace-separated token occurs across `messages`.

  Returns a dict mapping token -> total number of occurrences; keys are in
  first-seen order.
  """
  counts = {}
  for text in messages:
    for word in text.split():
      counts[word] = counts.get(word, 0) + 1
  return counts
# Vocabulary (token -> corpus frequency) built from the preprocessed messages.
dictionary = create_dictionary(preprocessed_messages)


# **5) Create features that represent the information (words) of messages. One simple method is to rely on the frequency of occurrence. For each message, the representation vector will have a size equal to the number of words contained in the dictionary.**

In [37]:
from tkinter.constants import X
def create_features(tokens, dictionary):
  """Build a bag-of-words count vector for one message.

  Args:
    tokens: iterable of token strings belonging to a single message.
    dictionary: vocabulary mapping whose key order defines the feature order.
      (The parameter was previously misspelled, so the function silently read
      the global `dictionary` instead of the argument it was given.)

  Returns:
    A 1-D float numpy array of length len(dictionary); position i holds how
    many times the i-th dictionary key occurs in `tokens`. Tokens that are not
    in the dictionary are ignored.
  """
  # Resolve token -> column once per call instead of rebuilding
  # list(dictionary.keys()) and scanning it linearly for every token.
  column_of = {token: column for column, token in enumerate(dictionary)}
  features = np.zeros(len(dictionary))
  for token in tokens:
    column = column_of.get(token)
    if column is not None:
      features[column] += 1
  return features
# Stack one count vector per message into a 2-D array: one row per message,
# one column per dictionary token.
X = np.array([
  create_features(message.split(), dictionary)
  for message in preprocessed_messages
])
# String category labels as an array; they are encoded to integers in the next step.
y = np.array(labels)

# **6) Preprocessing the labels**

In [38]:
# Encode the string categories as integers.
# Fit on the original `labels` list rather than on `y` itself: re-running this
# cell would otherwise refit the encoder on already-encoded integers, after
# which `le.inverse_transform` would return numbers instead of category names.
le = LabelEncoder()
y = le.fit_transform(labels)


# **7) Splitting the dataset into training, validation, and testing sets.**

In [62]:
VAL_SIZE = 0.3   # fraction of the full dataset held out for validation
TEST_SIZE = 0.1  # fraction of the remaining (non-validation) data held out for testing
SEED = 0

# Step 1: carve the validation split off the full dataset.
X_train, X_val, y_train, y_val = train_test_split(
  X, y,
  test_size=VAL_SIZE,
  random_state=SEED,
  shuffle=True,
)
# Step 2: split what is left into the final training and test sets.
X_train, X_test, y_train, y_test = train_test_split(
  X_train, y_train,
  test_size=TEST_SIZE,
  random_state=SEED,
  shuffle=True,
)

In [66]:
# Gaussian Naive Bayes classifier constructed with its default arguments.
model = GaussianNB()

In [67]:
print('start')
# `fit` trains the estimator in place and returns that same estimator,
# so `model` refers to the fitted classifier afterwards.
model.fit(X_train, y_train)
print('completed')

start
completed


# **8) Evaluating the model**

In [68]:
# Score the validation split.
y_val_pred = model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)

# Score the test split.
y_test_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)

print('Validation Accuracy:', val_accuracy)
print('Test Accuracy:', test_accuracy)

Validation Accuracy: 0.8809808612440191
Test Accuracy: 0.8487179487179487


# **9) Testing**

In [69]:
def predict(text, model, dictionary):
  """Classify a single raw message.

  Args:
    text: raw, unprocessed message string.
    model: fitted classifier exposing `predict`.
    dictionary: vocabulary used to build the feature vector.

  Returns:
    The decoded class label(s) for the message, as returned by
    `le.inverse_transform` (the notebook-level label encoder).
  """
  processed_text = preprocess_text(text)
  # Vectorise the preprocessed tokens. Passing the raw string here made
  # create_features iterate over single characters of the unprocessed input,
  # so inference used different features than training.
  features = create_features(processed_text.split(), dictionary)
  features = np.array(features).reshape(1, -1)  # one sample, all features
  prediction = model.predict(features)
  prediction_cls = le.inverse_transform(prediction)
  return prediction_cls
# Two ad-hoc inputs: an ordinary English sentence and a string made almost entirely of punctuation.
test_one = "My name is Le tu Minh Kien. at present, I am undergraduate in International University"
test_two = ")U*&$)(*@)(*)(&*^#@)"

prediction_cls_one = predict(test_one, model, dictionary)
prediction_cls_two = predict(test_two, model, dictionary)

for predicted_label in (prediction_cls_one, prediction_cls_two):
  print(predicted_label)



['ham']
['spam']
