In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score



In [2]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
data=pd.read_csv("/kaggle/input/maildataset/mail_data.csv")

In [5]:
data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## Data preprocess

##### Label encoding

In [6]:
data.Category=data.Category.replace("ham",1)
data.Category=data.Category.replace("spam",0)

In [7]:
data.head()

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


### Cleaning Data 

In [8]:
from nltk.tokenize import word_tokenize
def clean_data (text):
    # clean from symbols 
    cleaded =re.sub(r'[^\w\s]','',text)
    cleaned_text = cleaded.lower()
    
    # Tokenize the text into individual words
    tokens = word_tokenize(text)
    
    # Get the list of stopwords
    stop_words = set(stopwords.words('english'))

    # Remove stopwords from the tokenized text
    filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
    
    joined_string = ' '.join(filtered_tokens)
    return joined_string

In [9]:
clean_data("film is good ")

'film good'

#### Splite data to features and target.

In [10]:
X=data.Message.apply(clean_data)
y=data.Category

In [11]:
X[0]

'Go jurong point , crazy.. Available bugis n great world la e buffet ... Cine got amore wat ...'

In [12]:
data.Message[0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

### Feature Extraction

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Feature extraction using TF-IDF
vectorize=TfidfVectorizer(max_features=2000)

### Spliting traing and test

In [14]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=.3,random_state=42)

#### Vectorize words

In [15]:
X_train=vectorize.fit_transform(X_train)
X_test=vectorize.transform(X_test)

In [16]:
import pickle

In [17]:
#Save vectorize 
pickle.dump(vectorize, open('vectorizer.pkl', 'wb'))

## Create models

In [18]:
model_naive=MultinomialNB()
model_naive.fit(X_train,y_train)

In [19]:
y_pred_naive=model_naive.predict(X_test)

In [20]:
acc_naive=accuracy_score(y_test,y_pred_naive)
acc_naive

0.9838516746411483

In [21]:
#save model to use in deploy
pickle.dump(model_naive,open('model_naive.pkl','wb'))

In [22]:
model_SVC=SVC()
model_SVC.fit(X_train,y_train)

In [23]:
y_pred_SVC=model_SVC.predict(X_test)

In [24]:
acc_SVC=accuracy_score(y_test,y_pred_SVC)
acc_SVC

0.9850478468899522

In [25]:
pickle.dump(model_SVC,open('model_SVC.pkl','wb'))

In [26]:
model_NN=MLPClassifier()
model_NN.fit(X_train,y_train)

In [27]:
y_pred_NN=model_NN.predict(X_test)

In [28]:
acc_NN=accuracy_score(y_test,y_pred_NN)
acc_NN

0.9868421052631579

In [29]:
pickle.dump(model_NN, open('model_NN.pkl', 'wb'))

## Test models 

In [30]:
test_text=vectorize.transform([clean_data("this film is good")])
print("model_naive:",model_naive.predict(test_text)[0])
print("model_SVC:",model_SVC.predict(test_text)[0])
print("model_NN:",model_NN.predict(test_text)[0])

model_naive: 1
model_SVC: 1
model_NN: 1
