In [1]:
import pandas as pd

In [3]:
data = pd.read_csv('../messages.csv')
data.head()

Unnamed: 0,subject,message,label
0,job posting - apple-iss research center,content - length : 3386 apple-iss research cen...,0
1,,"lang classification grimes , joseph e . and ba...",0
2,query : letter frequencies for text identifica...,i am posting this inquiry for sergei atamas ( ...,0
3,risk,a colleague and i are researching the differin...,0
4,request book information,earlier this morning i was on the phone with a...,0


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


In [5]:
# Remove rows with missing 'message' data
data_clean = data.dropna(subset=['message'])

In [6]:
#  Create a CountVectorizer object
vectorizer = CountVectorizer(stop_words='english')

In [7]:
# Fit and transform the 'message' data to create feature vectors
X = vectorizer.fit_transform(data_clean['message'])

In [8]:
y = data_clean['label']

In [9]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
# Create a Multinomial Naive Bayes classifier and train the model
model = MultinomialNB()
model.fit(X_train, y_train)

In [11]:
# Evaluate the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Confusion matrix:\n{conf_matrix}')
print(f'Classification report:\n{class_report}')


Accuracy: 0.9913644214162349
Confusion matrix:
[[460   4]
 [  1 114]]
Classification report:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99       464
           1       0.97      0.99      0.98       115

    accuracy                           0.99       579
   macro avg       0.98      0.99      0.99       579
weighted avg       0.99      0.99      0.99       579



In [13]:
# Export the model to a file
import joblib

joblib.dump(model, 'model.joblib')


['model.joblib']