# NLP (Natural language processing)

## 1: Key Concepts, text data cleaning

In [1]:
from tabulate import tabulate
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer 
from nltk.stem import SnowballStemmer
from scipy.spatial.distance import pdist, squareform
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer 
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,accuracy_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.svm import SVC 
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline  import Pipeline, FeatureUnion, make_pipeline
from sklearn.naive_bayes import MultinomialNB
import numpy as np
import pandas as pd
import string
import nltk
import math
 

# stopwords

In [2]:
corpus = ["Jack stole my tuna sandwich.", 
    "'Help!' I sobbed, sandwichlessly.", 
    "'Drop the sandwiches!' said the sandwich police."]

## Function for tokenize °||° Corpus -> list of documents

In [3]:
def our_tokenizer(doc, stops=None, stemmer=None):
    doc = word_tokenize(doc.lower())
    tokens = [''.join([char for char in tok if char not in string.punctuation]) for tok in doc]
    tokens = [tok for tok in tokens if tok]
    if stops:
        tokens = [tok for tok in tokens if (tok not in stops)]
    if stemmer:
        tokens = [stemmer.stem(tok) for tok in tokens]
    return tokens

creating a list of token lists, where each internal list represents the tokens of a document in the corpus.

In [4]:
tokenized_docs = [our_tokenizer(doc) for doc in corpus]
tokenized_docs

[['jack', 'stole', 'my', 'tuna', 'sandwich'],
 ['help', 'i', 'sobbed', 'sandwichlessly'],
 ['drop', 'the', 'sandwiches', 'said', 'the', 'sandwich', 'police']]

## Step 1: lowercase, lose punctuation, split into tokens
```
# 'i' in stopwords
  is True 

  because 
    ∴  
Row: ['jack', 'stole', 'my', 'tuna', 'sandwich'], Conteo de 'i': 1
Row: ['help', 'i', 'sobbed', 'sandwichlessly'], Conteo de 'i': 2
Row: ['drop', 'the', 'sandwiches', 'said', 'the', 'sandwich', 'police'],4 Counting  'i': 7
```

In [5]:
# validate the occurrences of "i" in all sublists

def contar_i(lista):
    return sum(words.lower().count('i') for words in lista)

table_data = []
for i, renglon in enumerate(tokenized_docs, start=1):
    count_i = contar_i(renglon)
    table_data.append([f'Row  {i}'      , count_i])

total_i_in_all_lists = sum(count for _, count in table_data)
table_data.append(["Total     ∴'i'∴", total_i_in_all_lists])
print(tabulate(table_data, headers=["tokenized_docs", "i  "], tablefmt="grid"))


+------------------+-------+
| tokenized_docs   |   i   |
| Row  1           |     1 |
+------------------+-------+
| Row  2           |     2 |
+------------------+-------+
| Row  3           |     4 |
+------------------+-------+
| Total     ∴'i'∴  |     7 |
+------------------+-------+


In [6]:
# Filter out tokenized words and exclude those that are stopwords.
from nltk.corpus import stopwords
nltk.download('stopwords')
corpus = list(corpus)

# Get the stopwords in English
stopwords_english = set(stopwords.words('english'))

# Then apply tokenization and stopword filtering
tokenized_docs = [our_tokenizer(doc, stops=stopwords_english) for doc in corpus]

tokenized_docs

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[['jack', 'stole', 'tuna', 'sandwich'],
 ['help', 'sobbed', 'sandwichlessly'],
 ['drop', 'sandwiches', 'said', 'sandwich', 'police']]

## Step 2: remove stop words
```
[
 ['jack', 'stole', 'tuna', 'sandwich'],
 ['help', 'sobbed', 'sandwichlessly'],
 ['drop', 'sandwiches', 'said', 'sandwich', 'police']
]
```


In [7]:
# Stemming process of reducing words to their root or base, which often 
# involves removing suffixes and prefixes.
tokenized_docs = [our_tokenizer(doc, stops=stopwords.words('english'), stemmer=SnowballStemmer('english')) for doc in corpus]
tokenized_docs

[['jack', 'stole', 'tuna', 'sandwich'],
 ['help', 'sob', 'sandwichless'],
 ['drop', 'sandwich', 'said', 'sandwich', 'polic']]

## Step 3: Stemming/Lemmatization

Vocabulary:
```
['drop', 'help', 'jack', 'tuna', 'police', 'said', 'sandwich', 'sandwichlessly', 'sobbed', 'stole']
```

In [8]:
# It is initialized as an empty set that way vocab_set will be
# ready to store unique elements without duplicates
vocabulary = list(set(word for doc in tokenized_docs for word in doc))
print(vocabulary)

['jack', 'drop', 'polic', 'sandwich', 'tuna', 'sob', 'sandwichless', 'said', 'help', 'stole']


## 2: Count Vectorizer, TFIDF 

In the given list, the words that have a term frequency (TF) value of zero are those that do not appear in the document. In the first document ['jack', 'stole', 'sandwich', 'tuna'], the words "jack" and "stole" have a term frequency value of zero

In [9]:
# Calculates and returns the term frequency of each word 
# in the vocabulary for the provided document.
def cal_tf(document, vocabulary):
    tf_vector = []
    total_words = len(document)
    for words in vocabulary:
        tf = document.count(words) / total_words
        tf_vector.append(tf)
    return tf_vector


#Calculates and returns the document frequency of each word
# in the vocabulary across the provided set of tokenized documents
def cal_df(tokenized_docs, vocabulary):
    df_vector = []
    total_tokenized_docs = len(tokenized_docs)
    for words in vocabulary:
        df = sum(1 for doc in tokenized_docs if words in doc) / total_tokenized_docs
        df_vector.append(df)
    return df_vector


#It returns a list of IDF values, where each value
# represents the importance of a term in the context of 
# the entire document collection.
def calc_idf(df_vector):
    idf_vector = [math.log(1 / df) if df != 0 else 0 for df in df_vector]
    return idf_vector


#The function computes the TF-IDF values for a document
# by multiplying the term frequency (TF) values by the inverse 
# document frequency (IDF) values for each term. The result is a 
# list of TF-IDF values representing the importance of each term in 
# the document relative to the entire collection of documents.
def cal_tfidf(tf_vector, idf_vector):
    return [tf * idf for tf, idf in zip(tf_vector, idf_vector)]

tf_vectors = [cal_tf(doc, vocabulary) for doc in tokenized_docs]
df_vector = cal_df(tokenized_docs, vocabulary)
idf_vector = calc_idf(df_vector)
tfidf_vectors = [list(map(lambda x: round(x, 3), cal_tfidf(tf, idf_vector))) for tf in tf_vectors]

similarities = cosine_similarity(tfidf_vectors)
print(similarities)


[[1.         0.         0.08115802]
 [0.         1.         0.        ]
 [0.08115802 0.         1.        ]]


```
Vector 0 is very similar to itself (similarity 1.).
Vector 0 is not very similar to vector 1 (similarity 0.).
Vector 0 is not very similar to vector 2 (similarity 0.08115802).
Vector 1 is very similar to itself (similarity 1.).
Vector 1 is not very similar to vector 0 (similarity 0.).
Vector 1 is not very similar to vector 2 (similarity 0.).
Vector 2 is very similar to itself (similarity 1.).
Vector 2 is not very similar to vector 0 (similarity 0.08115802).
Vector 2 is very similar to vector 1 (similarity 0.).
```

## 3: Example with Spam data 

In [10]:
df= pd.read_table(r'SMSSpamCollection', header=None)
df.head(4)

Unnamed: 0,0,1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...


In [11]:
df.columns=['spam', 'msg']
df.head(4)

Unnamed: 0,spam,msg
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...


In [12]:
#The following code removes prepositions and punctuation marks 
# or characters from the original text.
stopwords_set = set(stopwords.words('english'))
punctuation_set = set(string.punctuation)
data = [['Stopwords', len(stopwords_set)], ['Punctuation', len(punctuation_set)]]
print(tabulate(data, headers=['Filter', 'Quantity'], tablefmt='presto'))

 Filter      |   Quantity
-------------+------------
 Stopwords   |        179
 Punctuation |         32


In [13]:
# We create the column 'msg_ cleaned' in lowercase
df['msg_cleaned'] = df['msg'].apply(lambda x: ' '.join([word.lower() for word in x.split() if word.lower() not in stopwords_set and word.lower() not in punctuation_set]))
df.head(4)

Unnamed: 0,spam,msg,msg_cleaned
0,ham,"Go until jurong point, crazy.. Available only ...","go jurong point, crazy.. available bugis n gre..."
1,ham,Ok lar... Joking wif u oni...,ok lar... joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,U dun say so early hor... U c already then say...,u dun say early hor... u c already say...


In [14]:
# Create a CountVectorizer object for transforms the text data 
# in the msg_ cleaned column of the DataFrame df into an array 
# of term counts
count_vect= CountVectorizer()
X= count_vect.fit_transform(df.msg_cleaned)
print(tabulate([['X.shape', X.shape]], headers=['Array', ' rows  | tokens'], tablefmt='simple'))

Array     rows  | tokens
-------  -----------------
X.shape  (5572, 8693)


In [15]:
y=df.spam
y_info = [['', 'rows'],
          ['Length', len(y)]]
print(tabulate(y_info, headers='firstrow', tablefmt='simple'))

          rows
------  ------
Length    5572


In [16]:
# Iterate from 10% to 90% in 10% increments
def calcular_scores(X, y):
    resultados = []
    for porcentaje in range(10, 91, 10):  
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=porcentaje/100, random_state=42)
        lg = LogisticRegression()
        lg.fit(X_train, y_train)
        score = lg.score(X_test, y_test)
        resultados.append({'% test': porcentaje, 'Score': score})
    return pd.DataFrame(resultados)

resultados = calcular_scores(X, y)
resultados = resultados.sort_values(by='Score', ascending=False)
print(tabulate(resultados, headers='keys', tablefmt='grid'))

+----+----------+----------+
|    |   % test |    Score |
|  1 |       20 | 0.984753 |
+----+----------+----------+
|  3 |       40 | 0.984747 |
+----+----------+----------+
|  0 |       10 | 0.983871 |
+----+----------+----------+
|  2 |       30 | 0.983852 |
+----+----------+----------+
|  4 |       50 | 0.981335 |
+----+----------+----------+
|  5 |       60 | 0.978768 |
+----+----------+----------+
|  6 |       70 | 0.974109 |
+----+----------+----------+
|  7 |       80 | 0.969493 |
+----+----------+----------+
|  8 |       90 | 0.956929 |
+----+----------+----------+


#### We use 10%

In [17]:
# X_train, X_test, y_train, y_test= train_test_split(X,y)    #By default it uses 25% 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
lg= LogisticRegression()
lg.fit(X_train,y_train)
y_pred=lg.predict(X_test)
lg.score(X_test,y_test)
confusion_matrix(y_test, y_pred)

array([[478,   1],
       [ 11,  68]], dtype=int64)

In [18]:
# number of correct and incorrect predictions
print(tabulate([["correct prediction", sum(y_test == y_pred)],["wrong prediction",len(y_test) - sum(y_test == y_pred)]], headers=["Type", "Quantity"], tablefmt="grid"))

+--------------------+------------+
| Type               |   Quantity |
| correct prediction |        546 |
+--------------------+------------+
| wrong prediction   |         12 |
+--------------------+------------+


## 4: Tweak model with Spam data 

In [19]:
df.head(4)

Unnamed: 0,spam,msg,msg_cleaned
0,ham,"Go until jurong point, crazy.. Available only ...","go jurong point, crazy.. available bugis n gre..."
1,ham,Ok lar... Joking wif u oni...,ok lar... joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,U dun say so early hor... U c already then say...,u dun say early hor... u c already say...


In [20]:
tfidf= TfidfVectorizer()  
X= tfidf.fit_transform(df.msg_cleaned)
y=df.spam 
X_train, X_test, y_train, y_test= train_test_split(X,y)  
## try random forest 
rf= RandomForestClassifier()
rf.fit(X_train,y_train)
y_pred=rf.predict(X_test)
rf.score(X_test,y_test)
confusion_matrix(y_test, y_pred)   

array([[1215,    0],
       [  28,  150]], dtype=int64)

In [21]:
tfidf=TfidfVectorizer()
X=tfidf.fit_transform(df.msg_cleaned)
y=df.spam
X_train, X_test, y_train, y_test=  train_test_split(X,y)
## try Logistic Regression
lg= LogisticRegression()
lg.fit(X_train,y_train)
y_pred=lg.predict(X_test)
lg.score(X_test,y_test)
confusion_matrix(y_test, y_pred) 

array([[1215,    1],
       [  60,  117]], dtype=int64)

In [22]:
# Try tfidf with bigrams & trigrams 
tfidf=TfidfVectorizer(ngram_range=(1,3)) 
X= tfidf.fit_transform(df.msg_cleaned)
y=df.spam
X_train, X_test, y_train, y_test= train_test_split(X,y)
## try Logistic Regression
lg= LogisticRegression()
lg.fit(X_train,y_train)
y_pred=lg.predict(X_test)
lg.score(X_test,y_test)
confusion_matrix(y_test, y_pred) 

array([[1210,    1],
       [ 125,   57]], dtype=int64)

In [23]:
# Try tfidf with bigrams & trigrams 
tfidf=TfidfVectorizer(ngram_range=(1,3)) 
X= tfidf.fit_transform(df.msg_cleaned)
y=df.spam
X_train, X_test, y_train, y_test= train_test_split(X,y)
## try Random Forest
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[1201,    0],
       [  43,  149]], dtype=int64)

In [24]:
#try gradient boost 
gb= GradientBoostingClassifier()
gb.fit(X_train,y_train)
y_pred=gb.predict(X_test)
gb.score(X_test,y_test)
confusion_matrix(y_test, y_pred)

array([[1197,    4],
       [  40,  152]], dtype=int64)

## Section 5.5: Pipeline with Spam data 

In [25]:
# Define el conjunto de stopwords y conviértelo en una lista
stopwords_set = set(stopwords.words('english'))
stopwords_list = list(stopwords_set)
pipeline= Pipeline([('countvect', CountVectorizer(stop_words=stopwords_list)),\
                    #('tfidf', TfidfVectorizer(stop_words=stopwords_set)),\
                    ('lg',  LogisticRegression())])

X=df.msg_cleaned #note we are passing the cleaned msg to the pipeline 
y=df.spam
X_train, X_test, y_train, y_test= train_test_split(X,y) 

pipeline.fit(X_train, y_train) 
y_pred= pipeline.predict(X_test)
print (pipeline.score(X_test, y_test))
print (confusion_matrix(y_test, y_pred)) 

0.9755922469490309
[[1183    3]
 [  31  176]]


In [26]:
# Define el conjunto de stopwords
vectorizer = CountVectorizer(stop_words='english')
clf = MultinomialNB()

pipeline = Pipeline([
    ('vectorizer', vectorizer),
    ('clf', clf)
])

X=df.msg_cleaned 
y=df.spam
X_train, X_test, y_train, y_test= train_test_split(X,y) 

pipeline.fit(X_train, y_train) 
y_pred= pipeline.predict(X_test)
print (pipeline.score(X_test, y_test))
print (confusion_matrix(y_test, y_pred))

0.9885139985642498
[[1214    4]
 [  12  163]]


this is with MultinomialNB classifier

In [27]:

vectorizer = CountVectorizer(stop_words='english')
clf = MultinomialNB()

pipeline = Pipeline([
    ('vectorizer', vectorizer),
    ('clf', clf)
])

X=df.msg_cleaned 
y=df.spam
X_train, X_test, y_train, y_test= train_test_split(X,y) 

pipeline.fit(X_train, y_train) 
y_pred= pipeline.predict(X_test)
print (pipeline.score(X_test, y_test))
print (confusion_matrix(y_test, y_pred))

0.9849246231155779
[[1175   10]
 [  11  197]]


In [28]:
stopwords_set = set(stopwords.words('english'))
stopwords_list = list(stopwords_set)
pipeline= Pipeline([#('countvect', CountVectorizer(stop_words=stopwords_set)),\
                    ('countvect', CountVectorizer(stop_words=stopwords_list)),\
                    ('rf',  RandomForestClassifier())])
X=df.msg_cleaned #note we are passing the cleaned msg to the pipeline 
y=df.spam
X_train, X_test, y_train, y_test= train_test_split(X,y) 


pipeline.fit(X_train, y_train) 
y_pred= pipeline.predict(X_test)
print (pipeline.score(X_test, y_test))
print (confusion_matrix(y_test, y_pred))  
 

0.9806173725771715
[[1214    1]
 [  26  152]]


In [29]:
stopwords_set = set(stopwords.words('english'))
stopwords_list = list(stopwords_set)
pipeline= Pipeline([#('countvect', CountVectorizer(stop_words=stopwords_set)),\
                    ('countvect', CountVectorizer(stop_words=stopwords_list, ngram_range=(1,3))),\
                    ('rf',  RandomForestClassifier())])
X=df.msg_cleaned #note we are passing the cleaned msg to the pipeline 
y=df.spam
X_train, X_test, y_train, y_test= train_test_split(X,y) 


pipeline.fit(X_train, y_train) 
y_pred= pipeline.predict(X_test)
print (pipeline.score(X_test, y_test))
print (confusion_matrix(y_test, y_pred)) 

0.9691313711414213
[[1206    0]
 [  43  144]]


In [30]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV

stopwords_set = set(stopwords.words('english'))
stopwords_list = list(stopwords_set)

pipeline = Pipeline([
    ('countvect', CountVectorizer(stop_words=stopwords_list, ngram_range=(1,3))),
    ('rf', RandomForestClassifier())
])

param_grid = {
    'countvect__ngram_range': [(1, 1), (1, 2), (1, 3)],  # N-gram range adjustment
    'rf__n_estimators': [50, 100, 200],  # Adjusting the number of estimators in the RandomForestClassifier
    'rf__max_depth': [None, 10, 20]  # Setting the maximum depth of the RandomForestClassifier
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5)

grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

y_pred_best = best_model.predict(X_test)

print("Better hyperparameters:", best_params)
print("Accuracy of the best model:", best_model.score(X_test, y_test))
print("Best model confusion matrix:")
print(confusion_matrix(y_test, y_pred_best))



Better hyperparameters: {'countvect__ngram_range': (1, 1), 'rf__max_depth': None, 'rf__n_estimators': 200}
Accuracy of the best model: 0.9813352476669059
Best model confusion matrix:
[[1205    1]
 [  25  162]]


In [31]:
stopwords_set = set(stopwords.words('english'))
stopwords_list = list(stopwords_set)
pipeline= Pipeline([#('countvect', CountVectorizer(stop_words=stopwords_list)),\
                    ('tfidf', TfidfVectorizer(stop_words=stopwords_list)),\
                    ('lg',  LogisticRegression())])
X=df.msg_cleaned #note we are passing the cleaned msg to the pipeline 
y=df.spam
X_train, X_test, y_train, y_test= train_test_split(X,y) 


pipeline.fit(X_train, y_train) 
y_pred= pipeline.predict(X_test)
print (pipeline.score(X_test, y_test))
print (confusion_matrix(y_test, y_pred)) 

0.9655419956927495
[[1194    2]
 [  46  151]]


In [32]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

stopwords_set = set(stopwords.words('english'))
stopwords_list = list(stopwords_set)

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stopwords_list)),
    ('lg', LogisticRegression())
])

param_grid = {
    'lg__C': [0.1, 1.0, 10.0]  # Example of adjusting the regularization parameter C in logistic regression
}

X = df.msg_cleaned
y = df.spam
X_train, X_test, y_train, y_test = train_test_split(X, y)

grid_search = GridSearchCV(pipeline, param_grid, cv=5)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

y_pred_best = best_model.predict(X_test)

print("Better hyperparameters:", best_params)
print("Accuracy of the best model:", best_model.score(X_test, y_test))
print("Best model confusion matrix:")
print(confusion_matrix(y_test, y_pred_best))


Better hyperparameters: {'lg__C': 10.0}
Accuracy of the best model: 0.9813352476669059
Best model confusion matrix:
[[1196    2]
 [  24  171]]


In [33]:
from sklearn.feature_extraction.text import TfidfTransformer

vectorizer = CountVectorizer(stop_words='english')
tfidf = TfidfTransformer()
clf = MultinomialNB()

pipeline = Pipeline([
    ('vectorizer', vectorizer),
    ('tfidf', tfidf),
    ('clf', clf)
])

X=df.msg_cleaned 
y=df.spam
X_train, X_test, y_train, y_test= train_test_split(X,y) 

pipeline.fit(X_train, y_train) 
y_pred= pipeline.predict(X_test)
print (pipeline.score(X_test, y_test))
print (confusion_matrix(y_test, y_pred))

0.9669777458722182
[[1207    0]
 [  46  140]]


In [34]:
from sklearn.model_selection import GridSearchCV

# Define the hyperparameter grid to explore
param_grid = {
    'clf__alpha': [0.1, 0.5, 1.0]  # Example of tuning the alpha parameter of the Naive Bayes classifier
}

# Create a GridSearchCV object
grid_search = GridSearchCV(pipeline, param_grid, cv=5)

# Adjust the GridSearchCV object
grid_search.fit(X_train, y_train)

# Get the best hyperparameters and the best model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Make predictions with the best model
y_pred_best = best_model.predict(X_test)

# Print the precision and confusion matrix of the best model
print("Better hyperparameters:", best_params)
print("Accuracy of the best model:", best_model.score(X_test, y_test))
print("Best model confusion matrix:")
print(confusion_matrix(y_test, y_pred_best))


Better hyperparameters: {'clf__alpha': 0.1}
Accuracy of the best model: 0.9813352476669059
Best model confusion matrix:
[[1204    3]
 [  23  163]]


In [35]:
import joblib

#Save the best model
joblib.dump(best_model, 'best_model.pkl')


['best_model.pkl']

In [36]:
import joblib

# Load the saved model
loaded_model = joblib.load('best_model.pkl')

#Make predictions on new data
new_data = ["Example text 1", "Example text  2",'Quarterly Sales Summary',"¡Special offer! Win an amazing prize today!!!.","Receive your $1000 check today!",'Buy now and receive a 50% discount!',' Your inheritance is waiting, contact us!']
predictions = loaded_model.predict(new_data)

# Create a list of tuples (phrase, prediction)
results = list(zip(new_data, predictions))

# Print results with tabulate
print(tabulate(results, headers=["Phrase", "Prediction"], tablefmt="grid"))


+------------------------------------------------+--------------+
| Phrase                                         | Prediction   |
| Example text 1                                 | ham          |
+------------------------------------------------+--------------+
| Example text  2                                | ham          |
+------------------------------------------------+--------------+
| Quarterly Sales Summary                        | ham          |
+------------------------------------------------+--------------+
| ¡Special offer! Win an amazing prize today!!!. | spam         |
+------------------------------------------------+--------------+
| Receive your $1000 check today!                | spam         |
+------------------------------------------------+--------------+
| Buy now and receive a 50% discount!            | spam         |
+------------------------------------------------+--------------+
| Your inheritance is waiting, contact us!       | ham          |
+---------