# 2. Multinomial Naive Bayes Model

In [1]:
# Import the dependencies used for the project
import pandas as pd

# ML libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,classification_report
import joblib

# nltk for text cleaning
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

In [2]:
# Load the cleaned dataset produced in section 1 into `data`.
# The path is relative, so the notebook has to be run from the folder that contains Data/.
data = pd.read_csv("Data/cleaned-news.csv")
# Bare last expression: display the loaded frame (Sentiment, Text and Word Count columns)
data

Unnamed: 0,Sentiment,Text,Word Count
0,neutral,seller eos innovationsmanagement group private...,12
1,neutral,believe however hkscan atrium start use import...,24
2,neutral,gross area eight house,11
3,neutral,utility also provide service related electrici...,21
4,neutral,amount included pensionable salary,11
...,...,...,...
4840,positive,proving good cellphone design truly work art w...,29
4841,positive,strong company brand esl shipping leipurin tel...,24
4842,neutral,swedish engineering consultant firm etteplan e...,23
4843,positive,commission income rose eur mn eur mn,17


### 2.1 Converting Sentiment to Numerical Values
 - -1 = Negative
 - 0 = Neutral
 - 1 = Positive

In [3]:
# Assign a numerical value to every row depending on its Sentiment text.
# "neutral" -> 0 and "positive" -> 1; any other value (including a missing one)
# falls back to -1, i.e. it is treated as negative.
sentiment_codes = {"neutral": 0, "positive": 1}

# A single vectorized .map() replaces a row-by-row Python loop and does not rely
# on the frame having a default 0..n-1 index.
sentiment_num = (
    data["Sentiment"]
    .map(sentiment_codes)   # labels without an entry in the dict become NaN
    .fillna(-1)             # ...and NaN is the catch-all "negative" case
    .astype(int)
    .tolist()               # plain list of ints, in row order
)

In [4]:
# Build the modelling frame in one step: numeric sentiment first, cleaned text second
ModelData = pd.DataFrame(
    {
        "Sentiment": sentiment_num,
        "Text": data["Text"],
    }
)
ModelData

Unnamed: 0,Sentiment,Text
0,0,seller eos innovationsmanagement group private...
1,0,believe however hkscan atrium start use import...
2,0,gross area eight house
3,0,utility also provide service related electrici...
4,0,amount included pensionable salary
...,...,...
4840,1,proving good cellphone design truly work art w...
4841,1,strong company brand esl shipping leipurin tel...
4842,0,swedish engineering consultant firm etteplan e...
4843,1,commission income rose eur mn eur mn


### 2.2 Preparing the Data for Modelling

In [5]:
# Use only the first 3,500 rows of ModelData (a positional slice, not a random sample):
# X holds the cleaned text, y the numeric sentiment label
X = ModelData["Text"].iloc[:3500]
y = ModelData["Sentiment"].iloc[:3500]

# Configure (but do not fit yet) the TF-IDF vectorizer that turns text into numerical
# features: unigrams and bigrams, at most 50,000 features, no lowercasing
# (the text shown above already appears to be lowercase).
vectorizer = TfidfVectorizer(
    max_features=50000,
    lowercase=False,
    ngram_range=(1, 2),
)

In [6]:
# Preview the first rows of X (the cleaned article text that will be vectorized)
X.head()

0    seller eos innovationsmanagement group private...
1    believe however hkscan atrium start use import...
2                               gross area eight house
3    utility also provide service related electrici...
4                   amount included pensionable salary
Name: Text, dtype: object

In [7]:
# Preview the first rows of y (sentiment labels: -1 = negative, 0 = neutral, 1 = positive)
y.head()

0    0
1    0
2    0
3    0
4    0
Name: Sentiment, dtype: int64

In [8]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible
train_data, test_data, train_label, test_label = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Show how many rows ended up in each partition
(train_data.shape, test_data.shape)

((2800,), (700,))

In [9]:
# Fit the vectorizer on the training text only and convert the result to a dense array
vector_train = vectorizer.fit_transform(train_data).toarray()

# Transform the test text with the vocabulary learned above (no re-fitting)
test_matrix = vectorizer.transform(test_data)
vector_test = test_matrix.toarray()

In [10]:
# Wrap the dense feature arrays in DataFrames whose columns are the learned n-gram names
feature_names = vectorizer.get_feature_names_out()

training_data = pd.DataFrame(vector_train, columns=feature_names)
testing_data = pd.DataFrame(vector_test, columns=feature_names)

### 2.3 Multinomial NB Model

In [11]:
# Create the Multinomial Naive Bayes classifier and fit it on the training features;
# fit() returns the fitted estimator, so it can be bound to `clf` directly
clf = MultinomialNB().fit(training_data.values, train_label.values)

# Predict a sentiment class for every row of the held-out test features
y_pred = clf.predict(testing_data.values)

In [12]:
# Count how many test rows were predicted in each sentiment class (-1, 0, 1)
pd.Series(y_pred).value_counts()

 0    595
 1    103
-1      2
dtype: int64

In [13]:
# Count the true sentiment classes in the test labels, as a baseline for the predicted counts
test_label.value_counts()

 0    425
 1    178
-1     97
Name: Sentiment, dtype: int64

In [14]:
# Classification report (per-class precision, recall and F1) for the held-out test data
print(classification_report(test_label , y_pred))

              precision    recall  f1-score   support

          -1       1.00      0.02      0.04        97
           0       0.69      0.96      0.80       425
           1       0.49      0.28      0.36       178

    accuracy                           0.66       700
   macro avg       0.72      0.42      0.40       700
weighted avg       0.68      0.66      0.58       700



In [15]:
# Classification report for the training data: predict on the same rows the model was
# fitted on, so the scores can be compared with the test-data report
y_pred_train = clf.predict(training_data.values)
print(classification_report(train_label.values , y_pred_train))

              precision    recall  f1-score   support

          -1       0.90      0.05      0.10       329
           0       0.78      0.99      0.87      1677
           1       0.82      0.66      0.73       794

    accuracy                           0.79      2800
   macro avg       0.83      0.57      0.57      2800
weighted avg       0.80      0.79      0.74      2800



In [16]:
# Overall accuracy on the training data (fraction of training rows predicted correctly)
accuracy_score(train_label , y_pred_train)

0.7875

In [17]:
# Overall accuracy on the held-out test data (fraction of test rows predicted correctly)
accuracy_score(test_label , y_pred)

0.66

In [18]:
# Save the fitted model; given a path, joblib opens and closes the file itself.
# NOTE: assumes the Models/ directory already exists.
joblib.dump(clf , 'Models/multinomial-NB-model.pkl')

# Save the fitted vectorizer. The context manager guarantees the file handle is
# flushed and closed instead of being left open until garbage collection.
with open("Models/multinomial-NB-vector.pkl", "wb") as vector_file:
    joblib.dump(vectorizer, vector_file)

### 2.4 Testing the Model

In [19]:
# Create the shared objects used for the text cleaning
ps = WordNetLemmatizer()

# Keep the stop words under their own name: rebinding `stopwords` would shadow the
# imported NLTK corpus module and make this cell raise AttributeError when re-run.
# A set gives constant-time membership tests in the comprehension below.
stop_words = set(stopwords.words("english"))
# nltk.download("wordnet")  # one-off download, needed only if the WordNet corpus is missing


# Define a function to clean the text
def cleaning_data(row: str) -> str:
    """Normalise one piece of raw text for the model.

    The text is lower-cased, every character that is not an ASCII letter is
    replaced by a space, the result is split on whitespace, English stop words
    are dropped and the remaining tokens are lemmatised.

    Parameters
    ----------
    row : str
        The raw text to clean.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces (an empty string when no
        token survives).
    """
    # Convert the text to lower case
    row = row.lower()

    # Keep letters only: digits, punctuation and other symbols become spaces
    row = re.sub('[^a-zA-Z]', ' ', row)

    # Split on whitespace to obtain the tokens
    token = row.split()

    # Lemmatise each word and drop stop words such as a, an, the, is, are ...
    news = [ps.lemmatize(word) for word in token if word not in stop_words]

    # Finally join all the tokens with a single space
    cleaned_news = ' '.join(news)

    return cleaned_news

In [20]:
# Run a single prediction with the model using a positive title from the dataset
positive = cleaning_data("commission income rose eur mn eur mn")

# Vectorize with the already-fitted vectorizer; predict() returns a one-element array
single_prediction = clf.predict(vectorizer.transform([positive]).toarray())

# Message per predicted class; anything other than 0 or 1 is reported as negative
messages = {
    0: "Your article is neutral",
    1: "Your article is positive",
}

print("The model says...")
print(messages.get(int(single_prediction[0]), "Your article is negative"))

The model says...
Your article is positive


In [21]:
# Run a single prediction with the model using a negative title from the dataset
negative = cleaning_data("operating loss non recurring item eur mn compared profit eur mn")

# Vectorize with the already-fitted vectorizer; predict() returns a one-element array
single_prediction = clf.predict(vectorizer.transform([negative]).toarray())

# Message per predicted class; anything other than 0 or 1 is reported as negative
messages = {
    0: "Your article is neutral",
    1: "Your article is positive",
}

print("The model says...")
print(messages.get(int(single_prediction[0]), "Your article is negative"))

The model says...
Your article is negative


In [22]:
# Run a single prediction with the model using a neutral title from the dataset
neutral = cleaning_data("also technopolis plan build million euro technology park special economic zone neudorf st petersburg st petersburg government said february")

# Vectorize with the already-fitted vectorizer; predict() returns a one-element array
single_prediction = clf.predict(vectorizer.transform([neutral]).toarray())

# Message per predicted class; anything other than 0 or 1 is reported as negative
messages = {
    0: "Your article is neutral",
    1: "Your article is positive",
}

print("The model says...")
print(messages.get(int(single_prediction[0]), "Your article is negative"))

The model says...
Your article is neutral
