In [554]:
import pandas as pd
import numpy as np
import json
import nltk
import string
from sklearn.metrics import confusion_matrix

# My custom built Naive Bayes Classifier 
from naive_bayes import NaiveBayesClassifier

# Natural Language Toolkit
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.sentiment import SentimentIntensityAnalyzer
# Fetch the VADER lexicon that SentimentIntensityAnalyzer reads (reported as already up-to-date when present)
nltk.download('vader_lexicon')

# English stop-word list used by Processer.pre_processing to drop filler words.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be installed already - confirm
stop_words = stopwords.words('english')
# Single shared English Snowball stemmer, reused for every token
snowball_stemmer = SnowballStemmer('english')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/leon/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [555]:
# Data Handling

data = pd.read_csv('car-reviews.csv')

# Positional split: rows 0-690 are taken as the negative reviews and every
# later row as the positive ones.
# NOTE(review): assumes the CSV is sorted that way - confirm against the file
negative_data, positive_data = data[:691], data[691:]

# Within each class the first 553 rows are used for training, the rest for testing
train_positive, test_positive = positive_data[:553], positive_data[553:]
train_negative, test_negative = negative_data[:553], negative_data[553:]

# Stack positives above negatives, renumber the rows from 0, and encode the
# label column as 1 for 'Pos' and 0 for anything else
raw_training_data = pd.concat(
    [train_positive, train_negative], ignore_index=True
).assign(Sentiment=lambda frame: np.where(frame['Sentiment'] == 'Pos', 1, 0))

raw_testing_data = pd.concat(
    [test_positive, test_negative], ignore_index=True
).assign(Sentiment=lambda frame: np.where(frame['Sentiment'] == 'Pos', 1, 0))

In [566]:
class Processer():
    """Converts raw review text into binary bag-of-words rows.

    Pipeline (driven by ``process``): every review is tokenised, cleaned and
    stemmed into a word-count table; the vocabulary of all stemmed words is
    collected; only the words that the NLTK sentiment analyser scores as
    non-neutral are kept; finally each review is encoded as
    ``[label, f1, f2, ...]`` where ``fi`` is 1 when the i-th kept word occurs
    in the review and 0 otherwise.

    The kept vocabulary is learned only by ``process(..., testing=False)``.
    Call that first so later ``testing=True`` calls produce rows with the
    same feature columns.
    """

    def __init__(self):
        # Feature vocabulary; stays None until process(testing=False) has run
        self.sentiment_words = None
        # Word tables of the first, middle and last training samples (display only)
        self.vector_sample = []
        # One entry per processed review, in processing order (training and testing alike)
        self.stemmed_sample = []
        self.non_stemmed_sample = []

    def pre_processing(self, sentiment, review):
        """Clean one review and build its word-count tables.

        Each whitespace-separated token is lower-cased and stripped of ASCII
        punctuation. Tokens that end up empty, are stop words, or consist only
        of decimal digits are dropped. The survivors are counted both as-is
        and after Snowball stemming.

        Args:
            sentiment: Label stored unchanged under the ``'Sentiment'`` key.
            review: Review text (a ``str``).

        Returns:
            ``{'Sentiment': sentiment, 'Word Table': {stemmed_word: count}}``.
            The same dict is appended to ``self.stemmed_sample``; the
            un-stemmed equivalent is appended to ``self.non_stemmed_sample``.
        """
        non_stemmed_counts = {}
        stemmed_counts = {}

        for raw_word in review.split():
            # Remove capital sensitivity and all punctuation
            word = ''.join(char for char in raw_word.lower() if char not in string.punctuation)

            # A token made only of punctuation collapses to '' and is not a word
            if not word:
                continue

            # Filter out stop words and purely numeric tokens
            if word in stop_words or word.isdecimal():
                continue

            # First sighting counts as 1 (not 0), so the tables hold true frequencies
            non_stemmed_counts[word] = non_stemmed_counts.get(word, 0) + 1

            stemmed_word = snowball_stemmer.stem(word)
            stemmed_counts[stemmed_word] = stemmed_counts.get(stemmed_word, 0) + 1

        stemmed_sample = {'Sentiment': sentiment, 'Word Table': stemmed_counts}
        non_stemmed_sample = {'Sentiment': sentiment, 'Word Table': non_stemmed_counts}

        self.stemmed_sample.append(stemmed_sample)
        self.non_stemmed_sample.append(non_stemmed_sample)

        return stemmed_sample

    def word_processing(self, training_data, testing):
        """Pre-process every row of a frame and collect the vocabulary.

        Args:
            training_data: DataFrame with ``'Sentiment'`` and ``'Review'``
                columns (used for both training and testing frames).
            testing: When falsy, the first, middle and last samples are also
                appended to ``self.vector_sample`` for later display.

        Returns:
            ``(all_words, processed_samples)``: the distinct stemmed words in
            first-seen order (an empty list for an empty frame) and the list
            of per-review sample dicts.
        """
        processed_samples = []

        # Row positions kept as demonstration samples; a set collapses
        # duplicates when the frame has fewer than three rows
        row_count = len(training_data)
        showcase_positions = {0, row_count // 2, row_count - 1}

        # Run each sample through pre-processing to filter and stem the words
        for position, (_, row) in enumerate(training_data.iterrows()):
            sample = self.pre_processing(row['Sentiment'], row['Review'])
            processed_samples.append(sample)

            if not testing and position in showcase_positions:
                self.vector_sample.append(sample)

        # A dict gives constant-time membership tests while preserving
        # first-seen order, which fixes the order of the feature columns
        vocabulary = {}
        for sample in processed_samples:
            for word in sample['Word Table']:
                vocabulary.setdefault(word, None)

        return list(vocabulary), processed_samples

    def sentiment_filter(self, all_words):
        """Keep only the words whose compound sentiment score is non-zero.

        Args:
            all_words: Iterable of words to score one at a time.

        Returns:
            List of the words whose ``polarity_scores(word)['compound']`` is
            not 0 (0 meaning neutral), in their original order.
        """
        # Initialise the sentiment intensity analyser
        sia = SentimentIntensityAnalyzer()

        return [word for word in all_words
                if sia.polarity_scores(word)['compound'] != 0]

    def process_binary_data(self, processed_samples):
        """Encode samples as fixed-length binary rows.

        Every row starts with the sample's label and is followed by one flag
        per entry of ``self.sentiment_words``: 1 when that word is a key of
        the sample's word table, otherwise 0. Because every row is driven by
        the same word list, all rows have the same length and the same
        column meaning; a word that is missing from a sample is simply
        recorded as 0 through an explicit membership test.

        Args:
            processed_samples: Sample dicts as returned by ``pre_processing``.

        Returns:
            ``numpy.ndarray`` with one row per sample, label in column 0
            (an empty array when there are no samples).
        """
        all_data = []

        for sample in processed_samples:
            word_table = sample['Word Table']

            # First element is the label, the rest are the features
            sample_data = [sample['Sentiment']]
            sample_data.extend(1 if word in word_table else 0
                               for word in self.sentiment_words)
            all_data.append(sample_data)

        return np.array(all_data)

    def process(self, raw_data, testing=False):
        """Run the whole pipeline on one frame.

        Args:
            raw_data: DataFrame with ``'Sentiment'`` and ``'Review'`` columns.
            testing: When falsy the feature vocabulary
                (``self.sentiment_words``) is re-learned from ``raw_data``;
                when truthy the previously learned vocabulary is reused, so a
                ``testing=False`` call must have happened first.

        Returns:
            The binary data table produced by ``process_binary_data``.
        """
        all_words, processed_samples = self.word_processing(raw_data, testing)

        if not testing:
            self.sentiment_words = self.sentiment_filter(all_words)

        return self.process_binary_data(processed_samples)

    def get_vector_samples(self):
        """Print the word tables captured in ``self.vector_sample``, one per line."""
        print('**********************  Vector Samples  **********************')
        print('\n')
        for sample in self.vector_sample:
            json_str = json.dumps(sample, indent=2)
            # Collapse the indented JSON back onto a single line
            flattened_str = json_str.replace('\n', ' ').replace('  ', '')
            print(flattened_str)
            print('\n')

    def get_stemmed_samples(self):
        """Print five hand-picked examples of words before and after stemming.

        The sample indices are hard-coded, so an ``IndexError`` is raised
        when fewer than 106 reviews have been processed by this instance.
        """
        stemmed_sample_data = [{'Index': 0,
                               'Stemmed Word': 'work'},
                               {'Index': 25,
                               'Stemmed Word': 'cost'},
                               {'Index': 42,
                               'Stemmed Word': 'comfort'},
                               {'Index': 97,
                               'Stemmed Word': 'replace'},
                               {'Index': 105,
                               'Stemmed Word': 'handl'}]

        print('**********************  Stemmed Samples  **********************')
        print('\n')

        for example_count, stemmed_data in enumerate(stemmed_sample_data, start=1):
            print(f'Stemmed Sample {example_count}')
            print('\n')
            index = stemmed_data['Index']
            word = stemmed_data['Stemmed Word']

            print('Training Data Index: ', index)
            print('Stemmed Word: ', word)
            print('\n')

            # Every un-stemmed word containing the stem text is listed as a source word
            for original_word in self.non_stemmed_sample[index]['Word Table']:
                if word in original_word:
                    print('Pre-Stemmed Word: ', original_word)

            print('\n')
            print('Pre-Stemmed Word Table:')
            print('\n')
            print(self.non_stemmed_sample[index]['Word Table'])
            print('\n')
            print('Stemmed Word Table:')
            print('\n')
            print(self.stemmed_sample[index]['Word Table'])
            print('\n')
            print('\n')
            print('\n')

In [567]:
def process_data(processor=None):
    """Build the binary training and testing tables from the raw review frames.

    Args:
        processor: Optional ``Processer`` to use. Pass one in to keep a
            reference to the fitted instance afterwards (for example to
            inspect its samples); a fresh ``Processer`` is created when
            omitted.

    Returns:
        ``(training_data, testing_data)`` as returned by ``processor.process``
        for ``raw_training_data`` and ``raw_testing_data`` respectively.
    """
    if processor is None:
        processor = Processer()

    # Training must run first: it learns the feature vocabulary that the
    # testing pass then reuses
    training_data = processor.process(raw_training_data, testing=False)
    testing_data = processor.process(raw_testing_data, testing=True)

    return training_data, testing_data



def custom_bayes(training_data, testing_data):
    """Train the custom Naive Bayes classifier and report how it scores.

    Displays a labelled 2x2 confusion matrix and prints the accuracy.

    Args:
        training_data: 2-D array whose column 0 is the label (as produced by
            ``Processer.process``).
        testing_data: 2-D array in the same layout; column 0 is compared with
            the classifier's predictions.
    """
    naive_bayes = NaiveBayesClassifier(training_data, testing_data)
    # NOTE(review): run() presumably returns one predicted label per test row - confirm
    predictions = naive_bayes.run()
    actual = testing_data[:, 0]

    # Pin the label order so the matrix is always 2x2, even when one class is
    # missing from both the true labels and the predictions; otherwise the
    # matrix would not fit the two row/column names below
    conf_matrix = confusion_matrix(actual, predictions, labels=[0, 1])
    conf_matrix_df = pd.DataFrame(conf_matrix, index=["Actual Negative", "Actual Positive"], columns=["Predicted Negative", "Predicted Positive"])
    display(conf_matrix_df)
    print('\n')

    accuracy = np.sum(predictions == actual) / len(actual)
    print(f"Accuracy: {round(accuracy, 3)}")
    print('\n')



def show_samples(processor=None):
    """Print the demonstration samples collected by a ``Processer``.

    Args:
        processor: The ``Processer`` whose samples should be shown. When
            omitted, a notebook-level variable named ``processor`` is used.

    Raises:
        NameError: If no processor is passed and no notebook-level
            ``processor`` variable exists.
    """
    if processor is None:
        # Fall back to the notebook namespace, which is what this function
        # relied on before it accepted an argument
        processor = globals().get('processor')

    if processor is None:
        raise NameError(
            "show_samples() needs a Processer: pass one in or define a "
            "notebook-level variable named 'processor'"
        )

    processor.get_vector_samples()
    processor.get_stemmed_samples()



# Build the binary feature tables, then train the classifier and report its results
training_data, testing_data = process_data()
custom_bayes(training_data, testing_data)
# Left disabled: show_samples() needs access to the Processer instance, which
# process_data() creates locally when called without arguments
# show_samples()

Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,115,23
Actual Positive,47,91




Accuracy: 0.746


