Author: Zhile Xu

UUN: s2500393

In [None]:
import os
import glob
import spacy
import pandas as pd
import seaborn as sns
import string
import matplotlib.pyplot as plt
from collections import Counter

# turn off deprecation warnings and future warnings
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

In [None]:
# load the spaCy pipeline ("en_core_web_lg"); the resulting `nlp` object is
# shared by every later cell that parses text
nlp = spacy.load("en_core_web_lg")

# function to load data
def load_data(base_dir):
    """Read every ``*.txt`` file below ``base_dir/positive`` and ``base_dir/negative``.

    Args:
        base_dir: Directory that contains the ``positive`` and ``negative``
            sub-directories.

    Returns:
        A tuple ``(data, labels, files)`` of three parallel lists: the file
        contents, the class label (1 for ``positive``, 0 for ``negative``)
        and the path each text was read from. Positive files come first;
        within a class the files are ordered by path.
    """
    data = []
    labels = []
    files = []
    for label in ['positive', 'negative']:
        label_id = 1 if label == 'positive' else 0
        pattern = os.path.join(base_dir, label, '*.txt')
        # glob returns paths in arbitrary, OS-dependent order; sorting keeps
        # document indices (and anything seeded on them) reproducible
        for filepath in sorted(glob.glob(pattern)):
            with open(filepath, 'r', encoding='utf-8') as file:
                data.append(file.read())
            labels.append(label_id)
            files.append(filepath)

    return data, labels, files

In [None]:
# Load training data: three parallel lists holding the raw texts, their
# 1/0 class labels and the source file paths
train_data, train_labels, train_files = load_data('data/train')

## Preprocess the Data

In [None]:
# delete irrelevant text
def remove_irrelevant_text(data):
    """Cut every text off at the decision heading.

    For each text only the part before the first occurrence of
    "What I've decided and why" is kept; a text without that heading is
    returned unchanged. The input list itself is not modified.
    """
    marker = "What I've decided and why"
    return [document.partition(marker)[0] for document in data]


# # calculate the occurrence of "What I've decided and why" in the text
# def count_irrelevant_text(data):
#     count = 0
#     for text in data:
#         if "What I've decided and why" in text:
#             count += 1
#     return count

# count_pre = count_irrelevant_text(train_data)
# print("The number of irrelevant text is: ", count_pre)
# # print the number of documents in the training data
# print("The number of documents in the training data is: ", len(train_data))

# # remove irrelevant text
# train_data = remove_irrelevant_text(train_data)
# count_after = count_irrelevant_text(train_data)
# print("The number of irrelevant text after removing is: ", count_after)

# preprocess the texts
def preprocess_texts(texts):
    """Run the module-level spaCy pipeline ``nlp`` over every text.

    Args:
        texts: Iterable of raw text strings.

    Returns:
        A list with one parsed spaCy document per input text, in input order.
    """
    # nlp.pipe feeds the texts through the pipeline in batches instead of
    # invoking the whole pipeline once per text; the documents come back in
    # the same order as the inputs
    return list(nlp.pipe(texts))

# preprocess the training data
# NOTE(review): the call to remove_irrelevant_text is commented out above, so
# the full, untruncated texts are parsed here -- confirm this is intended
docs = preprocess_texts(train_data)

# separate positive and negative documents
# (label 1 = positive, label 0 = negative, as assigned in load_data)
positive_docs = [doc for doc, label in zip(docs, train_labels) if label == 1]
negative_docs = [doc for doc, label in zip(docs, train_labels) if label == 0]


In [None]:
# function to remove stopwords and punctuation
def remove_stopwords_punctuation(doc):
    """Reduce a parsed document to the tokens worth keeping.

    A token is dropped when it is a stop word, is punctuation, is exactly a
    single newline or one of the titles Mr / Mrs / Miss / Ms, or has a text
    of at most one character.

    Returns:
        A list of the surviving tokens in their original order.
    """
    excluded_texts = ['\n', 'Mr', 'Mrs', 'Miss', 'Ms']
    kept = []
    for tok in doc:
        if tok.is_stop or tok.is_punct:
            continue
        if tok.text in excluded_texts:
            continue
        if len(tok.text) <= 1:
            continue
        kept.append(tok)
    return kept


# filter the tokens of every document: once for the whole training set and
# once each for the positive and negative subsets
cleaned_docs = [remove_stopwords_punctuation(doc) for doc in docs]
positive_cleaned_docs = [remove_stopwords_punctuation(doc) for doc in positive_docs]
negative_cleaned_docs = [remove_stopwords_punctuation(doc) for doc in negative_docs]

##  Lowercase and lemmatise the tokens
def lowercase_and_lemmatise(docs):
    """Replace every token by its lower-cased lemma.

    Args:
        docs: Iterable of token sequences; each token exposes ``lemma_``.

    Returns:
        A list with one list of lower-cased lemma strings per document.
    """
    return [[tok.lemma_.lower() for tok in tokens] for tokens in docs]

# lowercase and lemmatise the tokens
# turn the filtered tokens into lists of lower-cased lemma strings
# (whole training set, positive subset, negative subset)
lemmatised_docs = lowercase_and_lemmatise(cleaned_docs)
lemmatised_positive_docs = lowercase_and_lemmatise(positive_cleaned_docs)
lemmatised_negative_docs = lowercase_and_lemmatise(negative_cleaned_docs)

## Topic Modelling

In [None]:
# Build a Topic Model
from gensim.corpora import Dictionary
from gensim.models import LdaModel
import pyLDAvis.gensim
from gensim.models.coherencemodel import CoherenceModel
pyLDAvis.enable_notebook()

def build_topic_model(docs, dictionary, corpus, num_topics):
    """Train an LDA model and score it with the ``c_v`` coherence measure.

    Args:
        docs: Tokenised documents, passed to the coherence model as the
            reference texts.
        dictionary: gensim ``Dictionary`` used as ``id2word`` for the model.
        corpus: Bag-of-words representation of the documents.
        num_topics: Number of topics to fit.

    Returns:
        A tuple ``(lda_model, coherence_score)``. The score is also printed.
    """
    # train an LDA model (fixed random_state keeps runs repeatable)
    lda_model = LdaModel(corpus=corpus, id2word=dictionary, iterations=50,
                         num_topics=num_topics, passes=20, random_state=123)

    # compute the coherence score
    coherence_model = CoherenceModel(model=lda_model, texts=docs,
                                     dictionary=dictionary, coherence='c_v')
    # evaluate once and reuse the value for both the log line and the
    # return value instead of calling get_coherence() twice
    coherence_score = coherence_model.get_coherence()

    print(f'Num Topics: {num_topics}, Coherence Score: {coherence_score}')

    return lda_model, coherence_score


In [None]:
# call the function on the whole dataset
print("Topic model for the whole dataset")

# create a dictionary
dictionary = Dictionary(lemmatised_docs)

# filter out tokens that appear in less than 5 documents or more than 50% of the documents
dictionary.filter_extremes(no_below=5, no_above=0.5)

# create a bag-of-words representation of the documents
corpus = [dictionary.doc2bow(doc) for doc in lemmatised_docs]

# coherence score for every candidate topic count from 2 to 20
# dtype=float: without it the empty column is created with object dtype and
# stays that way after the assignments below; idxmax on an object column can
# raise TypeError on some pandas versions
coherence = pd.DataFrame(index=range(2, 21), columns=['coherence'], dtype=float)
for num_topics in coherence.index:
    _, coherence_score = build_topic_model(lemmatised_docs, dictionary, corpus, num_topics)
    coherence.loc[num_topics, 'coherence'] = coherence_score

In [None]:
# find the optimal number of topics
# coerce to a numeric dtype first so idxmax also works when the column was
# built with object dtype, and convert the index label to a plain int before
# it is used as a topic count
optimal_num_topics = int(pd.to_numeric(coherence['coherence']).idxmax())
print(f'The optimal number of topics is: {optimal_num_topics}')

In [None]:
# train the final LDA model with the topic count that achieved the highest
# coherence score in the sweep above (optimal_num_topics)
lda_model, _ = build_topic_model(lemmatised_docs, dictionary, corpus, optimal_num_topics)

In [None]:
# print the words in each topic, one line per topic id
# (print_topic gives a weighted-word summary, not the whole vocabulary)
for topic_id in range(optimal_num_topics):
    # get the words in the topic
    print(f'Topic {topic_id}: {lda_model.print_topic(topic_id)}')

In [None]:
# human-readable label for each LDA topic id
# NOTE(review): the labels cover topic ids 0-13 only and presumably describe
# one particular trained model -- re-check them whenever the model is
# retrained or optimal_num_topics changes
dict_topics = {
    0: "Car Insurance (vehicle loss, repair, and accident liability)",
    1: "Home Appliance Repair Insurance (appliance repairs, warranties, and claim denials)",
    2: "Property Repair Insurance (certified repairs, home repairs, and property settlements)",
    3: "Home Repair Insurance (work delays, property loss, and compensation)",
    4: "Travel Insurance (vehicle delays and travel issues)",
    5: "Car Insurance (driver and vehicle issues and claim denials)",
    6: "Home Boiler Insurance (boiler repairs, replacements, and engineer services)",
    7: "Legal Expense Insurance (legal consultations, disputes, and litigation costs)",
    8: "Car Insurance (vehicle valuation and settlement)",
    9: "Home Water Damage Insurance (water leaks, storm damage, and repairs)",
    10: "Subsidence Insurance (property foundation issues and related compensation)",
    11: "Insurance Fraud (misrepresentation and claim disputes)",
    12: "Insurance Premiums and Renewals (premium increases and renewal issues)",
    13: "Health Insurance (medical treatments, claim denials, and hospital issues)"
}

import random
random.seed(123)
# print 5 document text with their topics
for i in range(5):
    # choose a random document
    # randrange excludes the upper bound; randint(0, len(...)) includes it and
    # could therefore return an index one past the last document
    doc_id = random.randrange(len(lemmatised_docs))
    # get the document's topic distribution
    topic_distribution = lda_model.get_document_topics(corpus[doc_id])
    # get the topic with the highest probability
    topic_id = max(topic_distribution, key=lambda x: x[1])[0]
    # print the document text and the topic
    print(f'File: {train_files[doc_id]}')
    # the model's topic count is data-driven while dict_topics is a fixed
    # table, so fall back to a placeholder for ids that have no label
    print(f"Topic: {topic_id} - {dict_topics.get(topic_id, 'Unlabelled topic')}")
    print(train_data[doc_id][:500])
    print()

In [None]:
# assign topics to the documents: for each bag-of-words vector keep the id of
# the topic with the highest probability
topics = [max(lda_model[doc], key=lambda x: x[1])[0] for doc in corpus]

# create a dataframe with the topics
# NOTE: despite its name, the 'topic_label' column holds each document's
# class label from train_labels (1/0), not the name of a topic
df_topics = pd.DataFrame({'topic': topics, 'topic_label': train_labels})

def plot_topic_distribution(df_topics, dict_topics=dict_topics):
    """Plot how many documents fall into each topic, split by outcome.

    Args:
        df_topics: DataFrame with a ``topic`` column of topic ids and a
            ``topic_label`` column of 1/0 class labels.
        dict_topics: Mapping from topic id to the label shown on the y-axis.

    The figure is saved to ``plots/topic-distribution.png`` and then shown.
    The input frame is left unmodified.
    """
    # relabel on a copy: mapping the caller's columns in place would turn
    # every value into NaN if the function were called a second time
    plot_df = df_topics.assign(
        topic_label=df_topics['topic_label'].map({1: 'Upheld', 0: 'Not Upheld'}),
        topic=df_topics['topic'].map(dict_topics),
    )

    # set the aesthetic style of the plots
    sns.set_style("whitegrid")

    # create the count plot
    plt.figure(figsize=(12, 8))
    sns.countplot(y='topic', hue='topic_label', data=plot_df, palette='Set1')

    # add titles and labels
    plt.title('Distribution of Topics in the Dataset', fontsize=16)
    plt.ylabel('Topic', fontsize=14)
    plt.xlabel('Count', fontsize=14)

    # customize the legend
    plt.legend(title='Topic Label', title_fontsize='13', fontsize='12', loc='lower right')

    # adjust the y-axis tick labels for better readability
    plt.yticks(fontsize=12)

    # display the plot
    plt.tight_layout()
    # savefig does not create missing directories, so make sure it exists
    os.makedirs('plots', exist_ok=True)
    # save the plot in the highest quality
    plt.savefig('plots/topic-distribution.png', dpi=300)
    plt.show()


plot_topic_distribution(df_topics)

In [None]:
# draw word clouds for the first 4 topics (ids 0-3)
from wordcloud import WordCloud

def draw_word_cloud(lda_model, topic_id, dict_topics=dict_topics):
    """Render the 50 highest-weighted words of one topic as a word cloud.

    Args:
        lda_model: Trained LDA model providing ``show_topic``.
        topic_id: Id of the topic to draw.
        dict_topics: Mapping from topic id to the label used in the title.

    The figure is saved to ``plots/topic-<topic_id>-word-cloud.png`` and then
    shown.
    """
    # get the words in the topic as (word, weight) pairs
    words = lda_model.show_topic(topic_id, topn=50)

    # create a dictionary from the words
    word_dict = dict(words)

    # create the word cloud
    word_cloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(word_dict)

    # a topic id without an entry in dict_topics still gets a usable title
    topic_name = dict_topics.get(topic_id, 'Unlabelled topic')

    # plot the word cloud
    plt.figure(figsize=(10, 6))
    plt.imshow(word_cloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(f'Topic {topic_id}: {topic_name}', fontsize=16)
    plt.tight_layout()
    # savefig does not create missing directories, so make sure it exists
    os.makedirs('plots', exist_ok=True)
    # save the plot in the highest quality
    plt.savefig(f'plots/topic-{topic_id}-word-cloud.png', dpi=300)
    plt.show()

# draw the word cloud for the first four topics (ids 0-3)
for topic_id in range(4):
    draw_word_cloud(lda_model, topic_id)


In [None]:
# visualise the topics: build the pyLDAvis view from the trained model, the
# bag-of-words corpus and the dictionary; as the last expression of the cell
# its result is what the notebook displays
pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)

## Named Entity Recognition

In [None]:
# implement the named entity recognition
def extract_named_entities(docs):
    """Collect the text of every named entity found in ``docs``.

    Args:
        docs: Iterable of parsed documents exposing ``ents``.

    Returns:
        One flat list of entity strings, in document order and with
        duplicates kept.
    """
    return [entity.text for parsed in docs for entity in parsed.ents]

# extract named entities
named_entities = extract_named_entities(docs)

# print the categories of named entities
# read the labels straight from the parsed documents: re-parsing the joined
# entity strings strips their sentence context (so labels can differ), costs
# another full pipeline run, and the joined string can exceed nlp.max_length
named_entity_categories = {ent.label_ for doc in docs for ent in doc.ents}
print("categories of named entities: ", named_entity_categories)
# example output from an earlier run:
# {'NORP', 'LOC', 'DATE', 'GPE', 'EVENT', 'PRODUCT', 'LAW', 'PERSON', 
# 'PERCENT', 'TIME', 'WORK_OF_ART', 'CARDINAL', 'ORDINAL', 'ORG', 
# 'FAC', 'MONEY', 'LANGUAGE', 'QUANTITY'}

# delete language category
# discard (unlike remove) does not raise if the category never occurred
named_entity_categories.discard('LANGUAGE')
print("remaining categories: ", named_entity_categories)

# extract named entities for positive and negative documents
positive_named_entities = extract_named_entities(positive_docs)
negative_named_entities = extract_named_entities(negative_docs)


In [None]:
# plot named entities in different categories for positive and negative documents

def plot_named_entities(pos_named_entities, neg_named_entities, category, category_map):
    """Show the 20 most frequent entities of one category, side by side.

    Args:
        pos_named_entities: Entity strings from the positive documents.
        neg_named_entities: Entity strings from the negative documents.
        category: Entity label whose readable name goes into the titles.
        category_map: Mapping from entity label to a readable name.

    The left panel shows the positive counts and the right panel the
    negative counts.
    """
    # one frequency table per panel, limited to the 20 most common entities
    frames = []
    for entities in (pos_named_entities, neg_named_entities):
        top_entities = Counter(entities).most_common(20)
        frames.append(pd.DataFrame(top_entities, columns=['Named Entity', 'Frequency']))

    fig, axs = plt.subplots(1, 2, figsize=(14, 7))
    for axis, frame in zip(axs, frames):
        sns.barplot(x='Frequency', y='Named Entity', data=frame, ax=axis, palette='coolwarm')

    # translate the entity label into its more readable name for the titles
    readable_name = category_map[category]
    axs[0].set_title(f'Top 20 Positive {readable_name} Named Entities')
    axs[1].set_title(f'Top 20 Negative {readable_name} Named Entities')
    plt.tight_layout()
    plt.show()


# map of entity category labels to more readable names, used in plot titles
category_map = {'NORP': 'Nationalities, Religious/Political Groups',
                'LOC': 'Locations',
                'DATE': 'Dates',
                'GPE': 'Countries, Cities, States',
                'EVENT': 'Events',
                'PRODUCT': 'Products',
                'LAW': 'Laws',
                'PERSON': 'People',
                'PERCENT': 'Percentage',
                'TIME': 'Time',
                'WORK_OF_ART': 'Works of Art',
                'CARDINAL': 'Cardinal Numbers',
                'ORDINAL': 'Ordinal Numbers',
                'ORG': 'Organizations',
                'FAC': 'Facilities',
                'MONEY': 'Monetary Values',
                'LANGUAGE': 'Languages',
                'QUANTITY': 'Measurements'}

# plot named entities in different categories for positive and negative documents
# the entity texts and labels are taken from the already-parsed documents:
# re-running nlp over the joined entity strings for every category repeats a
# full pipeline run twice per category and labels the entities without their
# original context
# sorted() gives the plots a stable order (set iteration order is arbitrary)
for category in sorted(named_entity_categories):
    plot_named_entities([ent.text for doc in positive_docs for ent in doc.ents if ent.label_ == category],
                        [ent.text for doc in negative_docs for ent in doc.ents if ent.label_ == category],
                        category, category_map)