In [1]:
def load_text(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    return text

# Example for one file
file_path = '/Users/zhaolongjiang/Desktop/UK txt/UK Immigration changes on 4 April 2024 – the latest details.txt'  # Replace this with the actual file path
text = load_text(file_path)


In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

def preprocess_text(text):
    # Tokenize
    tokens = word_tokenize(text.lower())

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [w for w in tokens if not w in stop_words]

    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(w) for w in filtered_tokens]
    
    return lemmatized_tokens

# Preprocess the text
preprocessed_text = preprocess_text(text)


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/zhaolongjiang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/zhaolongjiang/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/zhaolongjiang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
from gensim import corpora, models

# Create a dictionary and corpus required for Topic Modeling
dictionary = corpora.Dictionary([preprocessed_text])
corpus = [dictionary.doc2bow(text) for text in [preprocessed_text]]

# Apply LDA
lda_model = models.LdaModel(corpus, num_topics=5, id2word=dictionary, passes=15)

# Print topics
topics = lda_model.print_topics(num_words=5)
for topic in topics:
    print(topic)


(0, '0.003*"," + 0.003*"salary" + 0.003*"." + 0.003*"*" + 0.003*"change"')
(1, '0.003*"," + 0.003*"." + 0.003*"salary" + 0.003*"*" + 0.003*"skilled"')
(2, '0.003*"," + 0.003*"." + 0.003*"salary" + 0.003*"change" + 0.003*"role"')
(3, '0.003*"." + 0.003*"," + 0.003*"salary" + 0.003*"skilled" + 0.003*"*"')
(4, '0.040*"," + 0.039*"." + 0.030*"salary" + 0.019*"change" + 0.019*"*"')


In [4]:
from textblob import TextBlob

blob = TextBlob(text)
sentiment = blob.sentiment
print(f"Polarity: {sentiment.polarity}, Subjectivity: {sentiment.subjectivity}")


Polarity: 0.21766338500209473, Subjectivity: 0.4735156868221385
