In [1]:
# import packages

import pandas as pd

import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('omw-1.4')
nltk.download('wordnet')   # This downloads WordNet resource.
import re

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/henryasiamah/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/henryasiamah/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/henryasiamah/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/henryasiamah/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/henryasiamah/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# load data

df = pd.read_csv("/Users/henryasiamah/Desktop/work/cleaned.csv")
df.head()

Unnamed: 0,Date,Headline,Content,Source,Country/Organization,cleaned_corpus
0,2024-06-07,www.bbc.com,How many people cross the Channel in small boa...,BBC,UK,How many people cross the Channel in small boa...
1,2024-06-07,www.bbc.com,Far right eyes Europe vote surge and ditches G...,BBC,UK,Far right eyes Europe vote surge and ditches G...
2,2024-06-07,www.bbc.com,Greek court throws out shipwreck trial against...,BBC,UK,Greek court throws out shipwreck trial against...
3,2024-06-07,www.bbc.com,Migrants cross Channel for 10th consecutive da...,BBC,UK,Migrants cross Channel for 10th consecutive da...
4,2024-06-07,www.bbc.com,More than 700 people arrive by small boats in ...,BBC,UK,More than 700 people arrive by small boats in ...


In [3]:
df.tail()

Unnamed: 0,Date,Headline,Content,Source,Country/Organization,cleaned_corpus
5284,2021-04-22,Orange-bellied parrots leave Tasmania in bigge...,<p>Nearly 200 endangered orange-bellied parrot...,The Guardian,UK,Nearly 200 endangered orange-bellied parrots h...
5285,2021-04-21,Melting ice in Arctic linked to bowhead whales...,"<p>As the ice melts at pace in the Arctic, the...",The Guardian,UK,"As the ice melts at pace in the Arctic, the mi..."
5286,2021-03-01,A birder's calendar: where and when to watch A...,"<p>During Covid lockdown, birds in our backyar...",The Guardian,UK,"During Covid lockdown, birds in our backyards ..."
5287,2021-01-15,"Air pollution will lead to mass migration, say...",<p>Air pollution does not respect national bou...,The Guardian,UK,Air pollution does not respect national bounda...
5288,2021-01-17,In brief: Passing: An Alternative History of I...,"<h2><strong><a href=""https://guardianbookshop....",The Guardian,UK,Passing: An Alternative History of IdentityLip...


In [4]:
print(df["cleaned_corpus"].apply(type).value_counts())

cleaned_corpus
<class 'str'>      5288
<class 'float'>       1
Name: count, dtype: int64


In [5]:
"""
The functions below are used to preprocess the data for topic modleing. 
"""

# Tokenization
def tokenization(df_col):
    print("Tokenizing articles...\n")
    return df_col.apply(lambda x: word_tokenize(x) if isinstance(x, str) else [])

# Lowercase
def lowercase(df_col):
    print("Making all words lowercase...\n")
    return df_col.apply(lambda x: [token.lower() for token in x])

# Remove Non-Alphabetic Tokens
def only_alphabetic(df_col):
    print("Removing all non-alphabetic words...\n")
    return df_col.apply(lambda x: [token for token in x if re.match(r"^[a-zA-ZÀ-ÿ]+$", token)])

# Stopword Removal
stop_words = set(stopwords.words('english'))
custom_stop_words = ["refugee", "refugees", "migrant", "migrants", "immigrant", "immigrants",
                     "like", "say", "told", "make", "would", "want", "take", "must", "well",
                     "could", "even", "since", "also", "-", "know", "immigration", "migration",
                     "asylum seeker", "asylum seekers", "said"]
stop_words.update(custom_stop_words)

def stopword_removal(df_col):
    print("Removing Stopwords...\n")
    return df_col.apply(lambda x: [token for token in x if token not in stop_words and len(token) > 3])

# Lemmatization
def lemmatization(df_col):
    print("Lemmatizing words...\n")
    lemmatizer = WordNetLemmatizer()
    return df_col.apply(lambda x: [lemmatizer.lemmatize(token) for token in x])

# Preprocessing Pipeline
def preprocessing(df_col, *steps):
    if not isinstance(df_col, pd.Series):
        raise ValueError("Input must be a Pandas Series")
    
    print("Applying preprocessing steps...\n")
    temp = df_col.copy()
    for func in steps:
        temp = func(temp)
    return temp.apply(lambda x: " ".join(x) if isinstance(x, list) else "")

In [6]:
df["TM_corpus"] = preprocessing(df["cleaned_corpus"],
                               tokenization,
                               lowercase,
                               only_alphabetic,
                               stopword_removal,
                               lemmatization)

Applying preprocessing steps...

Tokenizing articles...

Making all words lowercase...

Removing all non-alphabetic words...

Removing Stopwords...

Lemmatizing words...



In [7]:
#printing the results

for i,line in df[["Content","cleaned_corpus","TM_corpus"]].sample(1).iterrows():
    print(line["Content"])
    print("---")
    print(line["cleaned_corpus"])
    print("---")
    print(line["TM_corpus"])
    print("-----------------------------------")

"We can't hold a referendum on immigration in France," Bruno Retailleau said, speaking to broadcaster LCI late on Sunday. "I regret that."
Asked whether he personally wanted to hold such a popular vote, the hardline minister said: "yes, a thousand times yes," adding that "the constitution would first have to be revised."
France currently cannot hold a referendum on immigration as the issue is not within the scope of subjects on which such votes can be held.
Retailleau said that over the past 50 years immigration has greatly affected French society, yet the French have not had the "opportunity to express their opinion".
The French have, of course, been able to express their opinion in the multiple elections in which anti-immigration parties have fielded candidates over the past 50 years.
Stating that he did not think that immigration presented "an opportunity" for France, the minister said it should be controlled and he was ready to use "all levers at our disposal."
"Immigration can onl

## Final Duplicates Removal

In [8]:
# remove duplicates
df.drop_duplicates(subset=['TM_corpus'], inplace=True)

In [9]:
# dropping irrelevant columns

# Drop the specified columns
df = df.drop(columns=['Headline', 'Content', 'cleaned_corpus'])

# Check the updated DataFrame
print("Columns after dropping:")
print(df.columns)

Columns after dropping:
Index(['Date', 'Source', 'Country/Organization', 'TM_corpus'], dtype='object')


In [11]:
# confirm if all values are strings

all_strings = df['TM_corpus'].apply(lambda x: isinstance(x, str)).all()

if all_strings:
    print("All values in the column are strings.")
else:
    print("Not all values in the column are strings.")

All values in the column are strings.


In [12]:
df.count()

Date                    5283
Source                  5283
Country/Organization    5283
TM_corpus               5283
dtype: int64

In [13]:
df.head()

Unnamed: 0,Date,Source,Country/Organization,TM_corpus
0,2024-06-07,BBC,UK,many people cross channel small boat many clai...
1,2024-06-07,BBC,UK,right eye europe vote surge ditch german party...
2,2024-06-07,BBC,UK,greek court throw shipwreck trial nine court g...
3,2024-06-07,BBC,UK,cross channel consecutive small boat crossed c...
4,2024-06-07,BBC,UK,people arrive small boat total people detected...


In [14]:
#remove missing values

df = df.dropna()

# Check the updated DataFrame
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 5283 entries, 0 to 5288
Data columns (total 4 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Date                  5283 non-null   object
 1   Source                5283 non-null   object
 2   Country/Organization  5283 non-null   object
 3   TM_corpus             5283 non-null   object
dtypes: object(4)
memory usage: 206.4+ KB
None


In [15]:
df.count()

Date                    5283
Source                  5283
Country/Organization    5283
TM_corpus               5283
dtype: int64

In [16]:
df.at[3642, 'TM_corpus']

'united nation torture prevention watchdog urged investigate australia handcuff asylum seeker seeking medical care practice advocate condemn inhumane unlawful public interest advocacy centre piac launched landmark test case federal court alleging practice handcuffing detainee medical transfer unlawful traumatic particularly history torture abuse piac requested subcommittee prevention torture investigate practice visit australia next month subcommittee expected scrutinise australia detention network related government security company reach settlement manus island guard piac advocacy group practice effectively creates barrier accessing essential healthcare unlawful disability discrimination asked subcommittee look visit australia october piac principal solicitor camilla pandolfini shouldnt action federal court ensure people access basic essential treatment without trauma indignity handcuffed receive sign receive email story guardian australia every morning australian border force detent

In [18]:
df.to_csv("/Users/henryasiamah/Desktop/work/TM_corpus_final_henry.csv", index=False)