In [1]:
from collections import Counter
import pandas as pd
import re

## Merging the other_dataset and the Guardian Dataset.

In [2]:
# Load both files. 
df1 = pd.read_csv("/Users/henryasiamah/Desktop/work/new_other_dataset.csv")
df2 = pd.read_csv("/Users/henryasiamah/Desktop/work/filtered_guardian_news.csv")

# Rename Columns in df2 to match columns in df1.
df2 = df2.rename(columns={'publication_date_fixed': 'Date', 'webTitle': 'Headline', 'body': 'Content', 'News Paper Source': 'Source'})

# Add a new column to df2: Country/Organization and set it to UK
df2["Country/Organization"] = 'UK'

# Select columns in df1 that matches desired columns in df2.
df1 = df1[['Date', 'Headline', 'Content', 'Source', 'Country/Organization']]

#load df2
df2 = df2[['Date', 'Headline', 'Content', 'Source', 'Country/Organization']]

#Concatenate the two dataframes vertically.
final_df = pd.concat([df1, df2])

#Convert the 'Date' column to datetime format.
final_df['Date'] = pd.to_datetime(final_df['Date'], errors='coerce')

#print the original number of rows and columns for df1 and df2
print("Original number of rows and columns for df1:")
print(df1.shape)
print("Original number of rows and columns for df2:")
print(df2.shape)

#print the number of rows and columns for the final_df or merged dataset.
print("Number of rows and columns for the final dataframe")
print(final_df.shape)

#fill missing values for empty string
final_df.fillna('', inplace=True)

Original number of rows and columns for df1:
(2488, 5)
Original number of rows and columns for df2:
(2967, 5)
Number of rows and columns for the final dataframe
(5455, 5)


In [3]:
final_df.head()

Unnamed: 0,Date,Headline,Content,Source,Country/Organization
0,2024-06-07,www.bbc.com,How many people cross the Channel in small boa...,BBC,UK
1,2024-06-07,www.bbc.com,Far right eyes Europe vote surge and ditches G...,BBC,UK
2,2024-06-07,www.bbc.com,Greek court throws out shipwreck trial against...,BBC,UK
3,2024-06-07,www.bbc.com,Migrants cross Channel for 10th consecutive da...,BBC,UK
4,2024-06-07,www.bbc.com,More than 700 people arrive by small boats in ...,BBC,UK


In [4]:
final_df.tail()

Unnamed: 0,Date,Headline,Content,Source,Country/Organization
2962,2021-04-22,Orange-bellied parrots leave Tasmania in bigge...,<p>Nearly 200 endangered orange-bellied parrot...,The Guardian,UK
2963,2021-04-21,Melting ice in Arctic linked to bowhead whales...,"<p>As the ice melts at pace in the Arctic, the...",The Guardian,UK
2964,2021-03-01,A birder's calendar: where and when to watch A...,"<p>During Covid lockdown, birds in our backyar...",The Guardian,UK
2965,2021-01-15,"Air pollution will lead to mass migration, say...",<p>Air pollution does not respect national bou...,The Guardian,UK
2966,2021-01-17,In brief: Passing: An Alternative History of I...,"<h2><strong><a href=""https://guardianbookshop....",The Guardian,UK


In [5]:
final_df.count()

Date                    5455
Headline                5455
Content                 5455
Source                  5455
Country/Organization    5455
dtype: int64

In [6]:
final_df.to_csv("/Users/henryasiamah/Desktop/work/final_df.csv", index = False)

# Text Cleaning

In [7]:
df = final_df

In [8]:
df.duplicated(subset=['Content']).sum()

166

In [9]:
df.drop_duplicates(subset=['Content'], inplace=True)

In [10]:
# Remove rows with NaN values in 'Cotent' column
df.dropna(subset=['Content'], inplace=True)

# Remove rows with 'NaN' values in 'Content'column
df = df[~df['Content'].isin(['NaN'])]

#print the cleaned dataset
print('The cleaned dataframe ahs {} rows and {} columns'.format(df.shape[0], df.shape[1]))

The cleaned dataframe ahs 5289 rows and 5 columns


In [11]:
df.head(10)

Unnamed: 0,Date,Headline,Content,Source,Country/Organization
0,2024-06-07,www.bbc.com,How many people cross the Channel in small boa...,BBC,UK
1,2024-06-07,www.bbc.com,Far right eyes Europe vote surge and ditches G...,BBC,UK
2,2024-06-07,www.bbc.com,Greek court throws out shipwreck trial against...,BBC,UK
3,2024-06-07,www.bbc.com,Migrants cross Channel for 10th consecutive da...,BBC,UK
4,2024-06-07,www.bbc.com,More than 700 people arrive by small boats in ...,BBC,UK
5,2024-06-07,www.bbc.com,"Small boats migrant arrivals top 7,500 this ye...",BBC,UK
6,2024-06-07,www.bbc.com,Fourth man arrested in Channel deaths investig...,BBC,UK
7,2024-06-07,www.bbc.com,Migrants: TUV distances itself from Reform UK ...,BBC,UK
8,2024-06-07,www.bbc.com,Two males in court over Channel deaths investi...,BBC,UK
9,2024-10-08,www.thelocal.fr,"Asylum seekers, immigrants, refugees, foreigne...",The Local (France),France


In [12]:
df.tail(10)

Unnamed: 0,Date,Headline,Content,Source,Country/Organization
2957,2021-01-11,'There is no noise': inside the controversial ...,"<p>I wanted to come here. No one forced me, an...",The Guardian,UK
2958,2021-06-26,Ancestors by Alice Roberts review – a story of...,"<p>In 2002, not far from Amesbury in southern ...",The Guardian,UK
2959,2021-02-01,Hong Kong migration agents report rush of inqu...,<p>Migration agents in Hong Kong say they have...,The Guardian,UK
2960,2021-04-12,‘My son could die’: the disabled Syrian refug...,"<p>In January, the <a href=""https://www.thegua...",The Guardian,UK
2961,2021-02-04,Migration firm investigated over ads promising...,<p>An international migration firm is being in...,The Guardian,UK
2962,2021-04-22,Orange-bellied parrots leave Tasmania in bigge...,<p>Nearly 200 endangered orange-bellied parrot...,The Guardian,UK
2963,2021-04-21,Melting ice in Arctic linked to bowhead whales...,"<p>As the ice melts at pace in the Arctic, the...",The Guardian,UK
2964,2021-03-01,A birder's calendar: where and when to watch A...,"<p>During Covid lockdown, birds in our backyar...",The Guardian,UK
2965,2021-01-15,"Air pollution will lead to mass migration, say...",<p>Air pollution does not respect national bou...,The Guardian,UK
2966,2021-01-17,In brief: Passing: An Alternative History of I...,"<h2><strong><a href=""https://guardianbookshop....",The Guardian,UK


## Most Common Words

Creating an overview of the most common words in the corpus.

In [15]:
# Concatenate all the strings in the 'Content' column into a single string
all_text = ' '.join(df['Content'].values)

# Use the counter class from the collections module to count the occurences of each word
word_counts = Counter(all_text.split())

# Get the 1000 most common words and their counts
most_common = word_counts.most_common(10000)

# Convert the results to a Pandas DataFrame for easier manipulation and printing
most_common_df = pd.DataFrame(most_common, columns=['Word', 'Count'])

# Print the DataFrame with the most common words and their counts in descending order
print(most_common_df.sort_values('Count', ascending=False))

            Word   Count
0            the  232039
1             to  141965
2             of  111019
3            and  100862
4             in   89933
...          ...     ...
9980      namely      30
9979  favourable      30
9978   disaster.      30
9977   Democracy      30
9999           à      30

[10000 rows x 2 columns]


In [16]:
most_common_df.to_csv("/Users/henryasiamah/Desktop/work/most_common_words.csv", index = False)

In [17]:
#Define a function to clean the text. 
def clean_text(text):
    
    # Remove inline frame element - REMOVE
    text = re.sub(r'', '', text, flags=re.DOTALL)

     # Remove specified text
    patterns = r'\b(p|dataatomidbddcbffdabfcfcfbe|href|blockquote|span|class|element|element-image_caption|element-image_credit|figcaption|liststrong|Photo/Rodrigo|ADVERTISEMENT|element--supporting|Photo/Evgeniy|element-rich-link|copyright|element--showcase|AD-FREE,|meterActive/meterExpired|Photo/Vadim|VideoWhy|/PRNewswire/|itemV1|data-interactive|element-interactive|allowfullscreen="true"|data-atom-type="media"|element-atom|gu-atom|class="twitter-tweet"p|auto-generated|class="timezone"BST/span/time|class="quoted|class="gu-image"|class="/figcaption"|class="timezone"GMT/span/time|class="block"|data-block-contributor=""|class="block-elements"|class="element|href)\b'
    text = re.sub(patterns, '', text)
    
    # Remove 
    text = re.sub(r'', '', text)
    text = re.sub(r'', '', text)
    text = re.sub(r'', '', text)
    text = re.sub(r'/p', '', text)

    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    
    # Remove '@' symbols
    text = re.sub(r'@', '', text)
    
    # Remove links
    text = re.sub(r'http\S+', '', text)
    
    # Remove emojis
    text = text.encode('ascii', 'ignore').decode('ascii')
    
   
    return text 

In [18]:
df['cleaned_corpus'] = df['Content'].apply(clean_text)

In [19]:
df.head()

Unnamed: 0,Date,Headline,Content,Source,Country/Organization,cleaned_corpus
0,2024-06-07,www.bbc.com,How many people cross the Channel in small boa...,BBC,UK,How many people cross the Channel in small boa...
1,2024-06-07,www.bbc.com,Far right eyes Europe vote surge and ditches G...,BBC,UK,Far right eyes Europe vote surge and ditches G...
2,2024-06-07,www.bbc.com,Greek court throws out shipwreck trial against...,BBC,UK,Greek court throws out shipwreck trial against...
3,2024-06-07,www.bbc.com,Migrants cross Channel for 10th consecutive da...,BBC,UK,Migrants cross Channel for 10th consecutive da...
4,2024-06-07,www.bbc.com,More than 700 people arrive by small boats in ...,BBC,UK,More than 700 people arrive by small boats in ...


In [20]:
df.tail()

Unnamed: 0,Date,Headline,Content,Source,Country/Organization,cleaned_corpus
2962,2021-04-22,Orange-bellied parrots leave Tasmania in bigge...,<p>Nearly 200 endangered orange-bellied parrot...,The Guardian,UK,Nearly 200 endangered orange-bellied parrots h...
2963,2021-04-21,Melting ice in Arctic linked to bowhead whales...,"<p>As the ice melts at pace in the Arctic, the...",The Guardian,UK,"As the ice melts at pace in the Arctic, the mi..."
2964,2021-03-01,A birder's calendar: where and when to watch A...,"<p>During Covid lockdown, birds in our backyar...",The Guardian,UK,"During Covid lockdown, birds in our backyards ..."
2965,2021-01-15,"Air pollution will lead to mass migration, say...",<p>Air pollution does not respect national bou...,The Guardian,UK,Air pollution does not respect national bounda...
2966,2021-01-17,In brief: Passing: An Alternative History of I...,"<h2><strong><a href=""https://guardianbookshop....",The Guardian,UK,Passing: An Alternative History of IdentityLip...


In [21]:
df.to_csv("/Users/henryasiamah/Desktop/work/cleaned.csv", index=False)