In [1]:
import pandas as pd

# Load the raw articles spreadsheet into a DataFrame.
df = pd.read_excel("Articles.xlsx")

# NOTE: a cell only renders its last expression, so this preview is evaluated
# but not displayed; the visible output of this cell is the column index.
df.head()
df.columns


Index(['Article', 'Date', 'Heading', 'NewsType', 'AH', 'year', 'month', 'day',
       'sentiment', 'emotion', 'Unnamed: 10', 'Location'],
      dtype='object')

In [2]:
# Discard the stray "Unnamed: 10" column. errors="ignore" makes this a no-op
# when the column is absent, so the cell can be re-run safely.
df = df.drop("Unnamed: 10", axis="columns", errors="ignore")

In [3]:
import re

def clean_text(text):
    """Normalise a raw text cell into lowercase, letters-only, single-spaced text.

    Args:
        text: Any scalar cell value. Missing values (None/NaN) are accepted;
            everything else is converted with ``str``.

    Returns:
        The cleaned string, or ``""`` when the input is missing.
    """
    if pd.isna(text):
        return ""

    cleaned = str(text).lower()
    # First blank out URLs, then every character that is neither a letter nor
    # whitespace. Order matters: URLs must go before their punctuation does.
    for pattern in (r"http\S+", r"[^a-zA-Z\s]"):
        cleaned = re.sub(pattern, " ", cleaned)

    # Collapse the whitespace runs left behind by the substitutions.
    return re.sub(r"\s+", " ", cleaned).strip()

# Add a cleaned companion column for each free-text column.
for raw_col, clean_col in (("Article", "article_clean"), ("Heading", "heading_clean")):
    df[clean_col] = df[raw_col].apply(clean_text)

In [4]:
# Parse the Date column; errors="coerce" turns unparseable values into NaT
# instead of raising.
df["Date"] = pd.to_datetime(df["Date"], errors="coerce")

# Derive the calendar parts from the parsed dates, replacing the year/month/day
# columns that came with the file. The nullable Int64 dtype keeps them integral
# when some dates are NaT; plain .dt.year would upcast the whole column to
# float and serialise as e.g. 2015.0.
df["year"] = df["Date"].dt.year.astype("Int64")
df["month"] = df["Date"].dt.month.astype("Int64")
df["day"] = df["Date"].dt.day.astype("Int64")

In [5]:
# Copy the reported location into its own column, marking missing values
# explicitly as "UNKNOWN".
df["location_reported"] = df["Location"].fillna("UNKNOWN")

In [6]:
!pip install spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.8/12.8 MB 1.6 MB/s eta 0:00:00
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')
⚠ Restart to reload dependencies
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [7]:
import spacy
# Load the spaCy pipeline once at module level; extract_location reuses this
# shared `nlp` object for every row.
nlp = spacy.load("en_core_web_sm")

def extract_location(text):
    """Return the first place name spaCy finds in ``text``.

    Runs the module-level ``nlp`` pipeline and picks the first entity labelled
    ``GPE`` or ``LOC``.

    Args:
        text: Text to scan. Missing values (None/NaN) are accepted, and other
            non-string values are converted with ``str``.

    Returns:
        The text of the first matching entity, or ``"UNKNOWN"`` when the input
        is missing or blank, or when no such entity is found.
    """
    if not isinstance(text, str):
        # spaCy only accepts strings; a NaN cell coming from pandas would make
        # nlp() raise, so treat missing values as "no location".
        if pd.isna(text):
            return "UNKNOWN"
        text = str(text)
    if not text.strip():
        # Nothing to analyse, so skip the model call entirely.
        return "UNKNOWN"

    doc = nlp(text)
    for ent in doc.ents:
        if ent.label_ in ("GPE", "LOC"):
            return ent.text
    return "UNKNOWN"

# Article cells can be missing (clean_text guards against that too) and spaCy
# raises on non-string input, so normalise every cell to a string before NER.
# Missing articles become "" and therefore resolve to "UNKNOWN".
df["location_extracted"] = df["Article"].fillna("").astype(str).apply(extract_location)

In [8]:
# Write the cleaned frame to CSV without the index column.
# NOTE(review): the last cell of the notebook writes to this same path again,
# after location_extracted has been recomputed from the headings.
df.to_csv("news_clean.csv", index=False)

In [9]:
# Re-run location extraction on the headings instead of the article bodies.
# NOTE(review): this replaces the article-based location_extracted values
# computed earlier; confirm that is intended, or keep them in a separate column.
# astype(str) makes every cell a string first (presumably a missing heading
# becomes the literal text "nan" — verify against the data).
df["location_extracted"] = df["Heading"].astype(str).apply(extract_location)

In [10]:
# Persist the final frame (now with heading-based location_extracted) to CSV
# without the index column, overwriting the file written earlier.
df.to_csv("news_clean.csv", index=False)