In [12]:
import pandas as pd
import numpy as np
import re

In [72]:
df=pd.read_csv('labeled_data.csv')

In [73]:
df["text"]=df["Title"]+ df["Snippet"]
#combine title and snippet into one column

In [74]:
df.drop(columns=['Title', 'Snippet'], inplace=True)
#drop all other columns

In [76]:
label_mapping = {"Medical assistance":0,
    "Shelter request":1,
    "Supplies needed":2,
    "Evacuation support":3,
    "Rescue operations":4,
    "Mental health support":5,
    "Infrastructure repair":6,
    "Animal rescue assistance":7,
    "No assistance needed":8,
    "Authority intervention":9}

df['label'] = df['Label'].map(label_mapping)

In [77]:
print(df.isnull().sum())
#checking if there are empty rows

Label    0
text     4
label    0
dtype: int64


In [78]:
df = df.dropna()
#removing empty rows

In [79]:
df[df.isnull().any(axis=1)]
#checking if we removed them

Unnamed: 0,Label,text,label


In [80]:
#turn into lower case
df['text'] = df['text'].str.lower()

In [81]:
#remove links
df['text'] = df['text'].apply(lambda x: re.sub(r"http\S+|www\S+|https\S+", "", x, flags=re.MULTILINE))

In [82]:
#remove html tags
df['text'] = df['text'].apply(lambda x: re.sub(r"<.*?>", "", x))

In [83]:
#remove non alphanumeric characters
df['text'] = df['text'].apply(lambda x: re.sub(r"[^a-zA-Z0-9\s]", "", x))

In [84]:
df.head()

Unnamed: 0,Label,text,label
0,Authority intervention,north korean troops deploy to russias kursk re...,9
1,No assistance needed,the most important issue is life itselfunfortu...,8
2,No assistance needed,celebrity chef boards the uss tripoli to judge...,8
3,Medical assistance,ksrelief provides medical assistance in lebano...,0
4,Authority intervention,germany wrestles with its arms sales to israel...,9


In [85]:
df.drop(columns='Label',inplace=True)

In [86]:
#remove extra spaces
df['text'] = df['text'].apply(lambda x: re.sub(r"\s+", " ", x).strip())  

In [87]:
df.head()

Unnamed: 0,text,label
0,north korean troops deploy to russias kursk re...,9
1,the most important issue is life itselfunfortu...,8
2,celebrity chef boards the uss tripoli to judge...,8
3,ksrelief provides medical assistance in lebano...,0
4,germany wrestles with its arms sales to israel...,9


In [66]:
import nltk
from nltk.corpus import stopwords
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ENVY\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [88]:
#removing english stopwords like a the and....
stop_words = set(stopwords.words("english"))
df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stop_words]))

In [89]:
df.head()

Unnamed: 0,text,label
0,north korean troops deploy russias kursk regio...,9
1,important issue life itselfunfortunately lot h...,8
2,celebrity chef boards uss tripoli judge navys ...,8
3,ksrelief provides medical assistance lebanon j...,0
4,germany wrestles arms sales israellegal ideolo...,9


In [92]:
# Save the clean labeled dataset to a new CSV file
df.to_csv('clean_labeled_news_dataset.csv', index=False)