In [40]:
import polars as pl
from bs4 import BeautifulSoup
import html

In [41]:
# data loading
df_test = pl.read_csv("data/test.csv")
df_train = pl.read_csv("data/train.csv")

df_train.head()

Class Index,Title,Description
i64,str,str
3,"""Wall St. Bears Claw Back Into …","""Reuters - Short-sellers, Wall …"
3,"""Carlyle Looks Toward Commercia…","""Reuters - Private investment f…"
3,"""Oil and Economy Cloud Stocks' …","""Reuters - Soaring crude prices…"
3,"""Iraq Halts Oil Exports from Ma…","""Reuters - Authorities have hal…"
3,"""Oil prices soar to all-time re…","""AFP - Tearaway world oil price…"


In [42]:
# Class indices should start from 0 and not from 1
df_train.select("Class Index").unique()

Class Index
i64
3
4
1
2


In [43]:
# creates new column Class
df_train = df_train.with_columns((pl.col("Class Index") - 1).alias("Class"))
df_train.head()

Class Index,Title,Description,Class
i64,str,str,i64
3,"""Wall St. Bears Claw Back Into …","""Reuters - Short-sellers, Wall …",2
3,"""Carlyle Looks Toward Commercia…","""Reuters - Private investment f…",2
3,"""Oil and Economy Cloud Stocks' …","""Reuters - Soaring crude prices…",2
3,"""Iraq Halts Oil Exports from Ma…","""Reuters - Authorities have hal…",2
3,"""Oil prices soar to all-time re…","""AFP - Tearaway world oil price…",2


In [44]:
# Combine "Title" and "Description" into a new column "Text"
df_train = df_train.with_columns((pl.col("Title") + " " + pl.col("Description")).alias("Text"))
df_train.head()

Class Index,Title,Description,Class,Text
i64,str,str,i64,str
3,"""Wall St. Bears Claw Back Into …","""Reuters - Short-sellers, Wall …",2,"""Wall St. Bears Claw Back Into …"
3,"""Carlyle Looks Toward Commercia…","""Reuters - Private investment f…",2,"""Carlyle Looks Toward Commercia…"
3,"""Oil and Economy Cloud Stocks' …","""Reuters - Soaring crude prices…",2,"""Oil and Economy Cloud Stocks' …"
3,"""Iraq Halts Oil Exports from Ma…","""Reuters - Authorities have hal…",2,"""Iraq Halts Oil Exports from Ma…"
3,"""Oil prices soar to all-time re…","""AFP - Tearaway world oil price…",2,"""Oil prices soar to all-time re…"


In [45]:
# Filter rows where "Text" contains "&lt" and select "Description"
filtered = df_train.filter(pl.col("Text").str.contains("&lt")).select("Description")
filtered.head()

Description
str
"""&lt;strong&gt;Opinion&lt;/stro…"
"""&lt;strong&gt;Letters&lt;/stro…"
"""&lt;strong&gt;Poll results&lt;…"
"""&lt;strong&gt;Competition&lt;/…"
""" SEATTLE (Reuters) - Microsoft…"


In [46]:
# Define the HTML cleaning function
def clean_html(text: str) -> str:
    soup = BeautifulSoup(text, "html.parser")
    text = html.unescape(soup.get_text())
    soup = BeautifulSoup(text, "html.parser")
    text = html.unescape(soup.get_text()).strip('\\').strip()
    return text

# Define a lowercasing function
def lowercasing(text: str) -> str:
    return text.lower()

# Apply the cleaning functions
df_train = df_train.with_columns(
    pl.col("Text").map_elements(clean_html, return_dtype=pl.Utf8).map_elements(lowercasing, return_dtype=pl.Utf8).alias("Text")
)
df_train.head()

  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "ht

Class Index,Title,Description,Class,Text
i64,str,str,i64,str
3,"""Wall St. Bears Claw Back Into …","""Reuters - Short-sellers, Wall …",2,"""wall st. bears claw back into …"
3,"""Carlyle Looks Toward Commercia…","""Reuters - Private investment f…",2,"""carlyle looks toward commercia…"
3,"""Oil and Economy Cloud Stocks' …","""Reuters - Soaring crude prices…",2,"""oil and economy cloud stocks' …"
3,"""Iraq Halts Oil Exports from Ma…","""Reuters - Authorities have hal…",2,"""iraq halts oil exports from ma…"
3,"""Oil prices soar to all-time re…","""AFP - Tearaway world oil price…",2,"""oil prices soar to all-time re…"


In [47]:
duplicated_count = df_train.select(pl.col("Text").is_duplicated().sum())
duplicated_count

Text
u32
216


In [65]:
# Print all duplicate rows
duplicates = df_train.filter(pl.col("Text").is_duplicated())
duplicates.head()

Class Index,Title,Description,Class,Text
i64,str,str,i64,str
4,"""Rival Targets Apple's ITunes C…","""AP - For more than a year, App…",3,"""rival targets apple's itunes c…"
4,"""Microsoft readies Host Integra…","""In a continued effort to compe…",3,"""microsoft readies host integra…"
4,"""Microsoft readies Host Integra…","""In a continued effort to compe…",3,"""microsoft readies host integra…"
4,"""IBM adds four-way 550 server t…","""IBM Corp. bolstered the new eS…",3,"""ibm adds four-way 550 server t…"
4,"""IBM adds four-way 550 server t…","""IBM Corp. bolstered the new eS…",3,"""ibm adds four-way 550 server t…"


In [68]:

df_filtered = (
    df_train.group_by("Text", maintain_order=True)
    .agg([
        pl.col("Class").n_unique().alias("class_count"),
        pl.all()
    ])
    .filter(pl.col("class_count") == 1)  # Only keep rows with a single class
    .select([pl.all().exclude("class_count")])  # Drop helper column
)

df_filtered.head()

Text,Class Index,Title,Description,Class
str,list[i64],list[str],list[str],list[i64]
"""wall st. bears claw back into …",[3],"[""Wall St. Bears Claw Back Into the Black (Reuters)""]","[""Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again.""]",[2]
"""carlyle looks toward commercia…",[3],"[""Carlyle Looks Toward Commercial Aerospace (Reuters)""]","[""Reuters - Private investment firm Carlyle Group,\which has a reputation for making well-timed and occasionally\controversial plays in the defense industry, has quietly placed\its bets on another part of the market.""]",[2]
"""oil and economy cloud stocks' …",[3],"[""Oil and Economy Cloud Stocks' Outlook (Reuters)""]","[""Reuters - Soaring crude prices plus worries\about the economy and the outlook for earnings are expected to\hang over the stock market next week during the depth of the\summer doldrums.""]",[2]
"""iraq halts oil exports from ma…",[3],"[""Iraq Halts Oil Exports from Main Southern Pipeline (Reuters)""]","[""Reuters - Authorities have halted oil export\flows from the main pipeline in southern Iraq after\intelligence showed a rebel militia could strike\infrastructure, an oil official said on Saturday.""]",[2]
"""oil prices soar to all-time re…",[3],"[""Oil prices soar to all-time record, posing new menace to US economy (AFP)""]","[""AFP - Tearaway world oil prices, toppling records and straining wallets, present a new economic menace barely three months before the US presidential elections.""]",[2]


In [69]:
# Merge back the original data for deduplication
final_df = (
    df_filtered.group_by(["Text", "Class"], maintain_order=True)
    .agg([])
)

final_df.head()

Text,Class
str,list[i64]
"""wall st. bears claw back into …",[2]
"""carlyle looks toward commercia…",[2]
"""oil and economy cloud stocks' …",[2]
"""iraq halts oil exports from ma…",[2]
"""oil prices soar to all-time re…",[2]


In [70]:
duplicated_count = final_df.select(pl.col("Text").is_duplicated().sum())
duplicated_count

Text
u32
0


In [92]:
import string
import re

def remove_punctuation(text: str) -> str:
    """custom function to remove the punctuation"""
    return text.translate(str.maketrans('', '', string.punctuation))

def clean_agency(text: str) -> str:
    # pattern that deletes city and news agency from the start of the news
    pattern = r'^[A-Z,/\s]+(?:\([A-Z.]+\)\s*-\s*|:\s*)' 
    return re.sub(pattern, '', text, flags=re.IGNORECASE)

def clean_agency2(text: str) -> str:
    pattern = r'^[^-]*-\s*'
    return re.sub(pattern, '', text)

def clean_title(text: str) -> str:
    pattern = r'\s*\([^\)]*\)$'
    return re.sub(pattern, '', text)

def process_duplicates(df: pl.DataFrame) -> pl.DataFrame:
    df_filtered = (
        df.group_by("Text", maintain_order=True)
        .agg([
            pl.col("Class").n_unique().alias("class_count"),
            pl.all()
        ])
        .filter(pl.col("class_count") == 1)  # Only keep rows with a single class
        .select([pl.all().exclude("class_count")])  # Drop helper column
    )
    
    return df_filtered.group_by(["Text", "Class"], maintain_order=True).agg([])

def preprocess_pipeline(df: pl.DataFrame) -> pl.DataFrame:
    df = df.with_columns(
            pl.col("Description").map_elements(clean_agency, return_dtype=pl.Utf8).map_elements(clean_agency2, return_dtype=pl.Utf8),
            pl.col("Title").map_elements(clean_title, return_dtype=pl.Utf8),
            (pl.col("Class Index") - 1).alias("Class"),
            (pl.col("Title") + " " + pl.col("Description")).map_elements(clean_html, return_dtype=pl.Utf8).map_elements(lowercasing, return_dtype=pl.Utf8).map_elements(remove_punctuation, return_dtype=pl.Utf8).alias("Text")
    )
    return process_duplicates(df)

In [93]:
train = preprocess_pipeline(pl.read_csv("data/train.csv"))
test = preprocess_pipeline(pl.read_csv("data/test.csv"))

  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "ht

In [105]:
train = train.with_columns(pl.col("Class").cast(pl.List(pl.String)).list.join("").cast(pl.Int64))
test = test.with_columns(pl.col("Class").cast(pl.List(pl.String)).list.join("").cast(pl.Int64))

In [106]:
train.write_csv("data/preprocessed_train.csv")
test.write_csv("data/preprocessed_test.csv")