In [1]:
# import pandas and re 
import pandas as pd
import re

In [2]:
# load the raw training split into a DataFrame
df = pd.read_csv(filepath_or_buffer='datasets/train.csv')

In [3]:
# work on an independent (deep) copy so the loaded frame `df` stays untouched
dup_df = df.copy(deep=True)

In [5]:
# Regular-expression patterns (plain strings handed to `re.sub`) describing
# boilerplate to strip from the `article` text.

# remove the news reporter byline
# example: 'By . Daily Mail Reporter .'
# The pattern matches 'By', a period, and then lazily runs on through two more
# periods, i.e. it consumes one period-terminated segment beyond the example above.
# NOTE(review): presumably that extra segment is the 'PUBLISHED: .' label; confirm against the data.
remove_news_stamp = r'\s*By\s+\.[^.]*?\.[^.]*?\.\s*'

# remove the string of published and updated time stamps
# example: '14:11 EST, 25 October 2013 . | . UPDATED: . 15:36 EST, 25 October 2013 .'
# Both times must have a two-digit hour (\d{2}:\d{2}) for the pattern to match.
remove_update_stamp = r'\d{2}:\d{2}\s+[A-Z]{3},\s+\d{1,2}\s+[A-Za-z]+\s+\d{4}\s+\.\s*\|\s*\.\s+UPDATED:\s+\.\s*\d{2}:\d{2}\s+[A-Z]{3},\s+\d{1,2}\s+[A-Za-z]+\s+\d{4}\s+\.\s*'

# remove the "last updated" date and time string (everything up to the next period)
# example: Last updated at 3:31 PM on 19th July 2011 . 
remove_last_updated = r'Last updated.*?\.'

# remove parentheses and anything inside them (shortest match, single line)
# example: (CNN), (left), (right)
remove_parenthesis = r'\(.*?\)'

# match the non-breaking space character '\xa0' in strings
# (the raw string is passed to `re`, which reads \xa0 as a hex escape)
# example: 'pictured is\xa0Lake Placid Lodge' or 'the\xa0Echo Valley Ranch & Spa, Canada offers ultimate serenity'
remove_ax0 = r'\xa0'

# match a double dash plus any whitespace that follows it
remove_double_dash = r'\--\s*'

# match a single dash that is preceded by whitespace (trailing whitespace is optional)
remove_dash = r'\s+\-\s*'

# match square brackets and anything inside them, including the whitespace before them
remove_brackets= r'\s+\[.*?\]\s*'

# match stray period marks that have whitespace on both sides
# example: 'It includes . carved Buddha statues, has its own yoga hall, swimming pool, hot tub . and ‘chill-out’ 
# area with hammocks – ideal for a Prime\xa0 Minister who . reputedly has a taste for ‘chillaxing’.'
# NOTE(review): no substitution visible in this notebook uses this pattern; verify whether it is still needed.
remove_junk_period = r'\s+\.\s+'

# match the video teaser, including the whitespace that precedes it
# example: Scroll down for video .
remove_video = r'\s+Scroll down for video \.'

In [6]:
# strip the reporter byline from every article
dup_df['article'] = dup_df['article'].map(
    lambda text: re.sub(remove_news_stamp, '', text)
)

In [7]:
# strip the published/updated time stamps from every article
dup_df['article'] = dup_df['article'].map(
    lambda text: re.sub(remove_update_stamp, '', text)
)

In [8]:
# strip the 'Last updated ...' sentence from every article
dup_df['article'] = dup_df['article'].map(
    lambda text: re.sub(remove_last_updated, '', text)
)

In [9]:
# strip parenthesised text from every article
dup_df['article'] = dup_df['article'].map(
    lambda text: re.sub(remove_parenthesis, '', text)
)

In [10]:
# strip double dashes (and the whitespace after them) from every article
dup_df['article'] = dup_df['article'].map(
    lambda text: re.sub(remove_double_dash, '', text)
)

In [11]:
# turn whitespace-led single dashes into a single space
dup_df['article'] = dup_df['article'].map(
    lambda text: re.sub(remove_dash, ' ', text)
)

In [12]:
# strip bracketed text from every article
dup_df['article'] = dup_df['article'].map(
    lambda text: re.sub(remove_brackets, '', text)
)

In [13]:
# swap every non-breaking space for a regular space
dup_df['article'] = dup_df['article'].map(
    lambda text: re.sub(remove_ax0, ' ', text)
)

In [14]:
# strip the 'Scroll down for video .' teaser from every article
dup_df['article'] = dup_df['article'].map(
    lambda text: re.sub(remove_video, '', text)
)

In [16]:
# keep only the articles that contain no stray ' . ' sequence
dup_df = dup_df.loc[~dup_df['article'].str.contains(r' \. ', regex=True)]

In [17]:
# number of articles left after filtering
dup_df.shape[0]

44553

In [31]:
# renumber the rows 0..n-1 after the filtering above
dup_df = dup_df.reset_index(drop=True)


In [19]:
from transformers import BartTokenizer
# tokenizer for the 'facebook/bart-base' checkpoint, used below to measure article length
tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')

In [20]:
# Tokenize a text and report how many token ids it produces
def count_tokens(text):
    """Return the length of the id sequence `tokenizer.encode` yields for `text`."""
    return len(tokenizer.encode(text))


In [21]:
# Store each article's token count in a new 'token_count' column
dup_df['token_count'] = dup_df['article'].map(count_tokens)

In [22]:
# keep the articles whose token count is at most 1024
under_1024_tokens = dup_df.loc[dup_df['token_count'].le(1024)]

In [23]:
# number of articles within the token limit
under_1024_tokens.shape[0]

37880

In [33]:
# renumber the rows 0..n-1 after the token-length filter
under_1024_tokens = under_1024_tokens.reset_index(drop=True)

In [41]:
# preview the first five rows
under_1024_tokens.head(n=5)

Unnamed: 0,id,article,highlights,token_count
0,0002095e55fcbd3a2f366d9bf92a95433dc305ef,Ralph Mata was an internal affairs lieutenant...,Criminal complaint: Cop used his role to help ...,461
1,0002c17436637c4fe1837c935c04de47adb18e9a,With a breezy sweep of his pen President Vlad...,Nina dos Santos says Europe must be ready to a...,627
2,000e009f6b1d954d827c9a550f3f24a5474ee82b,One can hardly read the news these days witho...,U.S. corporations merge with foreign companies...,914
3,001be24b2db1c04f62386f98997fee725c5fd2fb,"Kabul, Afghanistan China's top security offic...",China's top security official visited Afghanis...,238
4,001f9c554f1a29169413d0d2f138212a14c6dcf1,A University of Wisconsin senior official res...,John Chadima resigned after the allegations su...,366


In [42]:
# independent (deep) copy used while cleaning the `highlights` column
highlights_cleaning_df = under_1024_tokens.copy(deep=True)

In [162]:
### Highlights column ###

In [44]:
# the 'id' column is not needed from here on
highlights_cleaning_df = highlights_cleaning_df.drop(columns='id')

In [47]:
# Regular-expression patterns (plain strings handed to `re.sub`) used to clean
# the `highlights` text.

# remove the whitespace between the end of a sentence and its period
# example: 'will hurt both sides .' -> 'will hurt both sides.'
period_whitespace = r'(?<=\S)\s+(?=\.)'

# match runs of line breaks
# example: 'will hurt both sides .\nTargeting Russia's business'
remove_next_line = r'\n+'

# remove NEW-stamped text in the highlights: the stamp and everything up to
# and including the first period (plus trailing whitespace)
# example: 'NEW: Police make an arrest. Authorities say Matthew Flugence...'
# The word boundaries on both sides keep the pattern from firing inside longer
# words such as 'RENEW'; the previous 'NEW+' form also carried a stray quantifier.
# NOTE(review): the match ends at the FIRST period, so an abbreviation such as
# 'U.S.' inside the stamped sentence cuts the removal short; confirm intent.
news_stamp = r'\bNEW\b.*?\.\s*'

# remove 'Criminal complaint'-stamped text in the highlights, same rule as above
# example: 'Criminal complaint: Cop used his role to help cocaine traffickers . Ralph Mata'
criminal_complaint_stamp = r'\bCriminal complaint\b.*?\.\s*'

In [48]:
# close the gap between the last word of a sentence and its period
highlights_cleaning_df['highlights'] = highlights_cleaning_df['highlights'].map(
    lambda text: re.sub(period_whitespace, '', text)
)

In [49]:
# replace each run of line breaks with a single space
highlights_cleaning_df['highlights'] = highlights_cleaning_df['highlights'].map(
    lambda text: re.sub(remove_next_line, ' ', text)
)

In [51]:
# strip text matched by the NEW-stamp pattern
highlights_cleaning_df['highlights'] = highlights_cleaning_df['highlights'].map(
    lambda text: re.sub(news_stamp, '', text)
)

In [52]:
# strip text matched by the 'Criminal complaint' stamp pattern
highlights_cleaning_df['highlights'] = highlights_cleaning_df['highlights'].map(
    lambda text: re.sub(criminal_complaint_stamp, '', text)
)

In [53]:
# trim leading whitespace from both text columns
for text_column in ('highlights', 'article'):
    highlights_cleaning_df[text_column] = highlights_cleaning_df[text_column].str.lstrip()

In [None]:
### Split into training, validation, and test sets ###

In [55]:
# number of cleaned rows available for splitting
highlights_cleaning_df.shape[0]

37880

In [57]:
# draw 6000 rows at random (fixed seed) to serve as the validation set
validate_df = highlights_cleaning_df.sample(random_state=56, n=6000)

In [59]:
# take the validation rows out of the remaining (training) data
highlights_cleaning_df = highlights_cleaning_df.drop(index=validate_df.index)

In [63]:
# draw 2000 rows at random (fixed seed) to serve as the test set
testing_df = highlights_cleaning_df.sample(random_state=56, n=2000)

In [64]:
# take the test rows out of the remaining (training) data
highlights_cleaning_df = highlights_cleaning_df.drop(index=testing_df.index)

In [65]:
# renumber the rows of all three splits from 0
highlights_cleaning_df = highlights_cleaning_df.reset_index(drop=True)
testing_df = testing_df.reset_index(drop=True)
validate_df = validate_df.reset_index(drop=True)

In [None]:
# save to CSV files

In [66]:
# write the training split (header row kept by default, no index column)
highlights_cleaning_df.to_csv('training.csv', index=False)

In [67]:
# write the validation split (header row kept by default, no index column)
validate_df.to_csv('validating.csv', index=False)

In [68]:
# write the test split (header row kept by default, no index column)
testing_df.to_csv('testing.csv', index=False)