Importing the article data from Google Drive

In [None]:
from google.colab import drive
import pandas as pd

In [None]:
# Mount Google Drive at /content/drive so files stored there can be accessed by path
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
import json

In [None]:
# Google Drive folder that holds the raw article .json files
json_folder_path = '/content/drive/MyDrive/Capstone Project/data/raw'

In [None]:
# Accumulator for the parsed JSON objects, one entry per article file
articles_data = []

In [None]:
# Load every .json file in the raw-data folder into articles_data.
# The list is emptied first so that re-running this cell does not append the
# same articles a second time.
articles_data.clear()

# os.listdir() returns names in arbitrary order; sorting makes the row order
# (and therefore which article is "first") reproducible between runs.
for filename in sorted(os.listdir(json_folder_path)):
    if filename.endswith('.json'):
        file_path = os.path.join(json_folder_path, filename)
        # Decode explicitly as UTF-8 instead of relying on the platform default;
        # the articles contain non-ASCII characters such as curly apostrophes.
        with open(file_path, 'r', encoding='utf-8') as file:
            article = json.load(file)
        articles_data.append(article)

In [None]:
# Convert the list of dictionaries to a DataFrame (each list element becomes one row)
df = pd.DataFrame(articles_data)

In [None]:
# Preview the first rows of the loaded articles
df.head()

Unnamed: 0,title,meta_title,description,meta_description,text_headers,text
0,The symbols of Poland’s abortion protests expl...,,,,The symbols of Poland’s abortion protests expl...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nThe symbol...
1,Impending Threat of Abortion Criminalization B...,,As the Supreme Court nears a ruling that could...,,\n\t\t\t\tImpending Threat of Abortion Crimina...,\n\n\n\n\n\n\n\nImpending Threat of Abortion C...
2,New lawsuits target state restrictions on abor...,,Supporters of abortion rights have filed separ...,,New lawsuits target state restrictions on abor...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...
3,Indiana’s near-total abortion ban leads doctor...,,Indianas near-total abortion ban is preventing...,,"September 1, 2023\nIndiana’s near-total aborti...",\n\n\n \nIndiana’s near-total abortion ban lea...
4,Legal challenge to Texas abortion law official...,,The court ended a legal challenge to the Texas...,The court ended a legal challenge to the Texas...,Federal appeals court ends legal challenge to ...,Legal challenge to Texas abortion law official...


In [None]:
# Extract necessary columns.
# .copy() yields an independent frame, so later writes to it do not raise
# pandas' SettingWithCopyWarning.
df = df[['title', 'text']].copy()

In [None]:
# Drop any rows with missing text.
# Reassigning instead of using inplace=True avoids modifying a sliced frame in
# place, and the index is renumbered so label 0 still refers to the first
# remaining article.
df = df.dropna(subset=['text']).reset_index(drop=True)

In [None]:
# Preview the reduced frame (title and text columns)
df.head()

Unnamed: 0,title,text
0,The symbols of Poland’s abortion protests expl...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nThe symbol...
1,Impending Threat of Abortion Criminalization B...,\n\n\n\n\n\n\n\nImpending Threat of Abortion C...
2,New lawsuits target state restrictions on abor...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...
3,Indiana’s near-total abortion ban leads doctor...,\n\n\n \nIndiana’s near-total abortion ban lea...
4,Legal challenge to Texas abortion law official...,Legal challenge to Texas abortion law official...


In [None]:
import re

# Take the first article in the df['text'] column.
# Positional .iloc is used because index label 0 is not guaranteed to exist
# once rows with missing text have been dropped.
sample_article = df['text'].iloc[0]

In [None]:
# Display the raw text of the sample article
sample_article



In [None]:
!pip install contractions
!pip install pyspellchecker



In [None]:
import os
import re
import pandas as pd
from bs4 import BeautifulSoup
import contractions
from spellchecker import SpellChecker

# Initialize the spellchecker
# NOTE(review): `spell` is not referenced by any cleaning step visible in this notebook — confirm it is still needed.
spell = SpellChecker()

# Define individual preprocessing steps

def remove_html_tags(text):
    """Strip HTML markup from ``text`` and return the remaining plain text.

    The input is parsed with BeautifulSoup using the built-in ``html.parser``.
    A ``None`` input yields an empty string.
    """
    if text is not None:
        return BeautifulSoup(text, "html.parser").get_text()
    return ""

def remove_urls(text):
    """Remove URLs from the text.

    Two forms are removed, each up to the next whitespace character:

    * anything starting with ``http://`` or ``https://`` (case-insensitive),
      even when it is glued to the preceding word;
    * host-style tokens starting with ``www.`` (or ``www2.`` etc.) at a word
      boundary.

    Ordinary words that merely contain the letters "http" or "www"
    (e.g. "httpd", "Awwwesome") are left untouched.

    Args:
        text: The string to clean, or None.

    Returns:
        The text with URLs deleted; "" when ``text`` is None. Surrounding
        whitespace is not collapsed here.
    """
    if text is None:
        return ""
    # Require the "://" or the "www." so that only real URLs match.
    return re.sub(r'https?://\S+|\bwww\d*\.\S+', '', text, flags=re.IGNORECASE)

#def to_lowercase(text):
#    """Convert the text to lowercase."""
#    if text is None:
#        return ""
#    return text.lower()

#def expand_contractions(text):
#    """Expand contractions (e.g., don't -> do not)."""
#    if text is None:
#        return ""
#    return contractions.fix(text)

def reduce_excessive_punctuation(text):
    """Collapse a run of one repeated mark (``.``, ``!`` or ``?``) to a single mark.

    For example ``"Wow!!!!"`` becomes ``"Wow!"``. Only runs of the *same*
    character are collapsed; mixed sequences such as ``"?!"`` are kept.
    A ``None`` input yields an empty string.
    """
    if text is None:
        return ""
    repeated_mark = re.compile(r'([.!?])\1+')
    # Replace each run with the single character captured at its start.
    return repeated_mark.sub(lambda run: run.group(1), text)

# Typographic punctuation mapped to the ASCII characters that the cleaner keeps.
_TYPOGRAPHIC_TO_ASCII = str.maketrans({
    '\u2018': "'", '\u2019': "'", '\u201b': "'", '\u02bc': "'",   # single quotes / apostrophes
    '\u201c': '"', '\u201d': '"', '\u201e': '"',                  # double quotes
    '\u2010': '-', '\u2011': '-', '\u2012': '-',                  # hyphens, figure dash
    '\u2013': '-', '\u2014': '-', '\u2015': '-',                  # en dash, em dash, bar
})


def remove_special_characters(text):
    """Remove special characters, keeping only letters, numbers, spaces, and some punctuation.

    The kept set is ASCII letters and digits, whitespace, and ``, . ! ? ' " -``.
    Before stripping, two normalizations are applied so that meaningful
    characters are converted rather than silently deleted:

    * curly quotes/apostrophes and Unicode dashes become their ASCII
      equivalents (otherwise "Poland’s" would collapse to "Polands" and
      "word—word" to "wordword");
    * accented letters are decomposed (NFD) so the base letter survives,
      e.g. "café" -> "cafe". Letters without a canonical decomposition
      are still removed.

    Args:
        text: The string to clean, or None.

    Returns:
        The cleaned string; "" when ``text`` is None.
    """
    if text is None:
        return ""
    # Local import keeps this function self-contained within the notebook cell.
    import unicodedata

    text = text.translate(_TYPOGRAPHIC_TO_ASCII)
    # NFD splits "é" into "e" + combining accent; the accent is then stripped below.
    text = unicodedata.normalize('NFD', text)
    return re.sub(r'[^A-Za-z0-9\s,.!?\'\"-]', '', text)

def normalize_whitespace(text):
    """Collapse every run of whitespace (spaces, tabs, newlines) into one space.

    Leading and trailing whitespace is removed from the result.
    A ``None`` input yields an empty string.
    """
    if text is None:
        return ""
    # Splitting on whitespace runs and re-joining with a single space is
    # equivalent to substituting each run with one space.
    pieces = re.split(r'\s+', text)
    return ' '.join(pieces).strip()

def remove_known_headers_and_footers(text):
    """Remove known boilerplate header/footer lines from the text.

    A line is blanked only when it consists solely of one of the known
    phrases (case-insensitive), optionally surrounded by spaces/tabs and
    followed by a single ``:``, ``.`` or ``!``. The line break itself is kept.

    Matching the whole line, rather than just a prefix, prevents real
    sentences from being mangled (e.g. "Advertisements are banned" is no
    longer turned into "s are banned") and also catches indented
    boilerplate lines.

    Args:
        text: The string to clean, or None.

    Returns:
        The text with matching lines emptied; "" when ``text`` is None.
    """
    if text is None:
        return ""
    known_phrases = [
        r'share this article',            # "Share this article" header
        r'related articles',              # "Related articles" section
        r'advertisement',                 # "Advertisement" sections
        r'subscribe to our newsletter',   # "Subscribe" footer
    ]

    for phrase in known_phrases:
        # ^...$ with MULTILINE anchors the phrase to a complete line.
        line_pattern = r'^[ \t]*(?:' + phrase + r')[ \t\r]*[:.!]?[ \t\r]*$'
        text = re.sub(line_pattern, '', text, flags=re.IGNORECASE | re.MULTILINE)

    return text

def remove_repetitive_lines(text):
    """Drop blank and repeated lines, then join the remaining lines with spaces.

    Each line is stripped of surrounding whitespace. A line is kept only the
    first time it occurs anywhere in the text (repeats are removed even when
    they are not adjacent). The surviving lines are returned as one
    space-separated string. A ``None`` input yields an empty string.
    """
    if text is None:
        return ""
    stripped_lines = (raw.strip() for raw in text.split('\n'))
    # dict keys keep insertion order, giving the distinct non-empty lines
    # in the order they were first seen.
    unique_lines = dict.fromkeys(entry for entry in stripped_lines if entry)
    return ' '.join(unique_lines)

# Full cleaning pipeline assembled from the individual steps
def clean_article_text(text):
    """Run one article through every active cleaning step and return the result.

    The steps run in this order: HTML tag removal, URL removal, punctuation
    reduction, special-character removal, known header/footer removal,
    repeated-line removal, whitespace normalization. Lower-casing and
    contraction expansion are not part of the pipeline.
    """
    steps = (
        remove_html_tags,
        remove_urls,
        reduce_excessive_punctuation,
        remove_special_characters,
        remove_known_headers_and_footers,
        remove_repetitive_lines,
        normalize_whitespace,
    )
    # Feed the output of each step into the next one.
    for step in steps:
        text = step(text)
    return text

# Applying the pipeline to a single article (for testing):
# clean the sample article and print the cleaned text
cleaned_sample_article = clean_article_text(sample_article)
print(cleaned_sample_article)




In [None]:
# Applying the pipeline to the entire DataFrame
def clean_articles_in_df(df):
    """Apply the text cleaning pipeline to all articles in the DataFrame.

    Args:
        df: DataFrame with a 'text' column holding the raw article text.

    Returns:
        A new DataFrame containing all columns of ``df`` plus a
        'cleaned_article' column with the cleaned text. The input frame is
        left unmodified, which avoids SettingWithCopyWarning when ``df`` is
        itself a slice of another frame.
    """
    return df.assign(cleaned_article=df['text'].apply(clean_article_text))

# Run the cleaning pipeline over every row of `df`;
# the returned frame carries the cleaned text in a 'cleaned_article' column
df_cleaned = clean_articles_in_df(df)

In [None]:
# Preview the first rows of the cleaned frame
df_cleaned.head()

Unnamed: 0,title,text,cleaned_article
0,The symbols of Poland’s abortion protests expl...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nThe symbol...,The symbols of Polands abortion protests expla...
1,Impending Threat of Abortion Criminalization B...,\n\n\n\n\n\n\n\nImpending Threat of Abortion C...,Impending Threat of Abortion Criminalization B...
2,New lawsuits target state restrictions on abor...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,New lawsuits target state restrictions on abor...
3,Indiana’s near-total abortion ban leads doctor...,\n\n\n \nIndiana’s near-total abortion ban lea...,Indianas near-total abortion ban leads doctors...
4,Legal challenge to Texas abortion law official...,Legal challenge to Texas abortion law official...,Legal challenge to Texas abortion law official...


In [None]:
# Keep only the title and the cleaned text; the raw 'text' column is left out
df_cleaned = df_cleaned[['title', 'cleaned_article']]

In [None]:
# Preview the frame that will be written to disk
df_cleaned.head()

Unnamed: 0,title,cleaned_article
0,The symbols of Poland’s abortion protests expl...,The symbols of Polands abortion protests expla...
1,Impending Threat of Abortion Criminalization B...,Impending Threat of Abortion Criminalization B...
2,New lawsuits target state restrictions on abor...,New lawsuits target state restrictions on abor...
3,Indiana’s near-total abortion ban leads doctor...,Indianas near-total abortion ban leads doctors...
4,Legal challenge to Texas abortion law official...,Legal challenge to Texas abortion law official...


In [None]:
# Write the cleaned articles to CSV without the index column.
# The target folder is created first so the write does not fail when the
# 'processed' directory does not exist yet.
output_csv_path = '/content/drive/MyDrive/Capstone Project/data/processed/cleaned_articles.csv'
os.makedirs(os.path.dirname(output_csv_path), exist_ok=True)
df_cleaned.to_csv(output_csv_path, index=False)