In [1]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re
import pandas as pd
from typing import List

# If you haven't already, you'll need to download these resources.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\20182877\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\20182877\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
def preprocess_dataframe_content(df: pd.DataFrame) -> List[List[str]]:
    """
    Preprocesses each entry in the 'content' column of the given DataFrame by:
    - Lowercasing
    - Keeping only alphabetic characters
    - Removing stopwords
    - Lemmatizing
    - Filtering out words with length less than 3
    Tokenizes the preprocessed entry into a list of words.
    Returns a list of lists where each inner list is a tokenized and preprocessed entry from the
    'content' column of the DataFrame.

    Parameters:
    - df (pd.DataFrame): The DataFrame containing the 'content' column to be processed.

    Returns:
    - list: A list of lists, where each inner list is a tokenized and preprocessed entry from the
    'content' column of the DataFrame.
    """
    # Prepare lemmatizer and stopwords list
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    # Preprocess the content
    processed_content = []
    for content in df['content']:
        # Keep only alphabetic characters and lowercased
        tokens = re.sub('[^a-zA-Z\s]', '', content.lower().strip()).split()
        # Remove stopwords and short words, and lemmatize
        tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words and len(word) >= 3]
        processed_content.append(tokens)

    return processed_content

In [3]:
def save_preprocessed_dataframe(df: pd.DataFrame, filename: str) -> None:
    """
    Preprocesses the 'content' column of the given DataFrame using the preprocess_dataframe_content function,
    then saves the modified DataFrame to a specified CSV file.

    Parameters:
    - df (pd.DataFrame): The DataFrame containing the 'content' column to be processed.
    - filename (str): The name of the CSV file where the DataFrame should be saved.

    Returns:
    - None: Saves the DataFrame to a CSV file.
    """

    # Preprocess content
    processed_content = preprocess_dataframe_content(df)

    # Convert lists of words back to space-separated strings
    df['content'] = [' '.join(words) for words in processed_content]

    # Save to CSV
    df.to_csv(filename, index=False)

In [4]:
articles = pd.read_csv("data/us_equities_news_dataset.csv")
articles = articles[articles['content'].notna()]
articles

Unnamed: 0,id,ticker,title,category,content,release_date,provider,url,article_id
0,221515,NIO,Why Shares of Chinese Electric Car Maker NIO A...,news,What s happening\nShares of Chinese electric c...,2020-01-15,The Motley Fool,https://invst.ly/pigqi,2060327
1,221516,NIO,NIO only consumer gainer Workhorse Group amon...,news,Gainers NIO NYSE NIO 7 \nLosers MGP Ingr...,2020-01-18,Seeking Alpha,https://invst.ly/pje9c,2062196
2,221517,NIO,NIO leads consumer gainers Beyond Meat and Ma...,news,Gainers NIO NYSE NIO 14 Village Farms In...,2020-01-15,Seeking Alpha,https://invst.ly/pifmv,2060249
3,221518,NIO,NIO NVAX among premarket gainers,news,Cemtrex NASDAQ CETX 85 after FY results \n...,2020-01-15,Seeking Alpha,https://invst.ly/picu8,2060039
4,221519,NIO,PLUG NIO among premarket gainers,news,aTyr Pharma NASDAQ LIFE 63 on Kyorin Pharm...,2020-01-06,Seeking Alpha,https://seekingalpha.com/news/3529772-plug-nio...,2053096
...,...,...,...,...,...,...,...,...,...
221508,443024,T,Crude And Steel Still In Sync,opinion,We have been reporting on the trade off betwee...,2012-10-04,Ivan Kitov,https://www.investing.com/analysis/crude-and-s...,138733
221509,443025,T,Forget AT T This Is The Telecom Stock You Sho...,opinion,It s the largest cell phone provider in the wo...,2012-05-30,StreetAuthority,https://www.investing.com/analysis/forget-at-t...,124829
221510,443026,T,Wall Street Exposed Part 3 How Dividends C...,opinion,Before we dicuss how the mechanism of dividend...,2012-07-16,Portfolio Cafe,https://www.investing.com/analysis/wall-street...,129651
221511,443027,T,Weighing The Week Ahead It s All About Jobs,opinion,From start to finish the coming week will hav...,2012-09-02,Jeff Miller,https://www.investing.com/analysis/weighing-th...,134926


In [5]:
#Create new dataframe with only articles of which the title contains "Apple"
#Not all AAPL ticker contain apple in the title and vice versa.
apple_articles = articles[articles['title'].str.contains('Apple', case=False)]

In [6]:
save_preprocessed_dataframe(apple_articles, 'data/preprocessed_article_data.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['content'] = [' '.join(words) for words in processed_content]
