In [1]:
import pandas as pd
import numpy as np
import acquire as a
import re
import unicodedata
import nltk
from nltk.tokenize import ToktokTokenizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

1. Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:
    * Lowercase everything
    * Normalize unicode characters
    * Replace anything that is not a letter, number, whitespace or a single quote.

In [2]:
def basic_clean(text):
    """
    Apply basic text cleaning to the input string.
    
    This function performs the following steps:
    - Lowercase the text.
    - Normalize unicode characters.
    - Replace characters that are not letters, numbers, whitespace, or single quotes.
    
    Parameters:
    - text: The input text to be cleaned.
    
    Returns:
    Cleaned text after applying the basic cleaning operations.
    """
    # Lowercase the text
    text = text.lower()
    
    # Normalize unicode characters
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    
    #Replace any characters that are not letters, numbers, spaces, or single quotes.

    text = re.sub(r"[^a-z0-9'\s]", '', text)
    return text

2. Define a function named tokenize. It should take in a string and tokenize all the words in the string.

In [3]:
def tokenize(text):
    """
    Tokenize words in the input string.
    
    This function tokenizes the input text into individual words.
    
    Parameters:
    - text: The input text to be tokenized.
    
    Returns:
    A list of tokenized words.
    """
    tokenizer = ToktokTokenizer()
    tokens = tokenizer.tokenize(text)

3. Define a function named stem. It should accept some text and return the text after applying stemming to all the words.

In [4]:
def stem(text):
    """
    Apply stemming to words in the input text.
    
    This function applies stemming to each word in the input text using the Porter Stemmer algorithm.
    
    Parameters:
    - text: The input text to apply stemming to.
    
    Returns:
    Text with words after stemming.
    """
    stemmer = PorterStemmer()
    stemmed_words = [stemmer.stem(word) for word in nltk.word_tokenize(text)]
    stemmed_text = ' '.join(stemmed_words)
    return stemmed_text

4. Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

In [5]:
def lemmatize(text):
    """
    Apply lemmatization to words in the input text.
    
    This function applies lemmatization to each word in the input text using the WordNet Lemmatizer.
    
    Parameters:
    - text: The input text to apply lemmatization to.
    
    Returns:
    Text with words after lemmatization.
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in nltk.word_tokenize(text)]
    lemmatized_text = ' '.join(lemmatized_words)
    return lemmatized_text

5. Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords. This function should define two optional parameters, extra_words and exclude_words. These parameters should define any additional stop words to include, and any words that we don't want to remove.

In [6]:
def remove_stopwords(text, extra_words=None, exclude_words=None):
    """
    Remove stopwords from the input text.
    
    Parameters:
    - text (str): The input text to remove stopwords from.
    - extra_words (list): List of additional words to include as stopwords.
    - exclude_words (list): List of words not to be removed.
    
    Returns:
    str: Text with stopwords removed.
    """
    # Load the stopwords list from the NLTK library
    stopword_list = set(stopwords.words('english'))
    
    # Add extra words to the stopwords list if provided
    if extra_words:
        stopword_list.update(extra_words)
        
    # Remove excluded words from the stopwords list if provided
    if exclude_words:
        stopword_list.difference_update(exclude_words)
        
    # Tokenize the input text into individual words
    words = nltk.word_tokenize(text)
    
    # Filter out words that are in the stopwords list
    filtered_words = [word for word in words if word.lower() not in stopword_list]
    
    # Reconstruct the filtered words into a text string
    filtered_text = ' '.join(filtered_words)
    
    return filtered_text

6. Use your data from the acquire to produce a dataframe of the news articles. Name the dataframe news_df.

In [7]:
# creatings news df
news=pd.read_json('news_articles.json')
pd.set_option("display.max_colwidth", None)
news_df=pd.DataFrame(news)

FileNotFoundError: File news_articles.json does not exist

In [8]:
news_df.head()

NameError: name 'news_df' is not defined

In [9]:
news_df['clean'] = news_df['content'].apply(lambda x: remove_stopwords(stem(basic_clean(' '.join(tokenize(x))))))
news_df['stemmed']=news_df['clean'].apply(stem)
news_df['lemmatized']=news_df['clean'].apply(lemmatize)

NameError: name 'news_df' is not defined

In [10]:
news_df.head()

NameError: name 'news_df' is not defined

7. Make another dataframe for the Codeup blog posts. Name the dataframe codeup_df.

In [11]:
#creating codeup df
codeup=pd.read_json('blog_posts.json')
pd.set_option("display.max_colwidth", None)
codeup_df=pd.DataFrame(codeup)

In [12]:
codeup_df.head(2)

Unnamed: 0,title,link,date_published,content
0,Spotlight on APIDA Voices: Celebrating Heritage and Inspiring Change ft. Arbeena Thapa,https://codeup.edu/featured/apida-heritage-month/,"May 24, 2023","\nMay is traditionally known as Asian American and Pacific Islander (AAPI) Heritage Month. This month we celebrate the history and contributions made possible by our AAPI friends, family, and community. We also examine our level of support and seek opportunities to better understand the AAPI community.\n\nIn an effort to address real concerns and experiences, we sat down with Arbeena Thapa, one of Codeup’s Financial Aid and Enrollment Managers.\nArbeena identifies as Nepali American and Desi. Arbeena’s parents immigrated to Texas in 1988 for better employment and educational opportunities. Arbeena’s older sister was five when they made the move to the US. Arbeena was born later, becoming the first in her family to be a US citizen.\nAt Codeup we take our efforts at inclusivity very seriously. After speaking with Arbeena, we were taught that the term AAPI excludes Desi-American individuals. Hence, we will now use the term Asian Pacific Islander Desi American (APIDA).\nHere is how the rest of our conversation with Arbeena went!\nHow do you celebrate or connect with your heritage and cultural traditions?\n“I celebrate Nepal’s version of Christmas or Dashain. This is a nine-day celebration also known as Dussehra. I grew up as Hindu and I identify as Hindu, this is a very large part of my heritage. “\n“Other ways I connect with my culture include sharing food! Momos are South Asian Dumplings and they’re my favorite to make and share.”\n“On my Asian American side, I am an advocate of immigrant justice and erasure within APIDA social or political movements. I participate in events to embrace my identity such as immigrant justice advocacy because I come from a mixed-status family. I’ve always been in a community with undocumented Asian immigrants. .”\nWhat are some of the challenges you have faced as an APIDA individual, personally or professionally?\n“I often struggle with being gendered as compliant or a pushover. Professionally, I am often stereotyped as meek, so I’ve been overlooked for leadership roles. We are seen as perpetually foreign; people tend to other us in that way, yet put us on a pedestal for what a model minority looks like. This has made me hesitant to share my heritage in the past because these assumptions get mapped onto me. ”\nCan you describe some common barriers of entry that APIDA individuals, specifically women may face when trying to enter or advance in the workplace?\n“Being overlooked for leadership. In the past, I have not been viewed as a leader. People sometimes have preconceived stereotypes of Asian women not being able to be bold, or being vocal can be mistaken for being too emotional. “\nHow do you believe microaggressions impact APIDA individuals in the workplace? Can you provide examples of such microaggressions?\n“Erasure is big. To me, only saying ‘Merry Christmas’ isn’t inclusive to other religions. People are often resistant to saying ‘Happy Holidays,’ but saying Merry Christmas excludes, and does not appreciate my heritage. “\n“Often microaggressions are not micro at all. They typically are not aggressive racialized violence, but the term ‘micro’ minimizes impact.”\n“Some that I’ve heard are ‘What kind of Asian are you?’ or ‘Where are you from?’ This automatically makes me the ‘other’ and not seen as American. Even within the APIDA community, South Asians are overlooked as “Asian”.”\nHow important is representation, specifically APIDA representation, in organizational leadership positions?\n“I want to say that it is important to have someone who looks like you in leadership roles, and it is, but those leaders may not share the same beliefs as you. Certain privileges such as wealth, resources, or lack of interaction with lower-socioeconomic-status Asian Americans may cause a difference in community politics. I do not think the bamboo ceiling is acceptable, but the company you work for plays a big part in your politics and belief alignment.”\nHow do you feel about code-switching, and have you ever felt it necessary to code-switch?\n“I like sharing South Asian terms or connecting with others that have similar heritage and culture. A workplace that is welcoming to going into this sort of breakout is refreshing and makes space for us. However, having to code-switch could also mean a workplace that is not conducive and welcoming of other cultures. “\nFinally, in your opinion, what long-term strategies can create lasting change in the workplace and ensure support, equality, and inclusion for APIDA individuals?\n“Prior to a career in financial aid, I did a lot of research related to the post-9/11 immigration of the South Asian diaspora. This background made me heavily rely on grassroots organizing. Hire the people that want to innovate, hire the changemakers, hire the button-pushers. Reduce reliance on whiteness as change. This will become natural for the organization and become organizational change. Change comes from us on the ground.”\nA huge thank you to Arbeena Thapa for sharing her experiences, and being vulnerable with us. Your words were inspiring and the opportunity to understand your perspective more has been valuable. We hope we can become better support for the APIDA community as we learn and grow on our journey of cultivating inclusive growth.\n"
1,Women in tech: Panelist Spotlight – Magdalena Rahn,https://codeup.edu/featured/women-in-tech-panelist-spotlight/,"Mar 28, 2023","\nWomen in tech: Panelist Spotlight – Magdalena Rahn\nCodeup is hosting a Women in Tech Panel in honor of Women’s History Month on March 29th, 2023! To further celebrate, we’d like to spotlight each of our panelists leading up to the discussion to learn a bit about their respective experiences as women in the tech industry!\n\nMeet Magdalena!\nMagdalena Rahn is a current Codeup student in a Data Science cohort in San Antonio, Texas. She has a professional background in cross-cultural communications, international business development, the wine industry and journalism. After serving in the US Navy, she decided to complement her professional skill set by attending the Data Science program at Codeup; she is set to graduate in March 2023. Magdalena is fluent in French, Bulgarian, Chinese-Mandarin, Spanish and Italian.\nWe asked Magdalena how Codeup impacted her career, and she replied “Codeup has provided a solid foundation in analytical processes, programming and data science methods, and it’s been an encouragement to have such supportive instructors and wonderful classmates.”\nDon’t forget to tune in on March 29th to sit in on an insightful conversation with Magdalena.\n"


In [13]:
codeup_df['clean'] = codeup_df['content'].apply(lambda x: remove_stopwords(stem(basic_clean(' '.join(tokenize(x))))))
codeup_df['stemmed']=codeup_df['clean'].apply(stem)
codeup_df['lemmatized']=codeup_df['clean'].apply(lemmatize)

TypeError: can only join an iterable

8. For each dataframe, produce the following columns:
    * title to hold the title
    * original to hold the original article/post content
    * clean to hold the normalized and tokenized original with the stopwords removed.
    * stemmed to hold the stemmed version of the cleaned data.
    * lemmatized to hold the lemmatized version of the cleaned data.

In [14]:
def add_columns(df):
    """Adds cleaned and processed columns to the input DataFrame.
    """
    
    df['clean'] = df['content'].apply(lambda x: remove_stopwords(stem(basic_clean(' '.join(tokenize(x))))))
    df['stemmed'] = df['clean'].apply(stem)
    df['lemmatized'] = df['clean'].apply(lemmatize)
    return df

In [15]:
updated_codeup_df=add_columns(codeup_df)
updated_codeup_df.head()
updated_codeup_df.to_csv('codeup_df.csv',index=False)

TypeError: can only join an iterable

In [16]:
updated_news_df=add_columns(news_df)
updated_news_df.head()
updated_news_df.to_csv('news_df.csv',index=False)

NameError: name 'news_df' is not defined

In [17]:
df=pd.read_csv('news_df.csv')
df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'news_df.csv'

9. Ask yourself:
    * If your corpus is 493KB, would you prefer to use stemmed or lemmatized text?
    * If your corpus is 25MB, would you prefer to use stemmed or lemmatized text?
    * If your corpus is 200TB of text and you're charged by the megabyte for your hosted computational resources, would you prefer to use stemmed or lemmatized text?