In [1]:
import json
import re
import unicodedata

import pandas as pd

import prepare as prep

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jongarcia/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### 1- Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:

- Lowercase everything
- Normalize unicode characters
- Remove anything that is not a letter, number, whitespace, or a single quote.

In [None]:
def basic_clean(data):
    """Normalize a string to lowercase ASCII letters, digits, apostrophes and whitespace.

    Parameters
    ----------
    data : str
        Raw text to clean.

    Returns
    -------
    str
        The text after Unicode normalization, lowercasing, and removal of
        every character outside ``a-z``, ``0-9``, ``'`` and whitespace.
    """
    # Decompose characters into their compatibility form, then drop whatever
    # cannot be represented in ASCII (combining accents, symbols, etc.)
    ascii_text = (
        unicodedata.normalize('NFKD', data)
        .encode('ascii', 'ignore')
        .decode('utf-8', 'ignore')
    )

    # Lowercase AFTER normalizing: some characters only turn into ASCII
    # capitals once decomposed, and lowercasing first would leave those
    # capitals to be stripped by the filter below
    lowered = ascii_text.lower()

    # Remove any characters that are not lowercase letters, numbers, apostrophes, or whitespace
    return re.sub(r"[^a-z0-9'\s]", "", lowered)



### 2- Define a function named tokenize. It should take in a string and tokenize all the words in the string.


In [None]:
def tokenize(data):
    """Tokenize a string with a ``ToktokTokenizer``.

    Parameters
    ----------
    data : str
        Text to tokenize.

    Returns
    -------
    The tokenizer's result for ``data``; ``return_str=True`` requests the
    tokenized text as a string rather than a list of tokens.

    NOTE(review): ``ToktokTokenizer`` is not imported in the cells visible
    here -- confirm it is in scope before calling this function.
    """
    # A fresh tokenizer is built on every call, exactly one per input string
    return ToktokTokenizer().tokenize(data, return_str=True)



### 3- Define a function named stem. It should accept some text and return the text after applying stemming to all the words.


In [None]:
def stem(data):
    """Apply Porter stemming to every whitespace-separated word in a string.

    Parameters
    ----------
    data : str
        Text whose words should be stemmed.

    Returns
    -------
    str
        The stemmed words joined by single spaces.

    NOTE(review): ``nltk`` is not imported in the cells visible here --
    confirm it is in scope before calling this function.
    """
    # Use the documented nltk.stem entry point (the same package lemmatize
    # uses) rather than the incidental top-level `nltk.porter` alias
    stemmer = nltk.stem.PorterStemmer()

    # Split on whitespace, stem each word, and join back with single spaces
    return ' '.join(stemmer.stem(word) for word in data.split())



### 4- Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

In [None]:
def lemmatize(data):
    """Lemmatize every whitespace-separated word in a string.

    Parameters
    ----------
    data : str
        Text whose words should be lemmatized.

    Returns
    -------
    str
        The lemmatized words joined by single spaces.

    NOTE(review): ``nltk`` is not imported in the cells visible here --
    confirm it is in scope before calling this function.
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()

    # Split on whitespace and lemmatize each word with the default settings
    lemmatized_words = map(lemmatizer.lemmatize, data.split())

    return ' '.join(lemmatized_words)



In [None]:
# Function to apply cleaning and processing functions from prepare.py
def process_dataframe(df):
    """Build the original/clean/stemmed/lemmatized columns for an article dataframe.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``content`` column holding the raw article text.

    Returns
    -------
    pandas.DataFrame
        A new frame in which ``content`` is replaced by ``original`` (the raw
        text), ``clean`` (basic_clean -> tokenize -> remove_stopwords),
        ``stemmed`` and ``lemmatized``. The input frame is not modified.

    Raises
    ------
    KeyError
        If ``df`` has no ``content`` column.
    """
    def _clean(text):
        # Use the plain English stopword list: no extra words, no exclusions
        return remove_stopwords(
            tokenize(basic_clean(text)), extra_words=[], exclude_words=[]
        )

    # assign() returns a new frame, so the caller's dataframe keeps its
    # original columns and re-running the cell gives the same result
    processed = df.assign(original=df['content'])

    processed['clean'] = processed['original'].apply(_clean)
    processed['stemmed'] = processed['clean'].apply(stem)
    processed['lemmatized'] = processed['clean'].apply(lemmatize)

    # The raw text now lives in 'original', so 'content' is redundant
    return processed.drop(columns='content')




### 5- Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.

- This function should define two optional parameters, extra_words and exclude_words. These parameters should define any additional stop words to include, and any words that we don't want to remove.

In [None]:
def remove_stopwords(data, extra_words=None, exclude_words=None):
    """Remove English stopwords from a whitespace-separated string.

    Parameters
    ----------
    data : str
        Text to filter.
    extra_words : iterable of str, optional
        Additional words to treat as stopwords.
    exclude_words : iterable of str, optional
        Words that must NOT be treated as stopwords, even if they appear in
        the English list or in ``extra_words``.

    Returns
    -------
    str
        The remaining words joined by single spaces.

    NOTE(review): ``stopwords`` is not imported in the cells visible here --
    confirm it is in scope before calling this function.
    """
    # A set gives constant-time membership tests and, unlike list.remove(),
    # guarantees an excluded word is gone even if it was listed twice
    stopword_set = set(stopwords.words('english'))

    # Both parameters are optional; None means "nothing to add/remove"
    stopword_set.update(extra_words or [])
    stopword_set.difference_update(exclude_words or [])

    # Split the data into individual words and keep only non-stopwords
    return ' '.join(word for word in data.split() if word not in stopword_set)

### 6. Use your data from the acquire step to produce a dataframe of the news articles. Name the dataframe news_df.

In [2]:
# Open and read the JSON file; the encoding is pinned to UTF-8 so the read
# does not depend on the platform's default text encoding
with open('inshorts_articles.json', "r", encoding='utf-8') as json_file:
    news_articles = json.load(json_file)

# Create a DataFrame from the JSON data
news_df = pd.DataFrame(news_articles)

# Check the DataFrame
news_df.head()

Unnamed: 0,title,content
0,Govt probing accounts of Adani Group-run Mumba...,The Ministry of Corporate Affairs has opened a...
1,IndiGo Co-founder Gangwal to buy SpiceJet stak...,IndiGo Co-founder Rakesh Gangwal is at advance...
2,"Who is KP Ramasamy, a farmer's son who entered...",KP Ramasamy has made his debut on Forbes India...
3,"Zomato, McDonald's fined ₹1 lakh for deliverin...",Food delivery platform Zomato and fast food ch...
4,India scraps plan to impose restrictions on la...,The Indian government has reversed its decisio...


### 7. Make another dataframe for the Codeup blog posts. Name the dataframe codeup_df.

In [3]:
# Open and read the JSON file; the encoding is pinned to UTF-8 so the read
# does not depend on the platform's default text encoding
with open('codeup_blog_articles.json', "r", encoding='utf-8') as json_file:
    blog_articles = json.load(json_file)

# Create a DataFrame from the JSON data
codeup_df = pd.DataFrame(blog_articles)

# Check the DataFrame
codeup_df.head()

Unnamed: 0,title,content
0,Spotlight on APIDA Voices: Celebrating Heritag...,May is traditionally known as Asian American a...
1,Women in tech: Panelist Spotlight – Magdalena ...,Codeup is hosting a Women in Tech Panel in hon...
2,Women in tech: Panelist Spotlight – Rachel Rob...,Codeup is hosting a Women in Tech Panel in hon...
3,Women in Tech: Panelist Spotlight – Sarah Mellor,Codeup is hosting a Women in Tech Panel in hon...
4,Women in Tech: Panelist Spotlight – Madeleine ...,Codeup is hosting a Women in Tech Panel in hon...


In [6]:
# Number of rows (blog posts) in codeup_df
len(codeup_df)

270

In [8]:
# Show any posts whose content is missing (NaN); index codeup_df directly
# rather than through a leftover alias that is not defined in this notebook
codeup_df[codeup_df['content'].isna()]

Unnamed: 0,title,content,original,clean,stemmed,lemmatized


In [11]:
# Inspect the post at position 66, whose text columns display as blank
codeup_df.iloc[66]

title         Why Should I Become a System Administrator?
content                                                  
original                                                 
clean                                                    
stemmed                                                  
lemmatized                                               
Name: 66, dtype: object

In [15]:
# Check the Python type of that post's content value
type(codeup_df.content.iloc[66])

str

In [17]:
# Show the raw content value for the post at position 66
codeup_df.content.iloc[66]

''

In [9]:
# Count missing (NaN) values per column; empty strings are not counted as missing
codeup_df.isna().sum()

title         0
content       0
original      0
clean         0
stemmed       0
lemmatized    0
dtype: int64

### 8. For each dataframe, produce the following columns:


- title to hold the title
- original to hold the original article/post content
- clean to hold the normalized and tokenized original with the stopwords removed.
- stemmed to hold the stemmed version of the cleaned data.
- lemmatized to hold the lemmatized version of the cleaned data.


In [4]:
# Run the cleaning pipeline from prepare.py on the news articles
# NOTE(review): the inspection outputs earlier in the notebook show codeup_df
# itself carrying the derived columns after processing, so this call appears
# to modify its input frame as well -- confirm against prepare.py
news_df_processed = prep.process_dataframe(news_df)

# Display the first few rows of the processed dataframe to verify the new columns
news_df_processed.head()

Unnamed: 0,title,original,clean,stemmed,lemmatized
0,Govt probing accounts of Adani Group-run Mumba...,The Ministry of Corporate Affairs has opened a...,ministry corporate affairs opened investigatio...,ministri corpor affair open investig account a...,ministry corporate affair opened investigation...
1,IndiGo Co-founder Gangwal to buy SpiceJet stak...,IndiGo Co-founder Rakesh Gangwal is at advance...,indigo cofounder rakesh gangwal advanced stage...,indigo cofound rakesh gangwal advanc stage tal...,indigo cofounder rakesh gangwal advanced stage...
2,"Who is KP Ramasamy, a farmer's son who entered...",KP Ramasamy has made his debut on Forbes India...,kp ramasamy made debut forbes india ' 100 rich...,kp ramasami made debut forb india ' 100 riches...,kp ramasamy made debut forbes india ' 100 rich...
3,"Zomato, McDonald's fined ₹1 lakh for deliverin...",Food delivery platform Zomato and fast food ch...,food delivery platform zomato fast food chain ...,food deliveri platform zomato fast food chain ...,food delivery platform zomato fast food chain ...
4,India scraps plan to impose restrictions on la...,The Indian government has reversed its decisio...,indian government reversed decision imposing r...,indian govern revers decis impos restrict lapt...,indian government reversed decision imposing r...


In [5]:
# Run the cleaning pipeline from prepare.py on the Codeup blog posts
codeup_df_processed = prep.process_dataframe(codeup_df)

# Display the first few rows of the processed dataframe to verify the new columns
codeup_df_processed.head()

Unnamed: 0,title,original,clean,stemmed,lemmatized
0,Spotlight on APIDA Voices: Celebrating Heritag...,May is traditionally known as Asian American a...,may traditionally known asian american pacific...,may tradit known asian american pacif island a...,may traditionally known asian american pacific...
1,Women in tech: Panelist Spotlight – Magdalena ...,Codeup is hosting a Women in Tech Panel in hon...,codeup hosting women tech panel honor womens h...,codeup host women tech panel honor women histo...,codeup hosting woman tech panel honor woman hi...
2,Women in tech: Panelist Spotlight – Rachel Rob...,Codeup is hosting a Women in Tech Panel in hon...,codeup hosting women tech panel honor womens h...,codeup host women tech panel honor women histo...,codeup hosting woman tech panel honor woman hi...
3,Women in Tech: Panelist Spotlight – Sarah Mellor,Codeup is hosting a Women in Tech Panel in hon...,codeup hosting women tech panel honor womens h...,codeup host women tech panel honor women histo...,codeup hosting woman tech panel honor woman hi...
4,Women in Tech: Panelist Spotlight – Madeleine ...,Codeup is hosting a Women in Tech Panel in hon...,codeup hosting women tech panel honor womens h...,codeup host women tech panel honor women histo...,codeup hosting woman tech panel honor woman hi...
