## Load Data

In [50]:
import re

import pandas as pd

In [53]:
# Read the article table from CSV. The path is relative, so the file must sit in
# the notebook's working directory.
df_news = pd.read_csv("nbc_articles_cleaned.csv")

In [56]:
# Bare expression: the frame is rendered as this cell's output for a quick look.
df_news

Unnamed: 0,title,date,link,author,content,tokens,tokens_with_ngrams,clean_text
0,Cubs infielder Matt Shaw defends missing game ...,2025-09-24T16:54:50.717Z,https://www.nbcnews.com/news/us-news/cubs-matt...,Minyvonne Burke,Chicago Cubs infielder Matt Shaw said he thoug...,"['chicago', 'cubs', 'infielder', 'matt', 'shaw...","['chicago', 'cubs', 'infielder', 'matt', 'shaw...",chicago cubs infielder matt shaw think importa...
1,YouTube to start bringing back creators banned...,2025-09-24T16:54:08.647Z,https://www.nbcnews.com/tech/tech-news/youtube...,The Associated Press,YouTube will offer creators a way to rejoin th...,"['youtube', 'offer', 'creator', 'way', 'rejoin...","['youtube', 'offer', 'creator', 'way', 'rejoin...",youtube offer creator way rejoin stream platfo...
2,A trio of space weather satellites blast off t...,2025-09-24T14:15:16.424Z,https://www.nbcnews.com/science/science-news/t...,The Associated Press,"CAPE CANAVERAL, Fla. — A cluster of space weat...","['cape', 'canaveral', 'fla.', 'cluster', 'spac...","['cape', 'canaveral', 'fla.', 'cluster', 'spac...",cape canaveral fla. cluster space weather sate...
3,Trump administration rehires hundreds of feder...,2025-09-24T13:35:34.846Z,https://www.nbcnews.com/politics/trump-adminis...,The Associated Press,MIAMI — Hundreds of federal employees who lost...,"['miami', 'federal', 'employee', 'lose', 'job'...","['miami', 'federal', 'employee', 'lose', 'job'...",miami federal employee lose job elon_musk cost...
4,NASA introduces its newest astronauts,2025-09-23T13:15:55.028Z,https://www.nbcnews.com/science/science-news/n...,The Associated Press,"CAPE CANAVERAL, Fla. — NASA introduced its new...","['cape', 'canaveral', 'fla.', 'nasa', 'introdu...","['cape', 'canaveral', 'fla.', 'nasa', 'introdu...",cape canaveral fla. nasa introduce new astrona...
...,...,...,...,...,...,...,...,...
5920,"Good news, bad news for station",2003-10-20T19:30:10.000Z,https://www.nbcnews.com/id/wbna3226810,NBC News,A Russian spacecraft filled in for the second ...,"['russian', 'spacecraft', 'fill', 'second', 't...","['russian', 'spacecraft', 'fill', 'second', 't...",russian spacecraft fill second time u.s. shutt...
5921,Documents detail shuttle what-ifs,2003-10-09T20:37:06.000Z,https://www.nbcnews.com/id/wbna3077563,James Oberg,An in-depth NASA study concludes that while th...,"['depth', 'nasa', 'study', 'conclude', 'crew',...","['depth', 'nasa', 'study', 'conclude', 'crew',...",depth nasa study conclude crew columbia able s...
5922,Model planes put high-tech eyes in the sky,2003-04-10T18:40:48.000Z,https://www.nbcnews.com/id/wbna3077238,By Alan Boyle,NASA researchers are literally holding the fut...,"['nasa', 'researcher', 'literally', 'hold', 'f...","['nasa', 'researcher', 'literally', 'hold', 'f...",nasa researcher literally hold future aerial m...
5923,Shuttle probe follows a trail of data,2003-03-16T19:49:36.000Z,https://www.nbcnews.com/id/wbna3077573,James Oberg,Investigators looking into the loss of the shu...,"['investigator', 'look', 'loss', 'shuttle', 'c...","['investigator', 'look', 'loss', 'shuttle', 'c...",investigator look loss shuttle columbia seven ...


In [64]:
# Keep only articles dated 2023-2025.
# - The pattern is anchored with ^ so it tests the leading year of the timestamp
#   rather than matching those digits anywhere in the string.
# - .copy() makes the result an independent frame, so later column assignments
#   do not write into a slice of the original (pandas may otherwise emit
#   SettingWithCopyWarning).
df_news = df_news[df_news['date'].astype(str).str.contains(r'^(?:2023|2024|2025)')].copy()

In [65]:
# Topic label -> keywords that assign an article to that topic.
# Entry order matters: flag_cases walks this mapping in order and returns the
# first label whose keywords match.
cases = {
    'Tesla/Electric Car': [
        'Tesla',
        'electric car',
        'EV',
    ],
    'SpaceX/NASA': [
        'SpaceX',
        'NASA',
        'rocket',
        'astronaut',
    ],
}

In [66]:
def flag_cases(text, case_keywords=None):
    """Return the first topic label whose keywords occur in ``text``.

    Keywords are matched case-insensitively as whole words (an optional
    trailing "s" is accepted so simple plurals such as "EVs" or "rockets"
    still count). Whole-word matching is essential for short keywords:
    a plain substring test for "EV" also fires on "every", "even", "never",
    "seven", ..., which labels almost any article as Tesla/Electric Car.

    Parameters
    ----------
    text : str or missing value
        Article text. Non-string values are converted with ``str()``.
    case_keywords : dict[str, list[str]], optional
        Mapping of label -> keywords. Defaults to the module-level ``cases``.

    Returns
    -------
    str or None
        The first matching label in mapping order, or ``None`` when the text
        is missing or no keyword matches.
    """
    if pd.isna(text):  # missing content cannot match anything
        return None
    if case_keywords is None:
        case_keywords = cases

    haystack = str(text)
    for case, keywords in case_keywords.items():
        for keyword in keywords:
            # \b ... s?\b : whole word or phrase, optionally pluralised.
            pattern = r"\b" + re.escape(keyword) + r"s?\b"
            if re.search(pattern, haystack, flags=re.IGNORECASE):
                return case
    return None

In [67]:
# Label each article's 'content' with flag_cases and store it as 'flag_case'.
# assign() returns a new frame instead of writing a column into a frame that
# came out of boolean filtering, which can trigger SettingWithCopyWarning.
df_news = df_news.assign(flag_case=df_news['content'].apply(flag_cases))

## Sentiment analysis with NLTK (VADER)

In [68]:
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [69]:
# Make sure the VADER lexicon is available, downloading it only when missing.
try:
    # Raises LookupError when the resource is not in any nltk data directory,
    # so an already-installed lexicon needs no network access on re-runs.
    nltk.data.find('sentiment/vader_lexicon.zip')
except LookupError:
    # nltk.download() reports failure by returning False rather than raising,
    # so the return value has to be checked explicitly. Raise instead of
    # calling exit(): in a notebook an exception stops the run with a visible
    # error, whereas exit() shuts the session down.
    if not nltk.download('vader_lexicon', quiet=True):
        raise RuntimeError(
            "Error downloading NLTK VADER lexicon. "
            "Please check your internet connection and try again."
        ) from None

In [70]:
# Initialize the VADER sentiment analyzer once at module level; the scoring
# code below reuses this single instance for every article.
sia = SentimentIntensityAnalyzer()

# Define a function to get the sentiment label from the compound score
def get_sentiment_label(text):
    """Classify ``text`` as 'Positive', 'Negative' or 'Neutral'.

    Missing values (per ``pd.isna``) are labelled 'Neutral' without being
    scored. Anything else is converted with ``str()`` and scored by the
    module-level ``sia`` analyzer; a compound score >= 0.05 is 'Positive',
    <= -0.05 is 'Negative', and everything in between is 'Neutral'.
    """
    if not pd.isna(text):
        compound = sia.polarity_scores(str(text))['compound']
        if compound >= 0.05:
            return 'Positive'
        if compound <= -0.05:
            return 'Negative'
    return 'Neutral'

In [71]:
def _label_from_compound(score):
    """Map a VADER compound score to a label (same cut-offs as get_sentiment_label)."""
    if score >= 0.05:
        return 'Positive'
    if score <= -0.05:
        return 'Negative'
    return 'Neutral'


# Score every article once with VADER and derive the label from that score,
# rather than running the analyzer twice per article. Missing content gets a
# compound score of 0, which falls in the 'Neutral' band, so the labels are
# the same as calling get_sentiment_label on the raw text.
compound_scores = df_news['content'].apply(
    lambda text: sia.polarity_scores(str(text))['compound'] if pd.notna(text) else 0
)

# assign() returns a new frame, so no column is written into a filtered slice.
df_news = df_news.assign(
    sentiment_label=compound_scores.apply(_label_from_compound),
    compound_score=compound_scores,
)

# Display the first few rows to show the new columns
print("DataFrame with sentiment analysis results:")
print(df_news[['title', 'sentiment_label', 'compound_score']].head())

# Save the updated DataFrame to a new CSV file (row index is not written)
output_file = "nbc_articles_with_sentiment.csv"
df_news.to_csv(output_file, index=False)
print(f"\nUpdated data saved to '{output_file}'")

DataFrame with sentiment analysis results:
                                               title sentiment_label  \
0  Cubs infielder Matt Shaw defends missing game ...         Neutral   
1  YouTube to start bringing back creators banned...        Negative   
2  A trio of space weather satellites blast off t...        Positive   
3  Trump administration rehires hundreds of feder...        Negative   
4              NASA introduces its newest astronauts        Positive   

   compound_score  
0          0.0478  
1         -0.9927  
2          0.9929  
3         -0.8658  
4          0.9843  

Updated data saved to 'nbc_articles_with_sentiment.csv'
