In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Render inline figures in the high-resolution 'retina' format so charts can be
# copied/pasted or saved straight from the notebook.
# See:
# https://ipython.org/ipython-doc/3/api/generated/IPython.display.html
# NOTE(review): newer IPython releases deprecate this helper in favour of
# matplotlib_inline.backend_inline.set_matplotlib_formats -- confirm against the
# installed version. `quality` is documented as a JPEG setting, so it presumably
# has no effect on the 'retina' format -- TODO confirm.
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('retina', quality=100)

# Equivalent notebook "magic" that works in Colab/Jupyter:
# %config InlineBackend.figure_format='retina'

import nltk
from nltk.tokenize import word_tokenize, RegexpTokenizer
# The NLTK data packages below must be downloaded once per environment before
# first use; uncomment and run these lines if the stopword lookup fails.
# NOTE(review): 'punkt' is presumably only needed by word_tokenize, which the
# cells shown here never call -- confirm before removing.
# nltk.download('stopwords')
# nltk.download('punkt')
from nltk.corpus import stopwords
# English stopword list; process_corpus uses it to filter headline tokens.
stop_words = stopwords.words('english')

In [None]:
# Load the processed headline dataset from disk.
df = pd.read_csv('finalnewsdataprocessed.csv')

# Keep only the columns used for the sentiment overview, in display order.
overview_columns = [
    'Source',
    'Source Bias',
    'Author',
    'Title',
    'Overall Sentiment',
    'compound score',
    'negative score',
    'positive score',
    'neutral score',
]
df_final = df[overview_columns]

# Show the selection ordered by source (the sorted frame is displayed only;
# df_final itself is left unsorted).
df_final.sort_values(by='Source')

In [None]:
# Total number of headlines in the dataset (count() leaves out missing titles)
df['Title'].count()

In [None]:
# Set style for all graphs: every chart drawn after this cell uses matplotlib's
# 'fivethirtyeight' style sheet.
plt.style.use('fivethirtyeight')

In [None]:
# Pie chart of each news source's share of the headlines in the dataset.
# Slice labels are turned off (labeldistance=None) and shown in the legend
# instead; autopct prints each slice's percentage with two decimals.
pie = df['Source'].value_counts().plot.pie(
    autopct='%.2f%%',
    fontsize=12,
    figsize=(12, 12),
    labeldistance=None,
    legend=True,
    shadow=True,
    title='News Sources in Dataset',
)
plt.show()


In [None]:
# Bar graph of the number of headlines per news source. Title and axis labels
# are set so the figure stands alone like the other charts in this notebook.
df['Source'].value_counts().plot.bar(title='Number of Headlines by News Source')
plt.xlabel('News Source')
plt.ylabel('Number of Headlines')
plt.show()


In [None]:
# Bar graph of mean negative/positive/neutral sentiment scores by source
(
    df[['Source', 'negative score', 'positive score', 'neutral score']]
    .groupby('Source')
    .mean()
    .sort_values(by='Source', ascending=False)
    .plot.barh(figsize=(10, 8), title='Sentiment Scores by Source')
)
plt.xlabel('Sentiment Score')
plt.ylabel('News Source')
plt.tight_layout()
plt.show()

# Per-source mean of every numeric column; reused by the cells that follow.
# numeric_only=True is required because the full frame also carries
# non-numeric columns, which pandas >= 2.0 refuses to average (TypeError)
# instead of silently dropping them.
scores = df.groupby('Source').mean(numeric_only=True)


In [None]:
# Horizontal bars: mean compound sentiment score for each news source
compound_ax = scores['compound score'].plot.barh(
    title='Compound Sentiment Scores by News Source',
)
compound_ax.set_xlabel('Sentiment Score')
compound_ax.set_ylabel('News Source')
plt.show()

In [None]:
# Horizontal bars: mean positive sentiment score for each news source
positive_ax = scores['positive score'].plot.barh(
    color='y',
    title='Positive Sentiment Scores by News Source',
)
positive_ax.set_xlabel('Sentiment Score')
positive_ax.set_ylabel('News Source')
plt.show()

In [None]:
# Horizontal bars: mean negative sentiment score for each news source
negative_ax = scores['negative score'].plot.barh(
    color='r',
    title='Negative Sentiment Scores by News Source',
)
negative_ax.set_xlabel('Sentiment Score')
negative_ax.set_ylabel('News Source')
plt.show()

In [None]:
# Horizontal bars: mean neutral sentiment score for each news source
neutral_ax = scores['neutral score'].plot.barh(
    color='green',
    title='Neutral Sentiment Scores by News Source',
)
neutral_ax.set_xlabel('Sentiment Score')
neutral_ax.set_ylabel('News Source')
plt.show()

In [None]:
# Average sentiment scores: one row per news source, as produced by the
# groupby('Source') mean computed earlier in the notebook.
scores

In [None]:
# Overall mean of the compound score across every headline (not per source)
compound_scores = df['compound score']
mean_compound = compound_scores.mean()
print(f'The average compound sentiment score is {mean_compound:.2f}.')

In [None]:
# Box plot of the per-source average compound scores (one data point per
# news source, taken from `scores`)
box_ax = scores['compound score'].plot.box(
    title='Box Plot of Compound Sentiment Scores',
)
box_ax.set_ylabel('Compound Sentiment Score')
box_ax.set_xlabel('All News Sources')
plt.show()

In [None]:
# Boxplot of compound scores by news sources (one box per source, built from
# the individual headline scores)
df.boxplot(column='compound score', by='Source', figsize=(10, 10))
# With `by=...` pandas adds its own "Boxplot grouped by Source" figure title;
# clear it so only the custom title set below is shown.
plt.suptitle('')
# Source names are long, so rotate the tick labels to keep them readable.
plt.xticks(rotation='vertical')
plt.title('Boxplot of Compound Score by News Source')
plt.xlabel('News Sources')
plt.ylabel('Sentiment Score')
plt.show()

In [None]:
# Work on a copy so the numeric sentiment codes in `df` stay available to the
# later cells that filter on them.
df_sentiments = df.copy()

# Translate the numeric codes into readable labels in one vectorised step.
# Series.replace leaves any value outside the mapping untouched, which matches
# the row-by-row loop this replaces, and it avoids writing strings into a
# numeric column one cell at a time.
sentiment_labels = {-1: 'Negative', 0: 'Neutral', 1: 'Positive'}
df_sentiments['Overall Sentiment'] = (
    df_sentiments['Overall Sentiment'].replace(sentiment_labels)
)

df_sentiments

In [None]:
# Count headlines for every (source, sentiment label) pair; after the count the
# 'Title' column holds the number of headlines in that pair.
label_columns = ['Source', 'Overall Sentiment']
sentiments_grouped = (
    df_sentiments[label_columns + ['Title']]
    .groupby(label_columns)
    .count()
)

# Turn the grouped index back into ordinary columns.
sentiments = sentiments_grouped.reset_index()
sentiments

In [None]:
# Create new dataframe of number of headlines with each overall sentiment by
# source: one row per source, one column per sentiment label.
# pivot_table averages by default; the values here are counts, so sum them, and
# report 0 (not NaN) for a source that has no headlines with a given label.
sentiment_new = pd.pivot_table(
    sentiments,
    index='Source',
    columns='Overall Sentiment',
    values='Title',
    aggfunc='sum',
    fill_value=0,
)
sentiment_new

In [None]:
# Bar graph of the number of headlines in each sentiment category by source
sentiment_new.plot.barh(figsize=(10, 8), title = 'Overall Sentiments of Headlines by Source')
# On a horizontal bar chart the x-axis is the value axis, i.e. headline counts;
# the sentiment categories are the coloured series listed in the legend.
plt.xlabel('Number of Headlines')
plt.ylabel('News Source')
plt.tight_layout()
plt.show()

In [None]:
def process_corpus(titles):
    """Tokenize headlines into lowercase words with stopwords removed.

    Args:
        titles: Iterable of headline strings. Entries that are not strings
            (for example the NaN pandas uses for a missing title) are skipped
            instead of crashing the tokenizer.

    Returns:
        A flat list of lowercase word tokens from all titles, in input order,
        excluding any token found in the module-level ``stop_words``.
    """
    # Build the tokenizer and the stopword lookup once rather than per title;
    # a set makes each membership test constant-time.
    tokenizer = RegexpTokenizer(r'\w+')
    stopword_set = set(stop_words)

    tokens = []
    for title in titles:
        if not isinstance(title, str):
            continue
        # Lowercase each token once, then filter on the lowercased form.
        lowered = (tok.lower() for tok in tokenizer.tokenize(title))
        tokens.extend(tok for tok in lowered if tok not in stopword_set)
    return tokens

In [None]:
# Word frequencies over every headline; keep the 20 most common terms
headlines = df['Title'].tolist()
headlines_tokens = process_corpus(headlines)
headlines_freq = nltk.FreqDist(headlines_tokens)
top_terms_all = headlines_freq.most_common(20)
top_terms_all

In [None]:
# Graph top 5 terms in all headlines
# top_terms_all is a list of (term, count) pairs. Slicing, rather than indexing
# positions 0..4, keeps this cell working when fewer than five terms exist.
top_five_all = top_terms_all[:5]
terms = [term for term, _ in top_five_all]
counts = [count for _, count in top_five_all]

plt.bar(terms, counts)
plt.xlabel('Term')
plt.ylabel('Number of Occurrences')
plt.title('Most Frequent Words in Headlines')
plt.show()


    

In [None]:
# Word frequencies restricted to headlines whose Overall Sentiment is 1
# (positive); keep the 20 most common terms
is_positive = df['Overall Sentiment'] == 1
pos_headlines = df.loc[is_positive, 'Title'].tolist()
pos_headlines_tokens = process_corpus(pos_headlines)
pos_headlines_freq = nltk.FreqDist(pos_headlines_tokens)
top_terms_pos = pos_headlines_freq.most_common(20)
top_terms_pos

In [None]:
# Graph top 5 terms in positive headlines
# top_terms_pos is a list of (term, count) pairs. Slicing, rather than indexing
# positions 0..4, keeps this cell working when fewer than five terms exist.
top_five_pos = top_terms_pos[:5]
terms_pos = [term for term, _ in top_five_pos]
counts_pos = [count for _, count in top_five_pos]

plt.bar(terms_pos, counts_pos, color='y')
plt.xlabel('Term')
plt.ylabel('Number of Occurrences')
plt.title('Most Frequent Words in Positive Headlines')
plt.show()

In [None]:
# Word frequencies restricted to headlines whose Overall Sentiment is -1
# (negative); keep the 20 most common terms
is_negative = df['Overall Sentiment'] == -1
neg_headlines = df.loc[is_negative, 'Title'].tolist()
neg_headlines_tokens = process_corpus(neg_headlines)
neg_headlines_freq = nltk.FreqDist(neg_headlines_tokens)
top_terms_neg = neg_headlines_freq.most_common(20)
top_terms_neg

In [None]:
# Graph top 5 terms in negative headlines
# top_terms_neg is a list of (term, count) pairs. Slicing, rather than indexing
# positions 0..4, keeps this cell working when fewer than five terms exist.
top_five_neg = top_terms_neg[:5]
terms_neg = [term for term, _ in top_five_neg]
counts_neg = [count for _, count in top_five_neg]

plt.bar(terms_neg, counts_neg, color='r')
plt.xlabel('Term')
plt.ylabel('Number of Occurrences')
plt.title('Most Frequent Words in Negative Headlines')
plt.show()