# News Aggregator


Named Entity Recognition (NER): SpaCy (pre-trained)

Purpose: Extract entities such as people, organizations, locations from the articles.

Use Case: Create a database of key entities mentioned in the news.

## Imports

In [221]:
import json
import requests
import xmltodict
import bs4
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import spacy
from spacy import displacy
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from newspaper import Article

## Data Acquisition

** public data APIs (XML format) & web data scraping

In [2]:
#######content in video#########
# {
#     "source": "CNN",
#     "section": "Video",
#     "link": "http://rss.cnn.com/rss/cnn_freevideo.rss"
# },
# {
#     "source": "CNN",
#     "section": "Top 10 (video)",
#     "link": "http://rss.cnn.com/services/podcasting/cnn10/rss.xml"
# },

In [2]:
def read_rss():
    f = open('./RSS_Feeds.json')
    data = json.load(f)
    #with open('tmp.json','w') as f:
    # out = json.dumps(data, indent=4)
    #f.close()
    return data

In [3]:
def get_rss(rss_url):
    r = requests.get(f'{rss_url}') 
    if ".json" in rss_url:
        lod = json.loads(r.text)
    if ".xml" in rss_url:
        lod = xmltodict.parse(r.text)
    if ".rss" in rss_url:
        lod = xmltodict.parse(r.text)     
    if ".cbsnews" in rss_url:
        lod = xmltodict.parse(r.text)   
    return lod



In [4]:
def get_rss_feeds():
    feed_list=[]
    RSS = read_rss()
    for i , source in enumerate(RSS):
        rss_item = get_rss(source['link'])
        for item in rss_item['rss']['channel']['item']:
            new_record = {
                'source': source['source'],
                'section': source['section'],
                'source_link': source['link'],
                'title':item['title'] if ('title' in item) else None,
                'description': item['description'] if ('description' in item) else None,
                'link':  item['link'] if ('link' in item) else None,
                'pubDate': item['pubDate'] if ('pubDate' in item) else None,
                'creator': item['dc:creator'] if ('dc:creator' in item) else None,
                'content': item['content:encoded'] if ('content:encoded' in item) else None
            }
            feed_list.append(new_record)
    return feed_list


In [5]:
# df = pd.DataFrame(columns=['source', 'section', 'source_link','title','description','link','pubDate','artical'])
data = get_rss_feeds()

data_df = pd.DataFrame(data)


In [6]:
# Save the DataFrame to a CSV file
data_df.to_csv('News_Feed.csv', index=False)

## Data Exploration and Data Cleaning

In [7]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2962 entries, 0 to 2961
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   source       2962 non-null   object
 1   section      2962 non-null   object
 2   source_link  2962 non-null   object
 3   title        2923 non-null   object
 4   description  2884 non-null   object
 5   link         2962 non-null   object
 6   pubDate      2845 non-null   object
 7   creator      1443 non-null   object
 8   content      25 non-null     object
dtypes: object(9)
memory usage: 208.4+ KB


In [8]:
data_df['source'] = data_df['source'].fillna('').astype(str)
data_df['section'] = data_df['section'].fillna('').astype(str)
data_df['source_link'] = data_df['source_link'].fillna('').astype(str)
data_df['title'] = data_df['title'].fillna('').astype(str)
data_df['description'] = data_df['description'].fillna('').astype(str)
data_df['link'] = data_df['link'].fillna('').astype(str)
data_df['creator'] = data_df['creator'].fillna('').astype(str)
data_df['content'] = data_df['content'].fillna('').astype(str)

In [9]:
data_df.describe()

Unnamed: 0,source,section,source_link,title,description,link,pubDate,creator,content
count,2962,2962,2962,2962.0,2962.0,2962,2845,2962.0,2962.0
unique,5,69,110,2223.0,2130.0,2275,2152,643.0,26.0
top,NYTimes,U.S.,http://rss.cnn.com/rss/cnn_topstories.rss,,,https://www.nytimes.com/2024/09/05/climate/bat...,"Tue, 10 Sep 2024 20:42:15 +0000",,
freq,1472,188,69,39.0,78.0,7,9,1519.0,2937.0


In [10]:
data_df.shape

(2962, 9)

In [11]:
from requests.exceptions import HTTPError, ConnectionError, Timeout, RequestException

def read_Feed(Feed_url):
    r = requests.get(f'{Feed_url}') 
    soup = bs4.BeautifulSoup(r.content)
    # soup = bs4.BeautifulSoup(r.content,'html.parser')
    result=''
    for i,p in enumerate(soup.find_all("p")):
        try:
            result += p.text + " "
        except Exception:
            pass
    return result

def read_Feed_FOX(Content_string):
    soup = bs4.BeautifulSoup(Content_string, 'html.parser')

    # Extract all text in a single string with spaces between elements
    all_text = soup.get_text(separator=' ', strip=True)
    return all_text

def read_Feed_CNN(Feed_url):
    try:
        response = requests.get(f'{Feed_url}') 

        if response.status_code != 200:
            return None
    
        soup = bs4.BeautifulSoup(response.content, 'html.parser')
        article_content = soup.find('div', class_=['article__content','storytext'])

        if article_content is None:
            return None
    
        paragraphs_and_subheaders = article_content.find_all(['p', 'h2'], class_=['paragraph', 'subheader','speakable'])
        all_text = (' ').join((element.get_text(strip=True)) for element in paragraphs_and_subheaders)
        return all_text
    
    except HTTPError as http_err:
        # Handle HTTP errors (e.g., 404, 500)
        return None
    except ConnectionError as conn_err:
        # Handle connection errors (e.g., network issues)
        return None
    except Timeout as timeout_err:
        # Handle timeout errors (e.g., request took too long)
        return None
    except RequestException as req_err:
        # Handle any other request-related errors
        return None
    except Exception as e:
        # Handle any other unexpected errors
        return None

def read_Feed_NYTimes(Feed_url):
    try:
        article = Article(Feed_url)
        article.download()
        article.parse()
        return article.text
    
    except HTTPError as http_err:
        # Handle HTTP errors (e.g., 404, 500)
        return None
    except ConnectionError as conn_err:
        # Handle connection errors (e.g., network issues)
        return None
    except Timeout as timeout_err:
        # Handle timeout errors (e.g., request took too long)
        return None
    except RequestException as req_err:
        # Handle any other request-related errors
        return None
    except Exception as e:
        # Handle any other unexpected errors
        return None
    
def read_Feed_CNBC(Feed_url):
    article = Article(Feed_url)
    article.download()
    article.parse()
    return article.text

def read_Feed_CBS(Feed_url):
    article = Article(Feed_url)
    article.download()
    article.parse()
    return article.text

In [15]:
FOXdata =data_df[data_df['source'] == 'FOX News']
FOXdata['text'] = FOXdata['content'].apply(read_Feed_FOX)

CNNdata =data_df[data_df['source'] == 'CNN']
CNNdata['text'] = CNNdata['link'].apply(read_Feed_CNN)

NYTdata =data_df[data_df['source'] == 'NYTimes']
NYTdata['text'] = NYTdata['link'].apply(read_Feed_NYTimes)

CNBCdata =data_df[data_df['source'] == 'CNBC']
CNBCdata['text'] = CNBCdata['link'].apply(read_Feed_CNBC)

CBSdata =data_df[data_df['source'] == 'CBS']
CBSdata['text'] = CBSdata['link'].apply(read_Feed_CBS)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  CNBCdata['text'] = CNBCdata['link'].apply(read_Feed_CNBC)


In [16]:
# Replace empty strings with NaN in the text column
FOXdata['text'].replace('', np.nan, inplace=True)
CNNdata['text'].replace('', np.nan, inplace=True)
NYTdata['text'].replace('', np.nan, inplace=True)
CNBCdata['text'].replace('', np.nan, inplace=True)
CBSdata['text'].replace('', np.nan, inplace=True)

# Drop rows where the 'text' column has NaN values
FOXdata.dropna(subset=['text'], inplace=True)
CNNdata.dropna(subset=['text'], inplace=True)
NYTdata.dropna(subset=['text'], inplace=True)
CNBCdata.dropna(subset=['text'], inplace=True)
CBSdata.dropna(subset=['text'], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  FOXdata['text'].replace('', np.nan, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  FOXdata['text'].replace('', np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(v

In [17]:
merged_data = pd.concat([FOXdata, CNNdata, NYTdata, CNBCdata], ignore_index=True)

In [18]:
# Save to a CSV file
merged_data.to_csv('News_Feed_withContent.csv', index=False)

## Data Preprocessing and Feature Extraction

- Tokenize the article content into words.
- Label the tokens with corresponding entity types (for training data).
- Prepare the dataset in a suitable format for the model.

In [19]:
# Preprocess the articles using Spacy for tokenization and entity extraction
nlp = spacy.load('en_core_web_sm')  # Load pre-trained Spacy model

In [20]:
from typing import Counter

doc =nlp(FOXdata['text'][0])
len(doc.ents)
labels = [x.label_ for x in doc.ents]
Counter(labels)

items = [e.text for e in doc.ents if e.label_ not in ('DATE', 'TIME', 'ORDINAL', 'CARDINAL','LANGUAGE','LAW','PERCENT')]
Counter(items).most_common(5)

[('Trump', 6),
 ('Harris', 5),
 ('Trump-Harris', 4),
 ('ABC', 3),
 ('Pennsylvania', 2)]

In [21]:
displacy.render(nlp(str(FOXdata['text'][0])), jupyter=True, style='ent')

In [22]:
def tokenize_text(text):
    doc = nlp(text)
    # return [(token.text, token.ent_type_ if token.ent_type_ else 'O') for token in doc] # includes all the entities
    # return [(token.text, token.ent_type_) for token in doc if token.ent_type_]
    # entities=[(e.text, e.label_) for e in doc.ents if e.label_ not in ('DATE', 'TIME', 'ORDINAL', 'CARDINAL','LANGUAGE','LAW','PERCENT')]
    entities=[e.text for e in doc.ents if e.label_ not in ('DATE', 'TIME', 'ORDINAL', 'CARDINAL','LANGUAGE','LAW','PERCENT')]
    # Remove duplicates while keeping the first occurrence
    return list(dict.fromkeys(entities))


In [23]:
merged_data['tokens'] = merged_data['text'].apply(tokenize_text)

## Model Construction

In [208]:
X_Tokens = [' '.join(tokens) for tokens in merged_data['tokens']]
y_Section = merged_data['section']

In [210]:
X_train, X_test, y_train, y_test= train_test_split(X_Tokens,y_Section, test_size=0.2, random_state=42)

In [212]:
# vectorizer = CountVectorizer()
# X_Tokens = vectorizer.fit_transform(X_Tokens)

# vectorizer = HashingVectorizer()
# X_Tokens = vectorizer.fit_transform(X_Tokens)

In [223]:
pipeline = Pipeline([
    # ('vectorizer', HashingVectorizer()), 
    ("vectorizer",CountVectorizer()),
    # ('classifier', DecisionTreeClassifier())   
    # ('classifier', RandomForestClassifier())   
    ('classifier', SGDClassifier())
])

# # Fit the model
# pipeline.fit(X_train, y_train)


param_grid = {
    'vectorizer__ngram_range': [(1, 1), (1, 3)],
    # 'classifier__n_estimators': [50, 100, 150, 200],  # Number of trees in the forest
    # 'classifier__max_depth': [10, 20, None],  # Maximum depth of the tree
    # 'classifier__max_features': ['auto', 'sqrt', 'log2'],
    'classifier__loss': ['hinge', 'log'],  # Hinge for SVM, log for logistic regression
    'classifier__penalty': ['l2', 'l1', 'elasticnet'],  # Regularization types
    'classifier__alpha': [1e-4, 1e-3, 1e-2],  # Regularization strength
    'classifier__max_iter': [1000,2000],  # Epochs (number of iterations)
    'classifier__learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'],  # Learning rate schedule
    'classifier__eta0': [0.01, 0.1, 0.25, 0.5]  # Initial learning rate
}


In [224]:
# GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(pipeline, param_grid, cv=5, verbose=1, n_jobs=-1)


# Fit model
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 576 candidates, totalling 2880 fits


  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (


In [225]:
# Predict using the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

## Model Evaluation

In [226]:
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score:", grid_search.best_score_)

Best Parameters: {'classifier__alpha': 0.001, 'classifier__eta0': 0.5, 'classifier__learning_rate': 'optimal', 'classifier__loss': 'hinge', 'classifier__max_iter': 1000, 'classifier__penalty': 'l2', 'vectorizer__ngram_range': (1, 1)}
Best Cross-Validation Score: 0.24760849935097914


In [227]:
accuracy = accuracy_score(y_test, y_pred)
print(f'Test Accuracy: {accuracy}')

Test Accuracy: 0.2505307855626327


In [228]:
print("Classification Report:\n", classification_report(y_test, y_pred))

Classification Report:
                       precision    recall  f1-score   support

              Africa       1.00      0.33      0.50         3
            Americas       0.00      0.00      0.00         5
        Art & Design       0.00      0.00      0.00         3
                Arts       0.00      0.00      0.00        12
                Asia       0.00      0.00      0.00         8
        Asia Pacific       0.25      0.50      0.33         2
               Autos       0.40      0.50      0.44        12
            Baseball       0.40      0.40      0.40         5
         Book Review       0.00      0.00      0.00         6
            Business       0.00      0.00      0.00        13
  College Basketball       0.33      0.33      0.33         3
    College Football       0.33      0.25      0.29         4
          Commentary       0.33      0.60      0.43         5
               Dance       0.33      0.50      0.40         2
            DealBook       0.00      0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [229]:
# Precision, Recall, F1-Score (for multiclass problems, average='macro' 'weighted')
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')

Precision: 0.24204071881104477
Recall: 0.2505307855626327
F1 Score: 0.2351421220773038


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [230]:
# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print(f'Confusion Matrix:\n{conf_matrix}')

Confusion Matrix:
[[1 0 0 ... 0 1 0]
 [0 0 0 ... 0 1 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
