# News for the demo

In [456]:
import csv
import os
from datetime import datetime

import numpy as np
import pandas as pd
import requests

# NewsCatcher API settings

In [457]:
# Read the key from the NEWSCATCHER_API_KEY environment variable so the secret
# never has to be saved inside the notebook. Falls back to "" (the previous
# hard-coded value) when the variable is not set.
NEWS_API_KEY = os.environ.get("NEWSCATCHER_API_KEY", "")
# Search endpoint used by fetch_news_articles_w_theme.
API_URL = "https://v3-api.newscatcherapi.com/api/search"

# 1- The fetch function: fetch news from NewsCatcher API

In [458]:

# Fetch data from NewsCatcher API
def fetch_news_articles_w_theme(params):
    """Query the NewsCatcher search endpoint and return the matching articles.

    Args:
        params: Query-string parameters, passed unchanged to ``requests.get``.

    Returns:
        The value stored under the ``'articles'`` key of the JSON response.
        An empty list is returned (after printing a message) when the request
        fails, the status code is not 200, the body is not valid JSON, or the
        ``'articles'`` key is missing or empty.
    """
    headers = {'x-api-token': NEWS_API_KEY}

    try:
        # Without a timeout requests waits forever on an unresponsive server.
        response = requests.get(API_URL, headers=headers, params=params, timeout=30)
    except requests.RequestException as exc:
        # Same best-effort contract as the non-200 branch below.
        print(f"Error fetching data: {exc}")
        return []

    if response.status_code != 200:
        print(f"Error fetching data: {response.status_code}")
        return []

    try:
        payload = response.json()
    except ValueError as exc:
        print(f"Error fetching data: {exc}")
        return []

    # `or []` also covers an explicit null under 'articles'.
    return payload.get('articles') or []


# 2- Create csv file to save the news

In [459]:
def create_articles_to_csv(filename):
    """Create (or overwrite) `filename` as a UTF-8 CSV holding only the header row.

    The style, manipulated-title and declarative boring-title columns are not
    part of this header; they are meant to be added later.
    """
    column_names = [
        'aid',
        'original_title',
        'image_url',
        'published_date',
        'article_url',
        'category',
        'text',
    ]
    # mode='w' truncates any existing file with the same name.
    with open(filename, mode='w', newline='', encoding='utf-8') as csv_file:
        csv.writer(csv_file).writerow(column_names)

# 3- A function to write the fetched news into csv file

In [460]:
# Running article id shared by every write_articles_to_csv call, which
# increments it once per article. It is only reset when this cell is re-run,
# so ids keep growing across successive fetch cells.
global_aid = 0

In [461]:
# Write the articles to a CSV file
def write_articles_to_csv(articles, filename, mytheme):
    """Append one CSV row per article to `filename`.

    Rows are numbered with the module-level ``global_aid`` counter, which is
    incremented once for every article written.

    Args:
        articles: Iterable of article dicts. The keys read are 'title',
            'media', 'published_date', 'link', 'content' and 'nlp'; any
            missing field is written as 'N/A'.
        filename: Path of an existing CSV file; rows are appended to it.
        mytheme: Category used when the article has no usable ``nlp['theme']``.
    """
    global global_aid

    num_articles = 0

    # Append as UTF-8 so the rows match the encoding create_articles_to_csv
    # used for the header, whatever the platform default encoding is.
    with open(filename, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)

        for article in articles:
            global_aid += 1
            num_articles += 1

            original_title = article.get('title', 'N/A')
            media = article.get('media', 'N/A')
            published_date = article.get('published_date', 'N/A')
            article_url = article.get('link', 'N/A')
            text = article.get('content', 'N/A')

            # 'nlp' can be absent or null; only a dict can carry a theme.
            nlp = article.get('nlp')
            theme = nlp.get('theme') if isinstance(nlp, dict) else None
            # A missing or empty theme falls back to the caller-supplied one.
            category = theme or mytheme

            writer.writerow([global_aid, original_title, media, published_date, article_url, category, text])

    print(f"{num_articles} articles have been written to {filename}")
    print ("...")


# 4(a) Call the fetch function (using 'theme')

In [None]:
# Timestamped output file for this run; only the header row is written here.
# NOTE(review): the timestamp contains a space and colons, which Windows does
# not allow in file names - confirm the target platform.
filename = '../data/articles_'+datetime.now().strftime("%Y-%m-%d %H:%M:%S")+'.csv'
create_articles_to_csv(filename)

# Use the API's NLP 'theme' filter: one request per theme.
theme = ['Economics', 'Health', 'Politics', 'Science', 'Sports', 'Tech', 'Crime', 'Weather']

for t in theme:

    print ("Category = ", t)

    params = {
        'q': '*',
        'lang': 'no',
        'countries': 'NO',
        'sort_by': 'date',
        'page': 1,          # only the first page (up to page_size articles) is requested
        'page_size': 100,
        'include_nlp_data': True,
        'word_count_min': 800,
        'from_': '2024/11/06',
        'to_': '2024/11/08',
        'theme': t,
    }

    articles = fetch_news_articles_w_theme(params)

    if articles:
        # Pass the requested theme as the fallback category, so an article
        # without its own nlp theme is labelled with `t` rather than an empty
        # string (mirrors how the query-based cell passes its loop value).
        write_articles_to_csv(articles, filename, t)
    else:
        print("No articles found")
        

# 4(b) Call the fetch function (WITHOUT 'theme')

In [483]:
# Timestamped output file for this run; only the header row exists at this point.
filename = f'../data/articles_{datetime.now():%Y-%m-%d %H:%M:%S}.csv'
create_articles_to_csv(filename)

# No 'theme' filter in this variant: each Norwegian category keyword is sent
# as the search query itself.
# Narrower alternative that was tried: query = ['Politikk AND Venstre']
query = ['Økonomi', 'Helse', 'Politikk', 'Vitenskap', 'Sport', 'Teknikk', 'Kriminalitet', 'Vær']

for q in query:
    print ("Category = ", q)

    # Optional filters currently disabled:
    #   'source': 'www.aftenposten.no'
    #   'is_headline': True
    params = dict(
        q=q,
        lang='no',
        countries='NO',
        sort_by='date',
        page=1,
        page_size=100,
        include_nlp_data=True,
        word_count_min=800,
        from_='2024/11/11',
        to_='2024/11/13',
    )

    articles = fetch_news_articles_w_theme(params)

    if not articles:
        print("No articles found")
        continue

    # The query keyword doubles as the fallback category label.
    write_articles_to_csv(articles, filename , q)

Category =  Politikk AND Venstre
7 articles have been written to ../data/articles_2024-11-13 21:09:45.csv
...


# See the fetched news

In [484]:
# Load the CSV written by whichever fetch cell ran last (it set `filename`);
# the first column ('aid') becomes the index.
df=pd.read_csv(filename,index_col=0)
# Show (rows, columns) of the loaded frame.
df.shape

(7, 6)

In [485]:
print (filename)

../data/articles_2024-11-13 21:09:45.csv


In [486]:
df.head()

Unnamed: 0_level_0,original_title,image_url,published_date,article_url,category,text
aid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
243,– Tror ikke Norge kommer til å innfri under kl...,https://www.dagsavisen.no/resizer/v2/66DGGJFF5...,2024-11-12 18:17:54,https://www.dagsavisen.no/nyheter/innenriks/20...,Politikk AND Venstre,– Det gir et uheldig signal når Jonas Gahr Stø...
244,Milliardpress mot Støre for økt Ukraina-hjelp,https://akamai.vgc.no/v2/images/fafed385-0260-...,2024-11-12 14:46:32,https://www.vg.no/nyheter/i/QM8B7Q/milliardpre...,Politics,Jens Stoltenbergs tidligere militære topprådgi...
245,Islam arbeider intenst for å overta svenske sk...,https://www.rights.no/wp-content/uploads/2020/...,2024-11-12 12:07:15,https://www.rights.no/2024/11/islam-arbeider-i...,Politikk AND Venstre,«Vi er maktesløse. Og ofte tenker jeg: De har ...
246,Ny sterk høyrebølge med Ap ved roret,https://g.acdn.no/obscura/API/dynamic/r1/ece5/...,2024-11-11 19:37:30,https://www.ostlendingen.no/ny-sterk-hoyrebolg...,Politikk AND Venstre,"Kommentar Dette er en kommentar, skrevet av en..."
247,Innvandring og bærekraft,https://www.rights.no/wp-content/uploads/2024/...,2024-11-11 13:14:33,https://www.rights.no/2024/11/innvandring-og-b...,Politikk AND Venstre,"Alle vet hva FrP vil i innvandringspolitikken,..."


# Clean the data. Remove duplicates

In [487]:
# Start the cleaned frame from `df`: drop rows whose title repeats an earlier
# row (pandas keeps the first occurrence by default).
df_clean = df.drop_duplicates(subset=['original_title'])

In [488]:
df_clean.shape

(7, 6)

In [489]:
# Additionally drop rows that reuse the image URL of an earlier row.
df_clean = df_clean.drop_duplicates(subset=['image_url'])

In [490]:
df_clean.shape

(7, 6)

In [491]:
# Finally drop rows that point at the same article URL as an earlier row.
df_clean = df_clean.drop_duplicates(subset=['article_url'])

In [492]:
df_clean.shape

(7, 6)

In [494]:
df_clean['category'].value_counts()

Politikk AND Venstre    5
Politics                2
Name: category, dtype: int64

# Check the categories of the news

# Select 3 random articles from each of 5 categories. OR 5 random articles from each of 3 categories (NOTE: change the parameter)

In [497]:
# Draw 3 articles per selected category from the de-duplicated frame
# (`df_clean`), so the duplicate removal done above actually takes effect.
# random_state fixes the draw. NOTE: groupby(...).sample raises ValueError if
# any selected category has fewer than 3 rows.
# .copy() gives an independent frame, so adding the 'style' column later does
# not touch (or warn about) the frame it was taken from.
sampled_df = (
    df_clean[df_clean['category'].isin(['Økonomi', 'Helse', 'Politikk', 'Sport', 'Politics'])]
    .groupby('category')
    .sample(n=3, random_state=1)
    .copy()
)
# Alternatives that were tried:
# sampled_df = df_clean[df_clean['category'].isin(['Business', 'Vær'])].groupby('category').sample(n=3, random_state=1)
# sampled_df = df_clean[(df_clean['category'] == 'Politikk AND Venstre')].copy()

In [498]:
sampled_df['category'].value_counts()

Politikk AND Venstre    5
Name: category, dtype: int64

In [499]:
sampled_df.head(15)

Unnamed: 0_level_0,original_title,image_url,published_date,article_url,category,text
aid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
243,– Tror ikke Norge kommer til å innfri under kl...,https://www.dagsavisen.no/resizer/v2/66DGGJFF5...,2024-11-12 18:17:54,https://www.dagsavisen.no/nyheter/innenriks/20...,Politikk AND Venstre,– Det gir et uheldig signal når Jonas Gahr Stø...
245,Islam arbeider intenst for å overta svenske sk...,https://www.rights.no/wp-content/uploads/2020/...,2024-11-12 12:07:15,https://www.rights.no/2024/11/islam-arbeider-i...,Politikk AND Venstre,«Vi er maktesløse. Og ofte tenker jeg: De har ...
246,Ny sterk høyrebølge med Ap ved roret,https://g.acdn.no/obscura/API/dynamic/r1/ece5/...,2024-11-11 19:37:30,https://www.ostlendingen.no/ny-sterk-hoyrebolg...,Politikk AND Venstre,"Kommentar Dette er en kommentar, skrevet av en..."
247,Innvandring og bærekraft,https://www.rights.no/wp-content/uploads/2024/...,2024-11-11 13:14:33,https://www.rights.no/2024/11/innvandring-og-b...,Politikk AND Venstre,"Alle vet hva FrP vil i innvandringspolitikken,..."
248,Fornorskingspolitikken og nåtidas briller,https://g.acdn.no/obscura/API/dynamic/r1/ece5/...,2024-11-11 11:19:03,https://www.nordnorskdebatt.no/fornorskingspol...,Politikk AND Venstre,"Kronikk Dette er en kronikk, skrevet av en eks..."


In [500]:
# styles = ['Forward reference', 'Forward reference', 'Forward reference', 'Forward reference', 'Forward reference']
# sampled_df['style'] = styles

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sampled_df['style'] = styles


# Assign styles to the articles

## We need to assign each style to the articles in such a way that:

### Each category has a unique style assigned to it.
### Each style is assigned an equal number of times across the articles.

# UPDATE: This works if we have only three articles in each category.

# The next cell works if I have only 3 categories with 5 articles in each category. So, use the later cell.


In [476]:
styles = ['Metaphor', 'Forward reference', 'Question']
categories = sampled_df['category'].unique()

# Seeded generator so the style assignment is reproducible on a fresh run
# (same seed value as the random_state used when sampling the articles).
rng = np.random.default_rng(1)

# Start with an object column: the values assigned below are strings, and a
# float NaN column would not match their dtype.
sampled_df['style'] = None

# Give every article in a category a different style.
for category in categories:
    # Index labels of the articles in this category
    category_indices = sampled_df[sampled_df['category'] == category].index

    # Drawing without replacement cannot cover more articles than styles;
    # fail with an explicit message instead of numpy's generic one.
    if len(category_indices) > len(styles):
        raise ValueError(
            f"Category {category!r} has {len(category_indices)} articles but only "
            f"{len(styles)} styles are available for unique assignment"
        )

    # Random subset of styles, one per article, no repeats within the category
    chosen_styles = rng.choice(styles, size=len(category_indices), replace=False)
    sampled_df.loc[category_indices, 'style'] = chosen_styles

# Every style must have been used the same number of times overall.
assert sampled_df['style'].value_counts().nunique() == 1, "Styles are not equally distributed"


In [379]:
# import numpy as np

# # Define styles and calculate how many times we need to repeat them
# styles = ['Metaphor', 'Forward reference', 'Question']
# num_styles_needed = len(sampled_df)

# # Repeat the styles to cover all rows in sampled_df
# repeated_styles = np.tile(styles, num_styles_needed // len(styles) + 1)[:num_styles_needed]

# # Shuffle the repeated styles for randomness
# np.random.shuffle(repeated_styles)

# # Assign the shuffled styles to the 'style' column
# sampled_df['style'] = repeated_styles

# # Verify that the styles are equally distributed
# print(sampled_df['style'].value_counts())


Forward reference    4
Question             4
Metaphor             4
Name: style, dtype: int64


In [501]:
sampled_df.head(15)

Unnamed: 0_level_0,original_title,image_url,published_date,article_url,category,text,style
aid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
243,– Tror ikke Norge kommer til å innfri under kl...,https://www.dagsavisen.no/resizer/v2/66DGGJFF5...,2024-11-12 18:17:54,https://www.dagsavisen.no/nyheter/innenriks/20...,Politikk AND Venstre,– Det gir et uheldig signal når Jonas Gahr Stø...,Forward reference
245,Islam arbeider intenst for å overta svenske sk...,https://www.rights.no/wp-content/uploads/2020/...,2024-11-12 12:07:15,https://www.rights.no/2024/11/islam-arbeider-i...,Politikk AND Venstre,«Vi er maktesløse. Og ofte tenker jeg: De har ...,Forward reference
246,Ny sterk høyrebølge med Ap ved roret,https://g.acdn.no/obscura/API/dynamic/r1/ece5/...,2024-11-11 19:37:30,https://www.ostlendingen.no/ny-sterk-hoyrebolg...,Politikk AND Venstre,"Kommentar Dette er en kommentar, skrevet av en...",Forward reference
247,Innvandring og bærekraft,https://www.rights.no/wp-content/uploads/2024/...,2024-11-11 13:14:33,https://www.rights.no/2024/11/innvandring-og-b...,Politikk AND Venstre,"Alle vet hva FrP vil i innvandringspolitikken,...",Forward reference
248,Fornorskingspolitikken og nåtidas briller,https://g.acdn.no/obscura/API/dynamic/r1/ece5/...,2024-11-11 11:19:03,https://www.nordnorskdebatt.no/fornorskingspol...,Politikk AND Venstre,"Kronikk Dette er en kronikk, skrevet av en eks...",Forward reference


# Write the final set of news into a csv file

In [502]:
# Boolean Series, indexed like sampled_df, marking rows that repeat an earlier
# row in every column (pandas default for duplicated()).
duplicates = sampled_df.duplicated()

In [503]:
duplicates

aid
243    False
245    False
246    False
247    False
248    False
dtype: bool

In [504]:
# Save sampled_df as the final demo dataset; the index is written as the
# first column (pandas default).
sampled_df.to_csv('../data/final_demo.csv')