# Exploring inference methods to plan inference pipeline

In [10]:
import hopsworks
from datetime import datetime
import pandas as pd

In [15]:
# Today's date ----- is this needed/helpful anywhere?
# today = pd.to_datetime('2023-12-13').date()
# today = datetime.now().strftime('%Y-%m-%d')
today = datetime.now().date()
print(type(today))
print(today)

<class 'datetime.date'>
2023-12-16


## Get stored news articles

In [12]:
project = hopsworks.login()
fs = project.get_feature_store()

news_fg = fs.get_feature_group(name="news_articles", version=4)
news_df = news_fg.read()
news_df.head()

Connection closed.
Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/187540
Connected. Call `.close()` to terminate connection gracefully.
Finished: Reading data from Hopsworks, using ArrowFlight (1.30s) 


Unnamed: 0,article_id,title,link,description,content,pubdate,source_id,source_priority,country,category,language
0,7ec5d34440998da7790f6d2486f93060,Lyon in wait: Spinner left one short of 500 Te...,https://www.brisbanetimes.com.au/sport/cricket...,The hosts are waiting on a fitness report for ...,Marnus Labuschagne suffered a finger injury an...,2023-12-16,brisbanetimes,189315,[australia],[sports],english
1,e9e929aada59f9c32fb1d46ed7820379,Voter apathy and concerns about violence mark ...,https://halifax.citynews.ca/2023/12/16/voter-a...,BAGHDAD (AP) — Iraqis began voting for the fir...,BAGHDAD (AP) — Iraqis began voting for the fir...,2023-12-16,halifaxtoday,38482437,[canada],[top],english
2,040e7684bacf2f150b86e7a6352e1731,Lyon in wait: Spinner left one short of 500 Te...,https://www.watoday.com.au/sport/cricket/lyon-...,The hosts are waiting on a fitness report for ...,Marnus Labuschagne suffered a finger injury an...,2023-12-16,watoday,347743,[australia],[top],english
3,6e886c9f14ee7d3c29b25810dd7f4a20,Lyon in wait: Spinner left one short of 500 Te...,https://www.theage.com.au/sport/cricket/lyon-i...,The hosts are waiting on a fitness report for ...,Marnus Labuschagne suffered a finger injury an...,2023-12-16,theage,62533,[australia],[sports],english
4,0e901c91ad12d3256272438a8a089a6b,Auckland A-League team: Big names on shortlist...,https://www.nzherald.co.nz/sport/auckland-a-le...,A wide pool of names on a whiteboard has been ...,The new Auckland A-League men’s team will appo...,2023-12-16,nzherald,7971,[new zealand],[sports],english


In [13]:
print("Number of articles:", len(news_df))

Number of articles: 49


In [14]:
news_df['pubdate'].dtype

dtype('O')


## Only keep today's articles

In [16]:
news_df = news_df[news_df['pubdate'] == today]
print("Number of articles:", len(news_df))

Number of articles: 49


## Add sentiments for articles

In [17]:
from transformers import pipeline
sentiment_pipeline = pipeline("sentiment-analysis")

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [18]:
def format_sentiment(sentiment):
    if sentiment['label'] == 'NEGATIVE':
        return sentiment['score'] * -1
    else:
        return sentiment['score']

def get_sentiment_value(news_object, sentiment_pipeline):
    return format_sentiment(sentiment_pipeline(news_object['title'])[0])

In [19]:
news_df['sentiment'] = news_df.apply(get_sentiment_value, sentiment_pipeline=sentiment_pipeline, axis=1)

In [20]:
news_df['sentiment'] 

0    -0.996212
1    -0.918708
2    -0.996212
3    -0.996212
4     0.956115
5    -0.942974
6    -0.974615
7    -0.996346
8     0.993459
9    -0.982671
10   -0.996360
11   -0.997408
12   -0.558961
13   -0.966601
14   -0.996212
15   -0.999385
16   -0.827272
17   -0.976391
18   -0.537694
19   -0.996212
20   -0.967247
21   -0.897315
22    0.942749
23   -0.992619
24   -0.995881
25    0.961417
26   -0.985819
27    0.996294
28   -0.996212
29    0.998722
30   -0.976391
31    0.873144
32   -0.996212
33   -0.995884
34    0.958880
35    0.998954
36   -0.918708
37   -0.918708
38   -0.988610
39   -0.976391
40    0.881980
41    0.997205
42    0.995221
43   -0.953914
44   -0.996850
45    0.991797
46    0.939304
47   -0.999639
48   -0.981042
Name: sentiment, dtype: float64

In [42]:
news_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49 entries, 0 to 48
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   article_id       49 non-null     object 
 1   title            49 non-null     object 
 2   link             49 non-null     object 
 3   description      49 non-null     object 
 4   content          49 non-null     object 
 5   pubdate          49 non-null     object 
 6   source_id        49 non-null     object 
 7   source_priority  49 non-null     int64  
 8   country          49 non-null     object 
 9   category         49 non-null     object 
 10  language         49 non-null     object 
 11  sentiment        49 non-null     float64
dtypes: float64(1), int64(1), object(10)
memory usage: 4.7+ KB


## Calculate today's average sentiment

In [23]:
avg_sentiment = news_df['sentiment'].mean()
avg_sentiment

-0.40221734314548724

## Find today's most positive article

In [40]:
most_positive = news_df.loc[news_df['sentiment'].idxmax()]

In [41]:
most_positive = pd.DataFrame(most_positive).T
most_positive.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1 entries, 35 to 35
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   article_id       1 non-null      object
 1   title            1 non-null      object
 2   link             1 non-null      object
 3   description      1 non-null      object
 4   content          1 non-null      object
 5   pubdate          1 non-null      object
 6   source_id        1 non-null      object
 7   source_priority  1 non-null      object
 8   country          1 non-null      object
 9   category         1 non-null      object
 10  language         1 non-null      object
 11  sentiment        1 non-null      object
dtypes: object(12)
memory usage: 104.0+ bytes


In [58]:
most_positive.pubdate

'2023-12-16'