# Exploring inference methods to plan inference pipeline

In [25]:
import hopsworks
from datetime import datetime
import pandas as pd

In [26]:
# Today's date ----- is this needed/helpful anywhere?
# today = pd.to_datetime('2023-12-13').date()
today = datetime.now().strftime('%Y-%m-%d')
#today = datetime.now().date()
print(type(today))
print(today)

<class 'str'>
2023-12-17


## Get stored news articles

In [27]:
project = hopsworks.login()
fs = project.get_feature_store()

news_fg = fs.get_feature_group(name="news_articles", version=5)
news_df = news_fg.read()
news_df.head()

Connection closed.
Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/187540
Connected. Call `.close()` to terminate connection gracefully.
Finished: Reading data from Hopsworks, using Hive (2.15s) from Hopsworks, using Hive.   


Unnamed: 0,article_id,title,link,description,content,pubdate,source_id,country,category,language
0,205ca5bd4914c3b038f85d3f784cea31,Miss France winner says her short hair a victo...,https://www.digitaljournal.com/?p=3699990,The woman elected Miss France 2024 has framed ...,The woman elected Miss France 2024 has framed ...,2023-12-17,digitaljournal,"[""canada""]","[""top""]",english
1,8e22d120e7216d8487bebb6ac47a9ac9,Erik ten Hag addresses Bruno Fernandes replace...,https://www.manchestereveningnews.co.uk/sport/...,Man United head to Anfield this afternoon but ...,Erik ten Hag has remained tight-lipped on how ...,2023-12-17,manchestereveningnews,"[""united kingdom""]","[""sports""]",english
2,786d5b9ae44de25430c98ecc78c21cd7,Dana White Receives Applauds for Banning N3on ...,https://thesportsrush.com/ufc-news-dana-white-...,The UFC CEO Dana White’s long-running relation...,The UFC CEO Dana White’s long-running relation...,2023-12-17,thesportsrush,"[""united kingdom"",""united states of america"",""...","[""sports""]",english
3,1bf9c9f093d6882cc76ee1054e5d2473,Nathan Lyon claims 500th Test wicket as Austra...,https://www.theguardian.com/sport/2023/dec/17/...,Spinner is third bowler to take 500 scalps in ...,Nathan Lyon has snared his 500th Test wicket a...,2023-12-17,theguardian,"[""united kingdom"",""singapore"",""canada"",""india""]","[""sports""]",english
4,4b3c4de1bbfcaf8279acc1d2bcc23fac,Prince George and Princess Charlotte’s ‘riotou...,https://www.goodto.com/entertainment/royal-new...,Who wouldn't enjoy 'silly presents' and 'fancy...,Christmas is full of traditions for families a...,2023-12-17,goodto,"[""united kingdom""]","[""top""]",english


In [28]:
print("Number of articles:", len(news_df))

Number of articles: 45


In [29]:
news_df['pubdate'].dtype

dtype('O')


## Only keep today's articles

In [30]:
news_df = news_df[news_df['pubdate'] == today]
print("Number of articles:", len(news_df))

Number of articles: 45


## Add sentiments for articles

In [31]:
from transformers import pipeline
sentiment_pipeline = pipeline("sentiment-analysis")

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [32]:
def format_sentiment(sentiment):
    if sentiment['label'] == 'NEGATIVE':
        return sentiment['score'] * -1
    else:
        return sentiment['score']

def get_sentiment_value(news_object, sentiment_pipeline):
    return format_sentiment(sentiment_pipeline(news_object['title'])[0])

In [33]:
news_df['sentiment'] = news_df.apply(get_sentiment_value, sentiment_pipeline=sentiment_pipeline, axis=1)

In [34]:
news_df['sentiment'] 

0     0.998375
1    -0.639338
2    -0.630210
3     0.996409
4     0.999715
5     0.997273
6    -0.904386
7     0.997980
8     0.651037
9     0.985335
10   -0.932873
11   -0.904386
12   -0.966647
13    0.999764
14   -0.991861
15    0.954942
16   -0.993441
17    0.985232
18   -0.977052
19   -0.981800
20   -0.999704
21    0.996932
22   -0.989260
23    0.998353
24   -0.999704
25   -0.992547
26    0.999773
27   -0.986446
28   -0.990693
29    0.999021
30    0.961233
31   -0.997775
32    0.975764
33    0.980984
34   -0.997574
35    0.997870
36    0.999442
37    0.996006
38    0.954942
39   -0.998881
40    0.999544
41   -0.997574
42    0.996006
43    0.999218
44    0.996006
Name: sentiment, dtype: float64

In [35]:
news_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45 entries, 0 to 44
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   article_id   45 non-null     object 
 1   title        45 non-null     object 
 2   link         45 non-null     object 
 3   description  45 non-null     object 
 4   content      45 non-null     object 
 5   pubdate      45 non-null     object 
 6   source_id    45 non-null     object 
 7   country      45 non-null     object 
 8   category     45 non-null     object 
 9   language     45 non-null     object 
 10  sentiment    45 non-null     float64
dtypes: float64(1), object(10)
memory usage: 4.0+ KB


## Calculate today's average sentiment

In [36]:
avg_sentiment = news_df['sentiment'].mean()
avg_sentiment

0.12322235239876642

## Find today's most positive article

In [37]:
most_positive = news_df.loc[news_df['sentiment'].idxmax()]

In [38]:
most_positive = pd.DataFrame(most_positive).T
most_positive.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1 entries, 26 to 26
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   article_id   1 non-null      object
 1   title        1 non-null      object
 2   link         1 non-null      object
 3   description  1 non-null      object
 4   content      1 non-null      object
 5   pubdate      1 non-null      object
 6   source_id    1 non-null      object
 7   country      1 non-null      object
 8   category     1 non-null      object
 9   language     1 non-null      object
 10  sentiment    1 non-null      object
dtypes: object(11)
memory usage: 96.0+ bytes


In [39]:
most_positive.pubdate

26    2023-12-17
Name: pubdate, dtype: object

In [40]:
most_positive.iloc[0]['title']

"'Wonderful' - Danny Murphy wowed by one Everton player on MOTD after win v Burnley"