# Exploring inference methods to plan inference pipeline

In [1]:
import hopsworks
from datetime import datetime
import pandas as pd

In [11]:
# Today's date ----- is this needed/helpful anywhere?
# today = pd.to_datetime('2023-12-13').date()
today = datetime.now().strftime('%Y-%m-%d')
#today = datetime.now().date()
print(type(today))
print(today)

<class 'str'>
2023-12-17


## Get stored news articles

In [12]:
project = hopsworks.login()
fs = project.get_feature_store()

news_fg = fs.get_feature_group(name="news_articles", version=5)
news_df = news_fg.read()
news_df.head()

Connection closed.
Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/187540
Connected. Call `.close()` to terminate connection gracefully.
Finished: Reading data from Hopsworks, using Hive (2.43s) from Hopsworks, using Hive.   


Unnamed: 0,article_id,title,link,description,content,pubdate,source_id,country,category,language
0,5f2249b77fe0b8d3d0486f9338b75166,Best SAD therapy lamps in 2023 (UK),https://news.knowledia.com/US/en/articles/best...,Beat the blues. - mashable.com,"The worst part of the winter isn't the rain, s...",2023-12-16,knowledia,"[""united states of america""]","[""top""]",english
1,28b92e828426e80c4c30e70e6f30fbd0,Opinion: Naturopathic doctors ready to relieve...,https://www.richmond-news.com/opinion/opinion-...,The time has come for naturopathic medicine to...,With 30 years practice experience in Prince Ge...,2023-12-16,richmond-news,"[""canada""]","[""top""]",english
2,e7412a114c364d3245584c54350a7f53,New Lenovo ThinkPad X1 Carbon hints at new 202...,https://news.knowledia.com/US/en/articles/new-...,The AI hype isn't dying down any time soon. - ...,I've seen AI injected in countless gadgets thi...,2023-12-16,knowledia,"[""united states of america""]","[""top""]",english
3,bb80a67b9ffd50c19c72790b247bec97,Dancing On Ice's Amber Davies links arms with ...,https://www.dailymail.co.uk/tvshowbiz/article-...,Amber Davies looks to be making incredible pro...,Dancing On Ice's Amber Davies links arms with ...,2023-12-16,dailymailuk,"[""united kingdom""]","[""entertainment""]",english
4,0e901c91ad12d3256272438a8a089a6b,Auckland A-League team: Big names on shortlist...,https://www.nzherald.co.nz/sport/auckland-a-le...,A wide pool of names on a whiteboard has been ...,The new Auckland A-League men’s team will appo...,2023-12-16,nzherald,"[""new zealand""]","[""sports""]",english


In [13]:
print("Number of articles:", len(news_df))

Number of articles: 96


In [14]:
news_df['pubdate'].dtype

dtype('O')


## Only keep today's articles

In [15]:
news_df = news_df[news_df['pubdate'] == today]
print("Number of articles:", len(news_df))

Number of articles: 47


## Add sentiments for articles

In [16]:
from transformers import pipeline
sentiment_pipeline = pipeline("sentiment-analysis")

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [17]:
def format_sentiment(sentiment):
    if sentiment['label'] == 'NEGATIVE':
        return sentiment['score'] * -1
    else:
        return sentiment['score']

def get_sentiment_value(news_object, sentiment_pipeline):
    return format_sentiment(sentiment_pipeline(news_object['title'])[0])

In [18]:
news_df['sentiment'] = news_df.apply(get_sentiment_value, sentiment_pipeline=sentiment_pipeline, axis=1)

In [19]:
news_df['sentiment'] 

49    0.998790
50   -0.998881
51    0.991419
52   -0.996459
53    0.968694
54   -0.770034
55    0.819063
56    0.999378
57    0.994062
58    0.999213
59   -0.875670
60   -0.977877
61    0.963353
62    0.994062
63    0.690386
64    0.505701
65   -0.770034
66    0.998790
67    0.998790
68   -0.986446
69    0.999442
70    0.999021
71    0.919545
72    0.998790
73    0.998790
74    0.996372
75    0.975991
76   -0.770034
77    0.533480
78    0.994062
79    0.998790
80    0.999631
81   -0.994759
82   -0.998991
83    0.996932
84    0.998857
85   -0.828643
86    0.996372
87    0.533480
88    0.978127
89   -0.991861
90    0.999378
91    0.533480
92   -0.663785
93    0.755097
94    0.998790
95    0.998857
Name: sentiment, dtype: float64

In [20]:
news_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 47 entries, 49 to 95
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   article_id   47 non-null     object 
 1   title        47 non-null     object 
 2   link         47 non-null     object 
 3   description  47 non-null     object 
 4   content      47 non-null     object 
 5   pubdate      47 non-null     object 
 6   source_id    47 non-null     object 
 7   country      47 non-null     object 
 8   category     47 non-null     object 
 9   language     47 non-null     object 
 10  sentiment    47 non-null     float64
dtypes: float64(1), object(10)
memory usage: 4.4+ KB


## Calculate today's average sentiment

In [21]:
avg_sentiment = news_df['sentiment'].mean()
avg_sentiment

0.41492582635676606

## Find today's most positive article

In [22]:
most_positive = news_df.loc[news_df['sentiment'].idxmax()]

In [23]:
most_positive = pd.DataFrame(most_positive).T
most_positive.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1 entries, 80 to 80
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   article_id   1 non-null      object
 1   title        1 non-null      object
 2   link         1 non-null      object
 3   description  1 non-null      object
 4   content      1 non-null      object
 5   pubdate      1 non-null      object
 6   source_id    1 non-null      object
 7   country      1 non-null      object
 8   category     1 non-null      object
 9   language     1 non-null      object
 10  sentiment    1 non-null      object
dtypes: object(11)
memory usage: 96.0+ bytes


In [24]:
most_positive.pubdate

80    2023-12-17
Name: pubdate, dtype: object