In [92]:
import requests
from urllib.parse import urlparse
from GoogleNews import GoogleNews
from os.path import isfile
import pandas as pd
from datetime import datetime

In [2]:
def skip_redirect(uri: str, timeout: float = 10.0) -> str:
    """Return the final URL reached after following redirects from ``uri``.

    Args:
        uri: Address to request; it must include a scheme such as ``https://``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``url`` attribute of the response, i.e. the address that was
        actually served once all redirects were followed.

    Raises:
        requests.exceptions.RequestException: If the request fails or the
            server does not answer within ``timeout`` seconds.
    """
    # requests never times out on its own; without an explicit timeout a
    # single unresponsive host would block the whole collection run.
    return requests.get(uri, timeout=timeout).url


In [3]:
def guess_news_source(uri):
    """Guess a short news-source name from the host part of ``uri``.

    The guess is the registrable label of the host name, e.g. ``"cbc"`` for
    ``https://www.cbc.ca/...``. Hosts under a two-letter country code with a
    generic second-level label (``bbc.co.uk``, ``abc.net.au``) yield the label
    before it instead of ``"co"`` / ``"net"``.

    Args:
        uri: URL of the article, with or without a scheme.

    Returns:
        The guessed label, the host itself for single-label hosts such as
        ``localhost``, or an empty string when no host can be found.
    """
    netloc = urlparse(uri).netloc
    if not netloc:
        # urlparse only fills netloc when the string has a scheme or starts
        # with "//", so retry bare inputs like "www.cbc.ca/news".
        netloc = urlparse("//" + uri).netloc

    # Drop optional "user:pass@" credentials and ":port" suffix.
    host = netloc.rpartition("@")[2].partition(":")[0]
    labels = [label for label in host.split(".") if label]

    if not labels:
        return ""
    if len(labels) == 1:
        return labels[0]

    # Heuristic for country-code domains that use a generic second level.
    generic_second_level = {"co", "com", "org", "net", "gov", "ac", "edu"}
    if (
        len(labels) >= 3
        and len(labels[-1]) == 2
        and labels[-2].lower() in generic_second_level
    ):
        return labels[-3]
    return labels[-2]

In [4]:
def create_tweet_text(text: str, hashtags: str, url: str):
    """Compose a tweet as ``text``, ``hashtags`` and ``url`` on separate lines.

    ``text`` is truncated so that the whole tweet fits the 280-character
    limit; ``hashtags`` and ``url`` are always kept in full.

    Args:
        text: Headline or message body; the only part that gets shortened.
        hashtags: Hashtag string placed on the second line.
        url: Link placed on the last line.

    Returns:
        The three parts joined with newline characters.
    """
    tweet_limit = 280
    # Twitter wraps every link in t.co and counts it as 23 characters,
    # regardless of how long the original URL is.
    url_length = 23 if url else 0
    newline_count = 2

    # Clamp at zero: a negative cutoff would slice the text from its end
    # instead of dropping it.
    cutoff = max(0, tweet_limit - len(hashtags) - url_length - newline_count)
    return text[:cutoff] + "\n" + hashtags + "\n" + url

In [93]:
def get_news(title = "Iran Revolution"):
    """Fetch Google News results for ``title`` and shape them for the news log.

    Args:
        title: Search phrase sent to Google News. The default is the phrase
            this function previously always searched for, so calling it
            without arguments behaves as before.

    Returns:
        A ``(records, results)`` tuple. ``records`` is a list of dicts with
        the keys ``title``, ``url`` (redirects resolved), ``datetime``,
        ``retreived`` (time of this fetch) and ``tweeted`` (always ``False``
        for fresh rows). ``results`` is the raw value returned by
        ``GoogleNews.results()``.
    """
    # Configure one client up front; creating a second instance later would
    # throw away any settings applied to the first.
    googlenews = GoogleNews(lang='en', period='4h')
    googlenews.set_encode('utf-8')
    googlenews.get_news(title)

    results = googlenews.results()
    if results is None:
        return [], results

    records = []
    for res in results:
        link = res['link']
        # NOTE(review): links appear to come back without a scheme; only add
        # one when it is missing so an absolute URL is not corrupted.
        if '://' not in link:
            link = "https://" + link
        records.append({
            'title': res['title'],
            'url': skip_redirect(link),
            'datetime': res['datetime'],
            'retreived': datetime.now(),
            'tweeted': False,
        })
    return records, results

In [120]:
def recycle_data(data_new):
    """Return only the rows whose 'datetime' is within the last 24 hours.

    The cutoff is computed from the current local time on every call; rows
    older than that are left out of the returned frame.
    """
    cutoff = datetime.now() - pd.Timedelta("1D")
    is_recent = data_new['datetime'] >= cutoff
    return data_new.loc[is_recent]


In [124]:
def update_data(data, filename):
    """Merge freshly fetched news into the CSV log and return the merged log.

    New rows are combined with whatever ``filename`` already holds, the
    ``tweeted`` flag is propagated across rows sharing a URL, duplicate URLs
    are dropped (the fresh row wins), rows older than a day are discarded via
    ``recycle_data`` and the result is written back to ``filename``. The same
    clean-up is applied whether or not the log file already exists.

    Args:
        data: DataFrame of fresh rows with at least the columns ``url``,
            ``datetime`` and ``tweeted``; may be empty.
        filename: Path of the CSV log to read (if present) and overwrite.

    Returns:
        The merged DataFrame sorted by ``datetime``, newest first. When there
        is neither fresh data nor an existing log, a copy of ``data`` is
        returned and nothing is written.
    """
    # Fresh rows go first so drop_duplicates keeps them over logged copies.
    frames = [] if data.empty else [data.copy()]
    if isfile(filename):
        frames.append(
            pd.read_csv(filename, parse_dates=['datetime', 'retreived'])
        )

    if not frames:
        # Nothing fetched and nothing logged: avoid writing a column-less CSV
        # that a later read_csv could not parse.
        return data.copy()

    data_new = pd.concat(frames, axis=0)

    # NOTE(review): scraped rows may lack a parseable date; coerce those to
    # NaT so the age comparison in recycle_data cannot fail on mixed types.
    data_new['datetime'] = pd.to_datetime(data_new['datetime'], errors='coerce')

    # make sure tweeted column stays updated: a URL counts as tweeted if any
    # of its rows (fresh or logged) is flagged
    data_new['tweeted'] = data_new.groupby('url')['tweeted'].transform('max')

    data_new = data_new.drop_duplicates(subset=['url'])
    data_new = recycle_data(data_new)
    data_new.to_csv(filename, index=False)
    return data_new.sort_values(['datetime'], ascending=False)

    

In [71]:
# news_title = get_news()[0][0]['title']
# hashtags = "#IranRevolution"

# target_news_link = skip_redirect(
#     get_news()[0]['url']
# )
# news_source = guess_news_source(target_news_link).upper()

# hashtags = hashtags + " #" + news_source

# # print(
# #     create_tweet_text(news_title, hashtags, target_news_link)
# #     )

In [103]:
# Fetch the latest stories: get_news returns the shaped records together
# with the raw results list from the client.
news_collected, results_list =  get_news()
# One row per record; the columns come from the record keys.
news_df = pd.DataFrame(news_collected)

In [106]:
# Merge the fresh rows into the on-disk log (news_log.csv, relative to the
# working directory) and keep the merged frame, sorted newest first.
data_new = update_data(news_df, 'news_log.csv')

In [130]:
# Take the newest story (data_new is sorted newest first). Select the two
# fields by column name: positional unpacking breaks whenever the log gains
# a column, as it did when 'tweeted' was added.
news_tile, news_url = data_new[['title', 'url']].head(1).values[0]

print(news_tile, news_url)

Iranian protesters call for 3-day strike starting Monday https://www.cbc.ca/news/world/iran-protesters-strike-1.6673891


In [146]:
# Toy frame for checking the 'tweeted' propagation: url 1 appears twice,
# once tweeted and once not; url 2 appears once, not tweeted.
df1 = pd.DataFrame(
    [[1, 1], [1, 0], [2, 0]],
    columns=['url', 'tweeted'],
)
df1

Unnamed: 0,url,tweeted
0,1,1
1,1,0
2,2,0


In [148]:
# Give every row of a url the largest 'tweeted' value found for that url,
# so one tweeted row marks all rows of the same url.
tweeted_by_url = df1.groupby('url')['tweeted']
df1['tweeted'] = tweeted_by_url.transform('max')


In [149]:
# Show df1 after the per-url update of the 'tweeted' column.
df1

Unnamed: 0,url,tweeted
0,1,1
1,1,1
2,2,0
