
Merge pull request #48 from Iceloof/dev
Added specific page result and total count
HurinHu committed Feb 2, 2021
2 parents ad3d592 + 87b4baa commit 6f19399
Showing 4 changed files with 95 additions and 3 deletions.
71 changes: 70 additions & 1 deletion GoogleNews/__init__.py
@@ -37,6 +37,7 @@ def __init__(self,lang="en",period="",start="",end="",encode="utf-8"):
self.__texts = []
self.__links = []
self.__results = []
self.__totalcount = 0
self.user_agent = 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:64.0) Gecko/20100101 Firefox/64.0'
self.headers = {'User-Agent': self.user_agent}
self.__lang = lang
@@ -85,6 +86,67 @@ def search(self, key):
self.__key = urllib.request.quote(self.__key.encode(self.__encode))
self.get_page()

def page_at(self, page=1):
"""
Retrieves a specific page from google.com in the news section and returns it as a list (the page is not stored in __results).
Parameter:
page = number of the page to be retrieved
"""
results = []
try:
if self.__start != "" and self.__end != "":
self.url = "https://www.google.com/search?q={}&lr=lang_{}&biw=1920&bih=976&source=lnt&&tbs=lr:lang_1{},cdr:1,cd_min:{},cd_max:{},sbd:1&tbm=nws&start={}".format(self.__key,self.__lang,self.__lang,self.__start,self.__end,(10 * (page - 1)))
elif self.__period != "":
self.url = "https://www.google.com/search?q={}&lr=lang_{}&biw=1920&bih=976&source=lnt&&tbs=lr:lang_1{},qdr:{},,sbd:1&tbm=nws&start={}".format(self.__key,self.__lang,self.__lang,self.__period,(10 * (page - 1)))
else:
self.url = "https://www.google.com/search?q={}&lr=lang_{}&biw=1920&bih=976&source=lnt&&tbs=lr:lang_1{},sbd:1&tbm=nws&start={}".format(self.__key,self.__lang,self.__lang,(10 * (page - 1)))
except AttributeError:
raise AttributeError("You need to run a search() before using page_at().")
try:
self.req = urllib.request.Request(self.url, headers=self.headers)
self.response = urllib.request.urlopen(self.req)
self.page = self.response.read()
self.content = Soup(self.page, "html.parser")
stats = self.content.find_all("div", id="result-stats")[0].text
self.__totalcount = int(stats[stats.find('bout')+5:stats.find('results')-1].replace(',', ''))
result = self.content.find_all("div", id="search")[0].find_all("g-card")
for item in result:
try:
tmp_text = item.find("div", {"role" : "heading"}).text.replace("\n","")
except Exception:
tmp_text = ''
try:
tmp_link = item.find("a").get("href")
except Exception:
tmp_link = ''
try:
tmp_media = item.findAll("g-img")[1].parent.text
except Exception:
tmp_media = ''
try:
tmp_date = item.find("div", {"role" : "heading"}).next_sibling.findNext('div').findNext('div').text
tmp_date,tmp_datetime=lexical_date_parser(tmp_date)
except Exception:
tmp_date = ''
tmp_datetime=None
try:
tmp_desc = item.find("div", {"role" : "heading"}).next_sibling.findNext('div').text.replace("\n","")
except Exception:
tmp_desc = ''
try:
tmp_img = item.findAll("g-img")[0].find("img").get("src")
except Exception:
tmp_img = ''
self.__texts.append(tmp_text)
self.__links.append(tmp_link)
results.append({'title': tmp_text, 'media': tmp_media,'date': tmp_date,'datetime':tmp_datetime,'desc': tmp_desc, 'link': tmp_link,'img': tmp_img})
self.response.close()
except Exception as e_parser:
print(e_parser)
pass
return results

def get_page(self, page=1):
"""
Retrieves a specific page from google.com in the news sections into __results.
@@ -106,6 +168,8 @@ def get_page(self, page=1):
self.response = urllib.request.urlopen(self.req)
self.page = self.response.read()
self.content = Soup(self.page, "html.parser")
stats = self.content.find_all("div", id="result-stats")[0].text
self.__totalcount = int(stats[stats.find('bout')+5:stats.find('results')-1].replace(',', ''))
result = self.content.find_all("div", id="search")[0].find_all("g-card")
for item in result:
try:
@@ -138,7 +202,8 @@ def get_page(self, page=1):
self.__links.append(tmp_link)
self.__results.append({'title': tmp_text, 'media': tmp_media,'date': tmp_date,'datetime':tmp_datetime,'desc': tmp_desc, 'link': tmp_link,'img': tmp_img})
self.response.close()
except Exception:
except Exception as e_parser:
print(e_parser)
pass

def getpage(self, page=1):
@@ -220,6 +285,9 @@ def get_news(self, key="",deamplify=False):
print(e_parser)
pass

def total_count(self):
return self.__totalcount

def result(self,sort=False):
"""Don't remove this, will affect old version user when upgrade"""
return self.results(sort)
@@ -252,3 +320,4 @@ def clear(self):
self.__texts = []
self.__links = []
self.__results = []
self.__totalcount = 0
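
A minimal usage sketch of the two additions above, `page_at()` and `total_count()`, assuming a hypothetical query string ("python"):

```
from GoogleNews import GoogleNews

googlenews = GoogleNews(lang="en")
googlenews.search("python")            # fetches page 1 and records the result count
print(googlenews.total_count())        # approximate count parsed from "About N results"

page_two = googlenews.page_at(2)       # returns page 2 as its own list of dicts
for article in page_two:
    print(article["title"], article["link"])
```

Unlike `get_page()`, which appends to the internal `__results` list, `page_at()` returns its results directly; both update the total count and the internal text/link lists.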
8 changes: 8 additions & 0 deletions README.md
@@ -62,6 +62,14 @@ Default return first page result, you don't need to get first page again, otherw
```
googlenews.get_page(2)
```
- If you only want to get a specific page
```
result = googlenews.page_at(2)
```
- If you want to get the total number of results for the search (an approximate figure taken from the Google search page, not an exact count; see the worked example after this diff)
```
googlenews.total_count()
```
- Get results will return the list, `[{'title': '...', 'media': '...', 'date': '...', 'datetime': '...', 'desc': '...', 'link': '...', 'img': '...'}]`
```
googlenews.results()
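The count returned by `total_count()` is parsed from the "About N results" line that Google shows above the results (the `result-stats` element handled in the `GoogleNews/__init__.py` diff above), so it is an estimate and can vary between requests. A minimal sketch with a made-up banner value:

```
# Suppose Google's page shows: "About 1,230,000 results (0.45 seconds)"
googlenews.search('python')    # hypothetical query
googlenews.total_count()       # -> 1230000 (digits extracted, commas stripped)
```
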
2 changes: 1 addition & 1 deletion setup.py
@@ -5,7 +5,7 @@

setuptools.setup(
name="GoogleNews",
version="1.5.2",
version="1.5.3",
author="Hurin Hu",
author_email="hurin@live.ca",
description="Google News search for Python",
17 changes: 16 additions & 1 deletion test/test_search.py
@@ -31,7 +31,22 @@ def testEncode(self):
length = len(googlenews.result())
self.assertNotEqual(length, 0)
print('Encoding result is not empty')


def testTotalCountGreaterThanZero(self):
googlenews = GoogleNews()
googlenews.search(keyword)
count = googlenews.total_count()
self.assertGreater(count, 0)
print('Total count is greater than zero')

def testResultNumberAtTwoPages(self):
googlenews = GoogleNews()
googlenews.search(keyword)
result = googlenews.page_at(2)
length = len(result)
self.assertEqual(length, 10)
print('Result length at page two is correct')

class TestStringMethods(unittest.TestCase):

def testResultContainsKeyword(self):

