
Merge pull request #48 from Iceloof/dev
Added specific page result and total count
HurinHu committed Feb 2, 2021
2 parents ad3d592 + 87b4baa commit 6f19399
Showing 4 changed files with 95 additions and 3 deletions.
71 changes: 70 additions & 1 deletion GoogleNews/__init__.py
@@ -37,6 +37,7 @@ def __init__(self,lang="en",period="",start="",end="",encode="utf-8"):
self.__texts = []
self.__links = []
self.__results = []
self.__totalcount = 0
self.user_agent = 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:64.0) Gecko/20100101 Firefox/64.0'
self.headers = {'User-Agent': self.user_agent}
self.__lang = lang
@@ -85,6 +86,67 @@ def search(self, key):
self.__key = urllib.request.quote(self.__key.encode(self.__encode))
self.get_page()

def page_at(self, page=1):
"""
Retrieves a specific page from google.com in the news section and returns it as a list (the page is not stored in __results).
Parameter:
page = number of the page to be retrieved
"""
results = []
try:
if self.__start != "" and self.__end != "":
self.url = "https://www.google.com/search?q={}&lr=lang_{}&biw=1920&bih=976&source=lnt&&tbs=lr:lang_1{},cdr:1,cd_min:{},cd_max:{},sbd:1&tbm=nws&start={}".format(self.__key,self.__lang,self.__lang,self.__start,self.__end,(10 * (page - 1)))
elif self.__period != "":
self.url = "https://www.google.com/search?q={}&lr=lang_{}&biw=1920&bih=976&source=lnt&&tbs=lr:lang_1{},qdr:{},,sbd:1&tbm=nws&start={}".format(self.__key,self.__lang,self.__lang,self.__period,(10 * (page - 1)))
else:
self.url = "https://www.google.com/search?q={}&lr=lang_{}&biw=1920&bih=976&source=lnt&&tbs=lr:lang_1{},sbd:1&tbm=nws&start={}".format(self.__key,self.__lang,self.__lang,(10 * (page - 1)))
except AttributeError:
raise AttributeError("You need to run a search() before using page_at().")
try:
self.req = urllib.request.Request(self.url, headers=self.headers)
self.response = urllib.request.urlopen(self.req)
self.page = self.response.read()
self.content = Soup(self.page, "html.parser")
stats = self.content.find_all("div", id="result-stats")[0].text
self.__totalcount = int(stats[stats.find('bout')+5:stats.find('results')-1].replace(',', ''))
result = self.content.find_all("div", id="search")[0].find_all("g-card")
for item in result:
try:
tmp_text = item.find("div", {"role" : "heading"}).text.replace("\n","")
except Exception:
tmp_text = ''
try:
tmp_link = item.find("a").get("href")
except Exception:
tmp_link = ''
try:
tmp_media = item.findAll("g-img")[1].parent.text
except Exception:
tmp_media = ''
try:
tmp_date = item.find("div", {"role" : "heading"}).next_sibling.findNext('div').findNext('div').text
tmp_date,tmp_datetime=lexical_date_parser(tmp_date)
except Exception:
tmp_date = ''
tmp_datetime=None
try:
tmp_desc = item.find("div", {"role" : "heading"}).next_sibling.findNext('div').text.replace("\n","")
except Exception:
tmp_desc = ''
try:
tmp_img = item.findAll("g-img")[0].find("img").get("src")
except Exception:
tmp_img = ''
self.__texts.append(tmp_text)
self.__links.append(tmp_link)
results.append({'title': tmp_text, 'media': tmp_media,'date': tmp_date,'datetime':tmp_datetime,'desc': tmp_desc, 'link': tmp_link,'img': tmp_img})
self.response.close()
except Exception as e_parser:
print(e_parser)
pass
return results

def get_page(self, page=1):
"""
Retrieves a specific page from google.com in the news sections into __results.
@@ -106,6 +168,8 @@ def get_page(self, page=1):
self.response = urllib.request.urlopen(self.req)
self.page = self.response.read()
self.content = Soup(self.page, "html.parser")
stats = self.content.find_all("div", id="result-stats")[0].text
self.__totalcount = int(stats[stats.find('bout')+5:stats.find('results')-1].replace(',', ''))
result = self.content.find_all("div", id="search")[0].find_all("g-card")
for item in result:
try:
@@ -138,7 +202,8 @@ def get_page(self, page=1):
self.__links.append(tmp_link)
self.__results.append({'title': tmp_text, 'media': tmp_media,'date': tmp_date,'datetime':tmp_datetime,'desc': tmp_desc, 'link': tmp_link,'img': tmp_img})
self.response.close()
except Exception:
except Exception as e_parser:
print(e_parser)
pass

def getpage(self, page=1):
@@ -220,6 +285,9 @@ def get_news(self, key="",deamplify=False):
print(e_parser)
pass

def total_count(self):
return self.__totalcount

def result(self,sort=False):
"""Don't remove this, will affect old version user when upgrade"""
return self.results(sort)
@@ -252,3 +320,4 @@ def clear(self):
self.__texts = []
self.__links = []
self.__results = []
self.__totalcount = 0
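
A minimal usage sketch of the two additions above, `page_at()` and `total_count()`, assuming a hypothetical query string ("python"):

```
from GoogleNews import GoogleNews

googlenews = GoogleNews(lang="en")
googlenews.search("python")            # fetches page 1 and records the result count
print(googlenews.total_count())        # approximate count parsed from "About N results"

page_two = googlenews.page_at(2)       # returns page 2 as its own list of dicts
for article in page_two:
    print(article["title"], article["link"])
```

Unlike `get_page()`, which appends to the internal `__results` list, `page_at()` returns its results directly; both update the total count and the internal text/link lists.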
8 changes: 8 additions & 0 deletions README.md
@@ -62,6 +62,14 @@ Default return first page result, you don't need to get first page again, otherw
```
googlenews.get_page(2)
```
- If you only want to get a specific page
```
result = googlenews.page_at(2)
```
- If you want to get the total number of results for the search (an approximate figure taken from the Google search page, not an exact count; see the worked example after this diff)
```
googlenews.total_count()
```
- Get results will return the list, `[{'title': '...', 'media': '...', 'date': '...', 'datetime': '...', 'desc': '...', 'link': '...', 'img': '...'}]`
```
googlenews.results()
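The count returned by `total_count()` is parsed from the "About N results" line that Google shows above the results (the `result-stats` element handled in the `GoogleNews/__init__.py` diff above), so it is an estimate and can vary between requests. A minimal sketch with a made-up banner value:

```
# Suppose Google's page shows: "About 1,230,000 results (0.45 seconds)"
googlenews.search('python')    # hypothetical query
googlenews.total_count()       # -> 1230000 (digits extracted, commas stripped)
```
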
2 changes: 1 addition & 1 deletion setup.py
@@ -5,7 +5,7 @@

setuptools.setup(
name="GoogleNews",
version="1.5.2",
version="1.5.3",
author="Hurin Hu",
author_email="hurin@live.ca",
description="Google News search for Python",
17 changes: 16 additions & 1 deletion test/test_search.py
@@ -31,7 +31,22 @@ def testEncode(self):
length = len(googlenews.result())
self.assertNotEqual(length, 0)
print('Encoding result is not empty')


def testTotalCountGreaterThanZero(self):
googlenews = GoogleNews()
googlenews.search(keyword)
count = googlenews.total_count()
self.assertGreater(count, 0)
print('Total count is greater than zero')

def testResultNumberAtTwoPages(self):
googlenews = GoogleNews()
googlenews.search(keyword)
result = googlenews.page_at(2)
length = len(result)
self.assertEqual(length, 10)
print('Result length at page two is correct')

class TestStringMethods(unittest.TestCase):

def testResultContainsKeyword(self):

