Merge 77b49d7 into a61cc19
jacobhtye committed Feb 7, 2022
2 parents a61cc19 + 77b49d7 commit b370793
Showing 3 changed files with 23 additions and 10 deletions.
2 changes: 2 additions & 0 deletions .travis.yml
@@ -7,6 +7,8 @@ install:
- pip install beautifulsoup4
- pip install coverage
- pip install coveralls
+ - pip install selenium
+ - pip install webdriver-manager
# command to run tests
script:
- coverage run -m unittest discover 'test' 'test*.py'
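The two new test dependencies work together: webdriver-manager downloads and caches a ChromeDriver binary that matches the Chrome build on the Travis worker, and selenium uses it to drive a headless browser. A minimal sketch of the setup they enable (assuming a Chrome/Chromium binary is already present on the worker; the URL is purely illustrative):

    # Minimal sketch of the headless-Chrome setup the new CI dependencies enable.
    # Assumes a Chrome/Chromium binary is already installed on the machine.
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from webdriver_manager.chrome import ChromeDriverManager

    options = Options()
    options.headless = True  # run without a display, as on a CI worker
    # webdriver-manager fetches and caches a matching chromedriver binary.
    driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
    try:
        driver.get("https://news.google.com/")  # illustrative URL
        print(driver.title)
    finally:
        driver.quit()  # release the browser process when done
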
23 changes: 14 additions & 9 deletions GoogleNews/__init__.py
@@ -5,6 +5,9 @@
import dateparser, copy
from bs4 import BeautifulSoup as Soup, ResultSet
from dateutil.parser import parse
+ from selenium import webdriver
+ from selenium.webdriver.chrome.options import Options
+ from webdriver_manager.chrome import ChromeDriverManager

import datetime
from dateutil.relativedelta import relativedelta
@@ -121,9 +124,11 @@ def search(self, key):
self.get_page()

def build_response(self):
- self.req = urllib.request.Request(self.url.replace("search?","search?hl=en&gl=en&"), headers=self.headers)
- self.response = urllib.request.urlopen(self.req)
- self.page = self.response.read()
+ options = Options()
+ options.headless = True
+ driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
+ driver.get(self.url.replace("search?","search?hl=en&gl=en&"))
+ self.page = driver.page_source
self.content = Soup(self.page, "html.parser")
stats = self.content.find_all("div", id="result-stats")
if stats and isinstance(stats, ResultSet):
@@ -133,6 +138,7 @@ def build_response(self):
#TODO might want to add output for user to know no data was found
return
result = self.content.find_all("div", id="search")[0].find_all("g-card")
+ driver.close()
return result

def page_at(self, page=1):
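One thing worth noting in the new build_response: the early return on the no-results branch above exits before driver.close() runs, and close() only closes the current window, while quit() is the call that fully ends the WebDriver session. A small sketch (not part of this commit) of one way to guarantee the browser is always released, reusing the same driver construction the commit adds:

    # Sketch only -- not part of this commit. Wraps the commit's driver setup
    # in a context manager so Chrome is torn down on every exit path.
    from contextlib import contextmanager

    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from webdriver_manager.chrome import ChromeDriverManager

    @contextmanager
    def headless_chrome():
        options = Options()
        options.headless = True
        # Same constructor call as build_response/get_news use above
        # (Selenium 3-style positional executable_path argument).
        driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
        try:
            yield driver
        finally:
            driver.quit()  # quit() ends the session; close() only closes the window

    # Usage inside build_response would then look roughly like:
    # with headless_chrome() as driver:
    #     driver.get(self.url.replace("search?", "search?hl=en&gl=en&"))
    #     self.page = driver.page_source
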
@@ -184,7 +190,6 @@ def page_at(self, page=1):
self.__texts.append(tmp_text)
self.__links.append(tmp_link)
results.append({'title': tmp_text, 'media': tmp_media,'date': tmp_date,'datetime':define_date(tmp_date),'desc': tmp_desc, 'link': tmp_link,'img': tmp_img})
- self.response.close()
except Exception as e_parser:
print(e_parser)
pass
@@ -238,7 +243,6 @@ def get_page(self, page=1):
self.__texts.append(tmp_text)
self.__links.append(tmp_link)
self.__results.append({'title': tmp_text, 'media': tmp_media,'date': tmp_date,'datetime':define_date(tmp_date),'desc': tmp_desc, 'link': tmp_link,'img': tmp_img})
- self.response.close()
except Exception as e_parser:
print(e_parser)
pass
@@ -256,9 +260,11 @@ def get_news(self, key="",deamplify=False):
else:
self.url = 'https://news.google.com/?hl={}'.format(self.__lang)
try:
- self.req = urllib.request.Request(self.url, headers=self.headers)
- self.response = urllib.request.urlopen(self.req)
- self.page = self.response.read()
+ options = Options()
+ options.headless = True
+ driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
+ driver.get(self.url.replace("search?","search?hl=en&gl=en&"))
+ self.page = driver.page_source
self.content = Soup(self.page, "html.parser")
articles = self.content.select('div[class="NiLAwe y6IFtc R7GTQ keNKEd j7vNaf nID9nc"]')
for article in articles:
@@ -319,7 +325,6 @@ def get_news(self, key="",deamplify=False):
'site':site})
except Exception as e_article:
print(e_article)
- self.response.close()
except Exception as e_parser:
print(e_parser)
pass
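From the caller's side the public API is unchanged; the difference is that search() (via build_response) and get_news() now each launch a fresh headless Chrome session instead of issuing a urllib request. A rough usage sketch (the constructor argument and the results() accessor are assumed from the library's existing public API, which is not shown in this diff):

    # Rough usage sketch: same public API, but each call below now starts
    # (and should eventually tear down) a headless Chrome instance.
    from GoogleNews import GoogleNews

    news = GoogleNews(lang='en')

    news.search('python')       # fetches the Google results page via Selenium
    print(len(news.results()))  # parsed g-card entries

    news.get_news('python')     # fetches news.google.com via Selenium
    print(len(news.results()))
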
8 changes: 7 additions & 1 deletion setup.py
@@ -13,7 +13,13 @@
long_description_content_type="text/markdown",
url="https://github.com/Iceloof/GoogleNews",
packages=setuptools.find_packages(),
- install_requires=['beautifulsoup4','dateparser','python-dateutil'],
+ install_requires=[
+     "beautifulsoup4",
+     "dateparser",
+     "python-dateutil",
+     "selenium",
+     "webdriver-manager",
+ ],
classifiers=[
"Programming Language :: Python :: 3.6",
"License :: OSI Approved :: MIT License",
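With selenium and webdriver-manager declared in install_requires, a plain pip install of the package pulls them in automatically. A quick, illustrative sanity check that all five declared dependencies are importable after installation (the distribution-to-module name mapping below is standard for these packages):

    # Illustrative check: every dependency declared above can be imported.
    # Note that the distribution name and import name differ for several of them.
    import importlib

    for module in ("bs4", "dateparser", "dateutil", "selenium", "webdriver_manager"):
        importlib.import_module(module)
        print(f"{module}: OK")
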
