-
Notifications
You must be signed in to change notification settings - Fork 0
/
newsScrape.py
55 lines (49 loc) · 1.61 KB
/
newsScrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import os
from os import path
import requests
from newspaper import Article
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import random
from GoogleNews import GoogleNews
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
def getWordForCategory(topic, links):
    """Download each article in *links*, lowercase its words, and write them to
    ``<topic>.txt`` — skipping any word whose Porter stem matches the topic's stem.

    Args:
        topic: search topic; used both for the output filename and as the
            word (by stem) to exclude from the output.
        links: iterable of article URLs readable by ``newspaper.Article``.

    Side effects:
        Writes (overwrites) ``<topic>.txt`` in the current working directory.
        Articles that fail to download/parse are skipped with a console message.
    """
    ps = PorterStemmer()
    topicRoot = ps.stem(topic)

    def convert(text):
        # Split on whitespace and lowercase every token.
        return [token.lower() for token in text.split()]

    articleWords = list()  # one word-list per successfully fetched article
    for link in links:
        try:
            articleRequested = Article(link, language="en")
            articleRequested.download()
            articleRequested.parse()
            articleWords.append(convert(articleRequested.text))
        # Narrowed from a bare `except:` (which also swallowed KeyboardInterrupt);
        # kept best-effort: a single bad article should not abort the run.
        except Exception:
            print("Couldn't Read Article")

    # Write the file ONCE after aggregation. The original reopened the file
    # with mode 'w' inside the per-article loop, rewriting all accumulated
    # words on every iteration (quadratic, and pointless intermediate states).
    fileName = topic + ".txt"
    with open(fileName, 'w') as f:
        for words in articleWords:
            for word in words:
                # Exclude the topic word itself (compared by Porter stem).
                if ps.stem(word).lower() != topicRoot.lower():
                    f.write("%s" % word)
                    f.write(" ")
def googleLinks(topic):
    """Return the resolved URLs of up to five Google News results for *topic*.

    Queries Google News (English, last 1 day), takes the first five result
    links, and follows each one with ``requests`` to resolve redirects to the
    article's final URL.

    Args:
        topic: search query string.

    Returns:
        list[str]: final (post-redirect) article URLs, at most five.

    Raises:
        requests.RequestException: if fetching a link fails or times out.
    """
    googlenews = GoogleNews()
    googlenews.set_lang('en')
    googlenews.set_period('1d')
    googlenews.set_encode('utf-8')
    # Populates the internal result set; the return value was previously
    # bound to an unused local (`article`).
    googlenews.get_news(topic)
    links = googlenews.get_links()[:5]

    actualLinks = list()
    for link in links:
        # get_links() appears to return scheme-less hosts/paths — TODO confirm.
        link = "http://" + link
        print(link)
        # timeout added: without one, a single unresponsive redirect hop
        # blocks this function forever (requests never times out by default).
        actualLinks.append(requests.get(link, timeout=10).url)
    return actualLinks