#  Web scraping News Classification from Reuters

In [1]:
import pandas as pd
import lxml
import requests
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
import warnings
warnings.filterwarnings("ignore")

In [2]:
url_list = ["http://feeds.reuters.com/reuters/businessNews",
    "http://feeds.reuters.com/reuters/technologyNews",
    "http://feeds.reuters.com/reuters/sportsNews",
    ]
documents = []

In [3]:
from lxml import etree

In [5]:
for url in url_list:
    response = requests.get(url)
    xml_page = response.text
    parser = lxml.etree.XMLParser(recover=True, encoding='utf-8')
    documents.append(lxml.etree.fromstring(xml_page.encode("utf-8"), parser=parser))
def print_tag(node):
    print("<%s %s>%s" % (node.tag, " ".join(["%s=%s" % (k,v) for k,v in node.attrib.iteritems()]), node.text))
    for item in node[:25]:
        print(" <%s %s>%s</%s>" % (item.tag, " ".join(["%s=%s" % (k,v)for k,v in item.attrib.iteritems()]), item.text, item.tag))
    print("</%s>" % node.tag)

In [6]:
temp_node = documents[0]
print_tag(temp_node)

<rss version=2.0>

 <channel >
</channel>
</rss>


In [7]:
temp_node = temp_node[0]
print_tag(temp_node)

<channel >

 <title >Reuters: Business News</title>
 <link >https://www.reuters.com</link>
 <description >Reuters.com is your source for breaking news, business, financial and investing news, including personal finance and stocks.  Reuters is the leading global provider of news, financial information and technology solutions to the world's media, financial institutions, businesses and individuals.</description>
 <image >
	</image>
 <language >en-us</language>
 <lastBuildDate >Wed, 13 Nov 2019 11:15:00 -0500</lastBuildDate>
 <copyright >All rights reserved. Users may download and print extracts of content from this website for their own personal and non-commercial use only. Republication or redistribution of Reuters content, including by framing or similar means, is expressly prohibited without the prior written consent of Reuters. Reuters and the Reuters sphere logo are registered trademarks or trademarks of the Reuters group of companies around the world. © Reuters 2019</copyright>
 <

In [8]:
temp_node = temp_node.xpath("item")[0]
print_tag(temp_node)

<item >
		
 <title >Oil is our gold and we aim to use all of it, ADNOC official says</title>
 <description >Abu Dhabi National Oil Co aims to exhaust its vast oil and gas reserves even as many consumers switch to cleaner sources of energy, a senior executive in the Gulf oil company said.<div class="feedflare">
<a href="http://feeds.reuters.com/~ff/reuters/businessNews?a=UYRQwh9yTlg:rDiLmHHLWoY:yIl2AUoC8zA"><img src="http://feeds.feedburner.com/~ff/reuters/businessNews?d=yIl2AUoC8zA" border="0"></img></a> <a href="http://feeds.reuters.com/~ff/reuters/businessNews?a=UYRQwh9yTlg:rDiLmHHLWoY:F7zBnMyn0Lo"><img src="http://feeds.feedburner.com/~ff/reuters/businessNews?i=UYRQwh9yTlg:rDiLmHHLWoY:F7zBnMyn0Lo" border="0"></img></a> <a href="http://feeds.reuters.com/~ff/reuters/businessNews?a=UYRQwh9yTlg:rDiLmHHLWoY:V_sGLiPBpWU"><img src="http://feeds.feedburner.com/~ff/reuters/businessNews?i=UYRQwh9yTlg:rDiLmHHLWoY:V_sGLiPBpWU" border="0"></img></a>
</div><img src="http://feeds.feedburner.com/~r

In [9]:
from lxml import html

In [10]:
title_list = []
description_list = []
category_list = []

In [11]:
for xml_doc in documents:
    articles = xml_doc.xpath("//item")
    for article in articles:
        title_list.append(article[0].text)
        description_list.append(article[1].text)
        category_list.append(article[4].text)
news_data = pd.DataFrame(title_list, columns=["title"])
news_data["description"] = description_list
news_data["category"] = category_list
print(len(news_data))
news_data

60


Unnamed: 0,title,description,category
0,"Oil is our gold and we aim to use all of it, A...",Abu Dhabi National Oil Co aims to exhaust its ...,businessNews
1,Tesla CEO says will build Gigafactory 4 in 'Be...,Tesla will build its first European factory a...,businessNews
2,Brexit drove Tesla to pick Berlin over Britain...,Britain missed its chance to host the first Eu...,businessNews
3,Biotech-for-hire PeptiDream charts new path an...,The promise of PeptiDream Inc's drug-discovery...,businessNews
4,London forex trader accuses Citigroup of unfai...,"A former top Citigroup trader, who was fired ...",businessNews
5,German military refuses to take delivery of tw...,Germany's air force said on Wednesday it had d...,businessNews
6,Airbus says A400M technical issues not safety ...,Airbus said on Wednesday that technical issue...,businessNews
7,"Wall Street slips on trade worries, Hong Kong ...",Wall Street edged lower on Wednesday as Presid...,businessNews
8,"Mediation between Bayer, plaintiffs seeks to c...",Mediation between Bayer and plaintiffs in the ...,businessNews
9,German auto lobby to pick utility manager Hild...,The German carmakers lobby VDA is set to appoi...,businessNews


In [12]:
news_data["description"].head()

0    Abu Dhabi National Oil Co aims to exhaust its ...
1    Tesla  will build its first European factory a...
2    Britain missed its chance to host the first Eu...
3    The promise of PeptiDream Inc's drug-discovery...
4    A former top Citigroup  trader, who was fired ...
Name: description, dtype: object

In [13]:
print(news_data["description"][0])

Abu Dhabi National Oil Co aims to exhaust its vast oil and gas reserves even as many consumers switch to cleaner sources of energy, a senior executive in the Gulf oil company said.<div class="feedflare">
<a href="http://feeds.reuters.com/~ff/reuters/businessNews?a=UYRQwh9yTlg:rDiLmHHLWoY:yIl2AUoC8zA"><img src="http://feeds.feedburner.com/~ff/reuters/businessNews?d=yIl2AUoC8zA" border="0"></img></a> <a href="http://feeds.reuters.com/~ff/reuters/businessNews?a=UYRQwh9yTlg:rDiLmHHLWoY:F7zBnMyn0Lo"><img src="http://feeds.feedburner.com/~ff/reuters/businessNews?i=UYRQwh9yTlg:rDiLmHHLWoY:F7zBnMyn0Lo" border="0"></img></a> <a href="http://feeds.reuters.com/~ff/reuters/businessNews?a=UYRQwh9yTlg:rDiLmHHLWoY:V_sGLiPBpWU"><img src="http://feeds.feedburner.com/~ff/reuters/businessNews?i=UYRQwh9yTlg:rDiLmHHLWoY:V_sGLiPBpWU" border="0"></img></a>
</div><img src="http://feeds.feedburner.com/~r/reuters/businessNews/~4/UYRQwh9yTlg" height="1" width="1" alt=""/>


In [14]:
news_data["short_description"] = [item[item.find(" - ")+3:item.find("<")]
    for item in news_data["description"]]
news_data

Unnamed: 0,title,description,category,short_description
0,"Oil is our gold and we aim to use all of it, A...",Abu Dhabi National Oil Co aims to exhaust its ...,businessNews,u Dhabi National Oil Co aims to exhaust its va...
1,Tesla CEO says will build Gigafactory 4 in 'Be...,Tesla will build its first European factory a...,businessNews,sla will build its first European factory and...
2,Brexit drove Tesla to pick Berlin over Britain...,Britain missed its chance to host the first Eu...,businessNews,itain missed its chance to host the first Euro...
3,Biotech-for-hire PeptiDream charts new path an...,The promise of PeptiDream Inc's drug-discovery...,businessNews,e promise of PeptiDream Inc's drug-discovery t...
4,London forex trader accuses Citigroup of unfai...,"A former top Citigroup trader, who was fired ...",businessNews,"former top Citigroup trader, who was fired in..."
5,German military refuses to take delivery of tw...,Germany's air force said on Wednesday it had d...,businessNews,rmany's air force said on Wednesday it had dec...
6,Airbus says A400M technical issues not safety ...,Airbus said on Wednesday that technical issue...,businessNews,rbus said on Wednesday that technical issues ...
7,"Wall Street slips on trade worries, Hong Kong ...",Wall Street edged lower on Wednesday as Presid...,businessNews,ll Street edged lower on Wednesday as Presiden...
8,"Mediation between Bayer, plaintiffs seeks to c...",Mediation between Bayer and plaintiffs in the ...,businessNews,diation between Bayer and plaintiffs in the Un...
9,German auto lobby to pick utility manager Hild...,The German carmakers lobby VDA is set to appoi...,businessNews,e German carmakers lobby VDA is set to appoint...


In [15]:
news_data["short_description"][0]

'u Dhabi National Oil Co aims to exhaust its vast oil and gas reserves even as many consumers switch to cleaner sources of energy, a senior executive in the Gulf oil company said.'

In [16]:
corpus = news_data["short_description"]
vectorizer = CountVectorizer(min_df=1)
X = vectorizer.fit_transform(corpus).toarray()
print(X.shape)
X

(60, 838)


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [17]:
vectorizer.get_feature_names()[:25]
categories = news_data["category"].unique()
categories

array(['businessNews', 'technologyNews', 'sportsNews'], dtype=object)

In [18]:
category_dict = {value:index for index, value in enumerate(categories)}
category_dict

{'businessNews': 0, 'technologyNews': 1, 'sportsNews': 2}

In [19]:
news_data.head()

Unnamed: 0,title,description,category,short_description
0,"Oil is our gold and we aim to use all of it, A...",Abu Dhabi National Oil Co aims to exhaust its ...,businessNews,u Dhabi National Oil Co aims to exhaust its va...
1,Tesla CEO says will build Gigafactory 4 in 'Be...,Tesla will build its first European factory a...,businessNews,sla will build its first European factory and...
2,Brexit drove Tesla to pick Berlin over Britain...,Britain missed its chance to host the first Eu...,businessNews,itain missed its chance to host the first Euro...
3,Biotech-for-hire PeptiDream charts new path an...,The promise of PeptiDream Inc's drug-discovery...,businessNews,e promise of PeptiDream Inc's drug-discovery t...
4,London forex trader accuses Citigroup of unfai...,"A former top Citigroup trader, who was fired ...",businessNews,"former top Citigroup trader, who was fired in..."


In [20]:
results = news_data["category"].map(category_dict)
results

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    1
21    1
22    1
23    1
24    1
25    1
26    1
27    1
28    1
29    1
30    1
31    1
32    1
33    1
34    1
35    1
36    1
37    1
38    1
39    1
40    2
41    2
42    2
43    2
44    2
45    2
46    2
47    2
48    2
49    2
50    2
51    2
52    2
53    2
54    2
55    2
56    2
57    2
58    2
59    2
Name: category, dtype: int64

In [21]:
print("corpus size: %s" % len(vectorizer.get_feature_names()))

corpus size: 838


In [22]:
x_train,x_test, y_train,y_test = train_test_split(X, results, test_size=0.2, random_state=1, )
clf = MultinomialNB()
clf.fit(x_train, y_train)
clf.score(x_test, y_test)
y_test

39    1
41    2
2     0
48    2
50    2
44    2
33    1
35    1
40    2
27    1
24    1
53    2
Name: category, dtype: int64

In [23]:
clf.score(x_test, y_test)

0.8333333333333334

In [24]:
clf.predict(x_test)

array([1, 2, 1, 2, 2, 2, 1, 1, 2, 0, 1, 2], dtype=int64)

In [25]:
text = ["Who won the Superbowl?"]
vec_text = vectorizer.transform(text).toarray()
clf.predict(vec_text)[0]

2

In [26]:
category_dict

{'businessNews': 0, 'technologyNews': 1, 'sportsNews': 2}