## Text Classification With Machine Learning In Python
+ News Category Classifier
+ - Aim: To classify and predict news feeds into their appropriate categories
+ Credit to Roshan

In [1]:
### EDA packages and Web
import pandas as pd
import lxml
import requests

In [2]:
# Reuters section landing pages. Not referenced by any later cell visible in this notebook.
_SITE_ROOT = "https://www.reuters.com"
url_list = [
    _SITE_ROOT + section
    for section in ("/news/health", "/politics", "/finance", "/news/sports", "/news/technology")
]

# RSS feeds that are downloaded and parsed below, one per news category.
_FEED_ROOT = "http://feeds.reuters.com/reuters/"
feeds_list = [
    _FEED_ROOT + feed_name
    for feed_name in ("businessNews", "technologyNews", "sportsNews", "healthNews", "politicsNews")
]

In [3]:
# Using LXML 
from lxml import etree

In [4]:
# Scraping and Parsing Data From Feeds_list
# Each feed URL is downloaded and parsed into an lxml element; the parsed roots
# are collected in `datafeeds` in the same order as `feeds_list`.
datafeeds = []
for feed in feeds_list:
    # Bound the wait so a stalled connection cannot hang the notebook forever.
    response = requests.get(feed, timeout=30)
    # Fail loudly on HTTP errors instead of parsing an error page as if it were a feed.
    response.raise_for_status()
    xml_page = response.text
    # recover=True lets lxml skip over malformed markup instead of raising.
    parser = etree.XMLParser(recover=True, encoding='utf-8')
    root = etree.fromstring(xml_page.encode("utf-8"), parser=parser)
    # In recover mode the parser can yield no root at all; do not store None.
    if root is None:
        raise ValueError("No XML could be recovered from feed: %s" % feed)
    datafeeds.append(root)

In [5]:
# Function for Building Node
def print_tag(node):
    """Print ``node`` and up to its first 25 children in an XML-like layout.

    The opening tag of ``node`` is printed with its attributes and text,
    followed by one indented line per child (tag, attributes, text, closing
    tag), and finally the closing tag of ``node``. Nothing is returned.

    Parameters
    ----------
    node : element
        An element exposing ``tag``, ``attrib``, ``text`` and slicing over
        its children (lxml and xml.etree elements both qualify).
    """
    def _format_attributes(element):
        # .items() exists on lxml attribute proxies and on plain dicts alike,
        # whereas .iteritems() is missing from Python 3 dicts.
        return " ".join(["%s=%s" % (k, v) for k, v in element.attrib.items()])

    print("<%s %s>%s" % (node.tag, _format_attributes(node), node.text))
    # Cap the listing at 25 children to keep the output readable.
    for item in node[:25]:
        print("  <%s %s>%s</%s>" % (item.tag, _format_attributes(item), item.text, item.tag))
    print("</%s>" % node.tag)



In [6]:
# What we want to select
# Start from the element parsed from the first entry of feeds_list and print it
# together with its direct children.
general_node = datafeeds[0]
print_tag(general_node)

<rss version=2.0>

  <channel >
</channel>
</rss>


In [7]:
# Selecting Node
# Take the first child of the first feed's root element. Indexing from
# datafeeds[0] (rather than from general_node itself) keeps this cell
# idempotent: re-running it always selects the same node instead of
# descending one level further on every execution.
general_node = datafeeds[0][0]
print_tag(general_node)

<channel >

  <title >Reuters: Business News</title>
  <link >http://www.reuters.com</link>
  <description >Reuters.com is your source for breaking news, business, financial and investing news, including personal finance and stocks.  Reuters is the leading global provider of news, financial information and technology solutions to the world's media, financial institutions, businesses and individuals.</description>
  <image >
	</image>
  <language >en-us</language>
  <lastBuildDate >Sat, 14 Apr 2018 10:04:42 -0400</lastBuildDate>
  <copyright >All rights reserved. Users may download and print extracts of content from this website for their own personal and non-commercial use only. Republication or redistribution of Reuters content, including by framing or similar means, is expressly prohibited without the prior written consent of Reuters. Reuters and the Reuters sphere logo are registered trademarks or trademarks of the Reuters group of companies around the world. © Reuters 2018</copyrig

In [8]:
# Specific Selection of Item
# First <item> under the first child of the first feed's root. The path is
# spelled out from datafeeds[0] so the cell gives the same result no matter
# how many times (or in which order) the preceding cells were run.
general_node = datafeeds[0][0].xpath("item")[0]
print_tag(general_node)

<item >
		
  <title >Wall Street eyes earnings stabilizer after FAANG stocks wobble</title>
  <description >(Reuters) - Wall Street is hoping that first-quarter earnings growth and corporate forecasts are strong enough to bring the FAANG group of stocks back into favor and take the spotlight off worries that caused the recent sell-off in the high-flying group.<div class="feedflare">
<a href="http://feeds.reuters.com/~ff/reuters/businessNews?a=WBFIN23VpFE:YCyjiaDuEFA:yIl2AUoC8zA"><img src="http://feeds.feedburner.com/~ff/reuters/businessNews?d=yIl2AUoC8zA" border="0"></img></a> <a href="http://feeds.reuters.com/~ff/reuters/businessNews?a=WBFIN23VpFE:YCyjiaDuEFA:F7zBnMyn0Lo"><img src="http://feeds.feedburner.com/~ff/reuters/businessNews?i=WBFIN23VpFE:YCyjiaDuEFA:F7zBnMyn0Lo" border="0"></img></a> <a href="http://feeds.reuters.com/~ff/reuters/businessNews?a=WBFIN23VpFE:YCyjiaDuEFA:V_sGLiPBpWU"><img src="http://feeds.feedburner.com/~ff/reuters/businessNews?i=WBFIN23VpFE:YCyjiaDuEFA:V_sGLiP

In [9]:
# Grouping them  into List and Array
# Three parallel lists, one entry per <item> found across all parsed feeds.
title_list, description_list, category_list = [], [], []

for feed_root in datafeeds:
    for entry in feed_root.xpath("//item"):
        # Fields are read by child position: 0 and 1 are the title and
        # description (see the item printout above); 4 is taken to be the
        # category -- TODO confirm against the feed layout.
        for target, position in ((title_list, 0), (description_list, 1), (category_list, 4)):
            target.append(entry[position].text)
        



In [10]:
# Putting Data Into DataFrame
# One row per article; columns are built from the three parallel lists.
news_df = pd.DataFrame(
    {
        "Title": title_list,
        "Description": description_list,
        "Category": category_list,
    },
    columns=["Title", "Description", "Category"],
)
print(news_df.shape[0])
news_df

50


Unnamed: 0,Title,Description,Category
0,Wall Street eyes earnings stabilizer after FAA...,(Reuters) - Wall Street is hoping that first-q...,businessNews
1,"Musk insists Tesla does not need more capital,...",(Reuters) - Tesla Inc will be profitable in t...,businessNews
2,Trump says U.S. will only rejoin Pacific trade...,WASHINGTON/TOKYO (Reuters) - U.S. President Do...,businessNews
3,Wells Fargo faces $1 billion fine from loan ab...,(Reuters) - Two U.S. regulators have proposed ...,businessNews
4,Facebook CEO's compensation jumps to $8.9 mill...,(Reuters) - Facebook Inc Chief Executive Mark...,businessNews
5,"GE books $4.2 billion charge, restates earning...",NEW YORK (Reuters) - General Electric Co said...,businessNews
6,U.S. bank executives see delayed boost from ta...,NEW YORK (Reuters) - Banks have not reaped the...,businessNews
7,U.S. lowers NAFTA key auto content demand: aut...,MEXICO CITY/WASHINGTON (Reuters) - U.S. trade ...,businessNews
8,German interior minister rejects union's six p...,BERLIN (Reuters) - German Interior Minister Ho...,businessNews
9,Bratz maker's CEO bids $890 million for Toys '...,(Reuters) - Bratz doll maker MGA Entertainment...,businessNews


In [11]:

news_df["Description"].head()

0    (Reuters) - Wall Street is hoping that first-q...
1    (Reuters) - Tesla Inc  will be profitable in t...
2    WASHINGTON/TOKYO (Reuters) - U.S. President Do...
3    (Reuters) - Two U.S. regulators have proposed ...
4    (Reuters) - Facebook Inc  Chief Executive Mark...
Name: Description, dtype: object

In [12]:
print(news_df["Description"][0])

(Reuters) - Wall Street is hoping that first-quarter earnings growth and corporate forecasts are strong enough to bring the FAANG group of stocks back into favor and take the spotlight off worries that caused the recent sell-off in the high-flying group.<div class="feedflare">
<a href="http://feeds.reuters.com/~ff/reuters/businessNews?a=WBFIN23VpFE:YCyjiaDuEFA:yIl2AUoC8zA"><img src="http://feeds.feedburner.com/~ff/reuters/businessNews?d=yIl2AUoC8zA" border="0"></img></a> <a href="http://feeds.reuters.com/~ff/reuters/businessNews?a=WBFIN23VpFE:YCyjiaDuEFA:F7zBnMyn0Lo"><img src="http://feeds.feedburner.com/~ff/reuters/businessNews?i=WBFIN23VpFE:YCyjiaDuEFA:F7zBnMyn0Lo" border="0"></img></a> <a href="http://feeds.reuters.com/~ff/reuters/businessNews?a=WBFIN23VpFE:YCyjiaDuEFA:V_sGLiPBpWU"><img src="http://feeds.feedburner.com/~ff/reuters/businessNews?i=WBFIN23VpFE:YCyjiaDuEFA:V_sGLiPBpWU" border="0"></img></a>
</div><img src="http://feeds.feedburner.com/~r/reuters/businessNews/~4/WBFIN23Vp

In [13]:
%%HTML
NEW YORK (Reuters) - Financial stocks led a drop on Wall Street on Friday, as results from big banks failed to enthuse and geopolitical tensions in Syria and Russia further unnerved investors.<div class="feedflare">
<a href="http://feeds.reuters.com/~ff/reuters/businessNews?a=G0SWmsbH8M8:33ar9b0m6EQ:yIl2AUoC8zA"><img src="http://feeds.feedburner.com/~ff/reuters/businessNews?d=yIl2AUoC8zA" border="0"></img></a> <a href="http://feeds.reuters.com/~ff/reuters/businessNews?a=G0SWmsbH8M8:33ar9b0m6EQ:F7zBnMyn0Lo"><img src="http://feeds.feedburner.com/~ff/reuters/businessNews?i=G0SWmsbH8M8:33ar9b0m6EQ:F7zBnMyn0Lo" border="0"></img></a> <a href="http://feeds.reuters.com/~ff/reuters/businessNews?a=G0SWmsbH8M8:33ar9b0m6EQ:V_sGLiPBpWU"><img src="http://feeds.feedburner.com/~ff/reuters/businessNews?i=G0SWmsbH8M8:33ar9b0m6EQ:V_sGLiPBpWU" border="0"></img></a>
</div><img src="http://feeds.feedburner.com/~r/reuters/businessNews/~4/G0SWmsbH8M8" height="1" width="1" alt=""/>

### Extracting text from the description

In [14]:
# Create A Short Description
def _short_description(raw):
    """Return the plain-text lead of a feed description.

    The text after the first " - " separator and before the first "<" is
    returned. When a piece is missing the function degrades gracefully
    instead of slicing with str.find()'s -1 sentinel:

    * no "<" present     -> the text runs to the end of the string
    * no " - " present   -> the text starts at the beginning of the string
    * ``raw`` not a str  -> an empty string (e.g. an element with no text)
    """
    if not isinstance(raw, str):
        return ""
    # Drop everything from the first tag onwards (the trailing HTML markup).
    plain = raw.split("<", 1)[0]
    # Strip the leading "PLACE (Reuters) - " style prefix when there is one.
    _, separator, body = plain.partition(" - ")
    return body if separator else plain


news_df["Short_description"] = [_short_description(item) for item in news_df["Description"]]
news_df

Unnamed: 0,Title,Description,Category,Short_description
0,Wall Street eyes earnings stabilizer after FAA...,(Reuters) - Wall Street is hoping that first-q...,businessNews,Wall Street is hoping that first-quarter earni...
1,"Musk insists Tesla does not need more capital,...",(Reuters) - Tesla Inc will be profitable in t...,businessNews,Tesla Inc will be profitable in the third and...
2,Trump says U.S. will only rejoin Pacific trade...,WASHINGTON/TOKYO (Reuters) - U.S. President Do...,businessNews,U.S. President Donald Trump said the United St...
3,Wells Fargo faces $1 billion fine from loan ab...,(Reuters) - Two U.S. regulators have proposed ...,businessNews,Two U.S. regulators have proposed Wells Fargo ...
4,Facebook CEO's compensation jumps to $8.9 mill...,(Reuters) - Facebook Inc Chief Executive Mark...,businessNews,Facebook Inc Chief Executive Mark Zuckerberg'...
5,"GE books $4.2 billion charge, restates earning...",NEW YORK (Reuters) - General Electric Co said...,businessNews,General Electric Co said on Friday it took a ...
6,U.S. bank executives see delayed boost from ta...,NEW YORK (Reuters) - Banks have not reaped the...,businessNews,Banks have not reaped the full benefit of U.S....
7,U.S. lowers NAFTA key auto content demand: aut...,MEXICO CITY/WASHINGTON (Reuters) - U.S. trade ...,businessNews,U.S. trade negotiators have significantly soft...
8,German interior minister rejects union's six p...,BERLIN (Reuters) - German Interior Minister Ho...,businessNews,German Interior Minister Horst Seehofer said o...
9,Bratz maker's CEO bids $890 million for Toys '...,(Reuters) - Bratz doll maker MGA Entertainment...,businessNews,Bratz doll maker MGA Entertainment said on Fri...


In [15]:
news_df

Unnamed: 0,Title,Description,Category,Short_description
0,Wall Street eyes earnings stabilizer after FAA...,(Reuters) - Wall Street is hoping that first-q...,businessNews,Wall Street is hoping that first-quarter earni...
1,"Musk insists Tesla does not need more capital,...",(Reuters) - Tesla Inc will be profitable in t...,businessNews,Tesla Inc will be profitable in the third and...
2,Trump says U.S. will only rejoin Pacific trade...,WASHINGTON/TOKYO (Reuters) - U.S. President Do...,businessNews,U.S. President Donald Trump said the United St...
3,Wells Fargo faces $1 billion fine from loan ab...,(Reuters) - Two U.S. regulators have proposed ...,businessNews,Two U.S. regulators have proposed Wells Fargo ...
4,Facebook CEO's compensation jumps to $8.9 mill...,(Reuters) - Facebook Inc Chief Executive Mark...,businessNews,Facebook Inc Chief Executive Mark Zuckerberg'...
5,"GE books $4.2 billion charge, restates earning...",NEW YORK (Reuters) - General Electric Co said...,businessNews,General Electric Co said on Friday it took a ...
6,U.S. bank executives see delayed boost from ta...,NEW YORK (Reuters) - Banks have not reaped the...,businessNews,Banks have not reaped the full benefit of U.S....
7,U.S. lowers NAFTA key auto content demand: aut...,MEXICO CITY/WASHINGTON (Reuters) - U.S. trade ...,businessNews,U.S. trade negotiators have significantly soft...
8,German interior minister rejects union's six p...,BERLIN (Reuters) - German Interior Minister Ho...,businessNews,German Interior Minister Horst Seehofer said o...
9,Bratz maker's CEO bids $890 million for Toys '...,(Reuters) - Bratz doll maker MGA Entertainment...,businessNews,Bratz doll maker MGA Entertainment said on Fri...


In [16]:
# Save to A CSV File
# The path is relative, so the file lands in the kernel's current working directory.
news_df.to_csv("ReutersNewsDataFinal2.csv")

###  Feature Extraction

In [17]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# sklearn.cross_validation was removed in scikit-learn 0.20; train_test_split
# now lives in sklearn.model_selection. The fallback only matters on very old
# installations that predate model_selection.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split



In [18]:
# Bag-of-words features: one row per short description, one column per token.
corpus = news_df["Short_description"]
vectorizer = CountVectorizer(min_df=1)
# Learn the vocabulary and count tokens, then densify the result into X.
term_counts = vectorizer.fit_transform(corpus)
X = term_counts.toarray()


In [19]:
# Shape of Our Data
print(X.shape)


(50, 846)


In [None]:
# Features
X

In [20]:
# Names of Vectorized Features
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides.
_get_names = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
# Convert to plain str so the preview looks the same whether a list or an array came back.
[str(name) for name in _get_names()[:25]]

['000',
 '10',
 '100',
 '11',
 '11th',
 '2016',
 '2017',
 '2020',
 '24',
 '25th',
 '30',
 '33',
 '4x100m',
 '53',
 '64',
 '65',
 '890',
 'abortion',
 'about',
 'absence',
 'abuses',
 'access',
 'according',
 'accounting',
 'achieved']

In [21]:
# Building a map of categories: each distinct category label gets an integer id,
# because the classifier is trained on numeric targets.
categories = news_df["Category"].unique()
category_dict = dict(zip(categories, range(len(categories))))
# Numeric label for every row, looked up through the map above.
results = news_df["Category"].map(category_dict)
category_dict



{'businessNews': 0,
 'healthNews': 3,
 'politicsNews': 4,
 'sportsNews': 2,
 'technologyNews': 1}

In [22]:
# vocabulary_ maps every learned token to its column index, so its length is the
# number of features; unlike get_feature_names() it exists in all scikit-learn versions.
print("corpus size: %s" % len(vectorizer.vocabulary_))

corpus size: 846


In [23]:
# Labels
results

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    1
11    1
12    1
13    1
14    1
15    1
16    1
17    1
18    1
19    1
20    2
21    2
22    2
23    2
24    2
25    2
26    2
27    2
28    2
29    2
30    3
31    3
32    3
33    3
34    3
35    3
36    3
37    3
38    3
39    3
40    4
41    4
42    4
43    4
44    4
45    4
46    4
47    4
48    4
49    4
Name: Category, dtype: int64

In [24]:
# Split Dataset into Test and Training Data
# 20% of the rows are held out for testing; random_state pins the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    results,
    test_size=0.2,
    random_state=1,
)

In [26]:
# Using the Multinomial Naive Bayes classifier
# Fit on the training split only; x_test / y_test are kept for scoring.
clf = MultinomialNB()
clf.fit(x_train, y_train)


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [27]:
print("Accuracy of our model score: ",clf.score(x_test, y_test))

Accuracy of our model score:  0.5


In [28]:
clf.predict(x_test)

array([4, 3, 4, 1, 4, 0, 4, 2, 1, 0])

In [29]:
category_dict

{'businessNews': 0,
 'healthNews': 3,
 'politicsNews': 4,
 'sportsNews': 2,
 'technologyNews': 1}

In [30]:
### Sample Prediction of Category of News

In [31]:
text = ["Russian Hackers hijack US Election "]

In [32]:
# Vectorize and Transform text
vec_text = vectorizer.transform(text).toarray()


In [33]:
# Predict
clf.predict(vec_text)

array([1])

In [None]:
#category_dict.keys()[category_dict.values().index(clf.predict(vec_text)[0])]

In [39]:
# A function to do it
def newscategorifier(a):
    """Print the predicted news category for the text ``a``.

    The text is vectorized with the notebook-level ``vectorizer`` and
    classified with the notebook-level ``clf``. One human-readable label is
    printed; nothing is returned.

    Parameters
    ----------
    a : str
        The headline or sentence to classify.
    """
    # Label ids follow category_dict as displayed in this notebook:
    # businessNews=0, technologyNews=1, sportsNews=2, healthNews=3;
    # any other id falls through to the politics label.
    labels = {
        0: "Business News",
        1: "Technology News",
        2: "Sport News",
        3: "Health News",
    }
    # Vectorize the argument itself, not any notebook-level variable.
    transform_vect = vectorizer.transform([a]).toarray()
    # Predict once and reuse the result for the label lookup.
    predicted = clf.predict(transform_vect)[0]
    print(labels.get(int(predicted), "Politcs News"))

In [41]:
newscategorifier("Python and Julia for Computer Scientist")

Technology News


In [35]:
## Save Our Model to be used
# sklearn.externals.joblib was removed in scikit-learn 0.23; joblib is now used
# as a standalone package. Fall back to the bundled copy on older installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [42]:
# Open the output file for the serialized classifier in binary write mode.
NaiveBayModel = open("newsclassifierNBmodel.pkl","wb")

In [43]:
# Serialize the trained classifier into the open file, then always close the
# handle so the data is flushed to disk and the file descriptor is released.
try:
    joblib.dump(clf, NaiveBayModel)
finally:
    NaiveBayModel.close()