# Project - Web Scraping & Data Preparation of News Articles Using KMeans Clustering

# Scrape data from the OpIndia news website

In [1]:
import requests
from bs4 import BeautifulSoup
# Address of the first "latest news" listing page.
url = "https://www.opindia.com/latest-news/"
# requests applies no timeout unless one is given; passing one keeps this cell
# from hanging forever if the server never answers.
data = requests.get(url, timeout=30)

In [2]:
data

<Response [200]>

In [3]:
# Parse the downloaded page body with the "html.parser" backend.
# NOTE(review): the later cells shown here never read this `soup` (fetch_article()
# downloads and parses each page itself) — confirm whether this cell is still needed.
soup = BeautifulSoup(data.content,"html.parser")

# Define a function that fetches the article titles from a page URL

In [4]:
def fetch_article(url, timeout=30):
    """Return the article titles found on one listing page.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without it, ``requests`` would wait indefinitely.

    Returns
    -------
    list of str
        The ``title`` attribute of the first link inside every
        ``<h3 class="entry-title td-module-title">`` heading, in page order.
        Headings that contain no link, or whose link has no ``title``
        attribute, are skipped.
    """
    data = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(data.content,"html.parser")
    articles = []
    for heading in soup.find_all("h3",class_ = ["entry-title td-module-title"]):
        link = heading.find('a')
        # Indexing a missing <a> (None) or a missing attribute would raise
        # TypeError / KeyError and abort the whole scrape, so such headings
        # are skipped instead.
        title = link.get('title') if link is not None else None
        if title is None:
            continue
        articles.append(title)
    return articles

# Build the list of page URLs (pages 2 to 20)

In [5]:
# Listing pages 2 through 20 (the upper bound of range is exclusive).
urllist = [
    f"https://www.opindia.com/latest-news/page/{page_no}/"
    for page_no in range(2, 21)
]

# Fetch the article titles from every page with `fetch_article`

In [6]:
# Flatten the per-page title lists into a single list, keeping page order.
all_articles = [
    title
    for page_url in urllist
    for title in fetch_article(page_url)
]

In [7]:
#all_articles

# Tokenization and Stemming

In [8]:
# The imports and the stemmer do not depend on the article being processed,
# so they are set up once here instead of once per article / once per token.
import re
from nltk.stem import PorterStemmer

ps = PorterStemmer()

# p_art[k] is the normalised form of all_articles[k]: upper-cased, stripped of
# every character except A-Z, 0-9 and space, then stemmed word by word.
p_art = []
for title in all_articles:
    cleaned = re.sub("[^A-Z0-9 ]", "", title.upper())
    # Splitting on a single space keeps the empty tokens that consecutive spaces
    # produce, and every stem is prefixed with a space, so each entry starts
    # with a leading space and preserves the title's spacing.
    stems = (ps.stem(token).upper() for token in cleaned.split(" "))
    p_art.append("".join(" " + stem for stem in stems))

# Create a TF-IDF Vectorizer

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit the vectorizer on the stemmed titles and compute their TF-IDF weights.
tf = TfidfVectorizer()
tfidf_sparse = tf.fit_transform(p_art)
# Convert the result to a dense NumPy array for the clustering step.
A = tfidf_sparse.toarray()

In [10]:
A

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

# K-Means Clustering

In [11]:
from sklearn.cluster import KMeans
# KMeans starts from randomly chosen centroids, so without a fixed random_state
# the cluster ids (and any id -> name mapping built on them) change on every run.
# n_init is set explicitly because its default differs between scikit-learn releases.
km = KMeans(n_clusters=5, n_init=10, random_state=42)
# fit() returns the fitted estimator, so cl_res refers to the same object as km.
cl_res = km.fit(A)

# Create a DataFrame of the articles and their cluster labels

In [12]:
import pandas as pd

In [13]:
# One row per processed title, paired with the cluster id assigned by KMeans.
Q = pd.DataFrame({"Article": p_art, "Cluster": cl_res.labels_})

In [14]:
Q.head()

Unnamed: 0,Article,Cluster
0,AS KARNATAKA RELEAS FINAL COPI OF REVIS HISTO...,3
1,DONT CANCEL PROJECT BECAUS OF CORRUPT ALLEG B...,0
2,MADRASA ASSAMES HISTORI AND RAHUL GANDHI HERE...,3
3,WATER CRISI IN MAHARASHTRA AURANGABAD SUPPLI ...,0
4,ASSAM BULLDOZ IN ACTION 5 ILLEG HOUS OF RIOTE...,3


# Create cluster names

In [15]:
# Display name for each KMeans cluster id.
# NOTE(review): cluster ids are arbitrary integers, so these names are only
# meaningful if they were chosen after inspecting each cluster's articles — confirm.
E = {
    1: "politics",
    2: "religion",
    3: "sports",
    4: "entertainment",
    0: "geopolitics",
}

In [16]:
# Translate each row's cluster id into its category name via E.
R = [E[cluster_id] for cluster_id in Q.Cluster]

Q['category'] = R

In [17]:
Q

Unnamed: 0,Article,Cluster,category
0,AS KARNATAKA RELEAS FINAL COPI OF REVIS HISTO...,3,sports
1,DONT CANCEL PROJECT BECAUS OF CORRUPT ALLEG B...,0,geopolitics
2,MADRASA ASSAMES HISTORI AND RAHUL GANDHI HERE...,3,sports
3,WATER CRISI IN MAHARASHTRA AURANGABAD SUPPLI ...,0,geopolitics
4,ASSAM BULLDOZ IN ACTION 5 ILLEG HOUS OF RIOTE...,3,sports
...,...,...,...
1971,INDIA RAIS SECUR CONCERN PAKISTAN MISUS THE K...,3,sports
1972,KID MAY THROW STONE WHILE PLAY JAMA MASJID CO...,3,sports
1973,ASSAM COURT IN KOKRAJHAR REJECT GUJARAT MLA J...,3,sports
1974,TESLA CHIEF ELON MUSK SECUR 465 BILLION FUND ...,3,sports
