# Web scraping from Indian Express website

In [1]:
import requests
from bs4 import BeautifulSoup
from newspaper import Article
import pandas as pd
import numpy as np
import re

In [2]:
def scraper(url):
    c=0;article_links=[]
    r = requests.get(url);soup = BeautifulSoup(r.content, 'html5lib')
    
    while(soup.find('div',{'class':'pagination'}).find('a',{'class':'page-numbers'}) and c<2):
    #for page navigation ,however we have limited manumber of pages visited to 2
        
        articles=soup.find('div',{'class':'nation'}).findAll('div',{'class':'articles'})
        for news in articles:
            data={}
            article_links.append(news.find('a')['href'])

        url=soup.find('div',{'class':'pagination'}).find('a',{'class':'next page-numbers'})['href']
        r = requests.get(url);soup = BeautifulSoup(r.content, 'html5lib')
        c+=1
    return article_links

list=['world','lifestyle','business','entertainment']

df=[]
for l in list:
    url = "https://indianexpress.com/section/"+l+"/"
    links=scraper(url)
    for link in links:dat={'links':link,'Category':l};df.append(dat)

for l in df:
    article = Article(l['links'], language="en")
    article.download();article.parse();article.nlp()
    l['Title']=article.title
    l['Text']=article.text
    l['keywords']=article.keywords
    l['date']=article.publish_date

In [7]:
df=pd.DataFrame(df)
df.to_csv('News.csv')#saving scraped data to News.csv
df.head()

Unnamed: 0,Category,Text,Title,date,keywords,links
0,world,Dr Lorna Green was a medical director at NewYo...,"New York doctor, who resumed work after Covid-...",2020-04-28 16:10:23+05:30,"[doctor, york, breen, hospital, dr, takes, cov...",https://indianexpress.com/article/world/lorna-...
1,world,The Spain government has announced that if the...,"Lifting lockdowns, European nations go their o...",2020-04-28 15:05:07+05:30,"[lifting, open, way, children, prime, restrict...",https://indianexpress.com/article/world/liftin...
2,world,"In this May 21, 1987, then North Korean Presid...",A look at past disappearances of North Korean ...,2020-04-28 15:00:27+05:30,"[jong, sung, leaders, koreas, officials, disap...",https://indianexpress.com/article/world/north-...
3,world,Japan is also in the same scramble for protect...,"On Japan’s stretched frontline, doctors and nu...",2020-04-28 15:00:25+05:30,"[icu, stretched, hospital, japans, frontline, ...",https://indianexpress.com/article/world/on-jap...
4,world,"US coronavirus deaths surpassed 56,000 on Mond...","US coronavirus deaths projected at over 74,000...",2020-04-28 14:52:52+05:30,"[deaths, death, projected, 74000, according, u...",https://indianexpress.com/article/world/us-cor...


# Converting scraped data into UCI dataset format for analysis

In [21]:
import datetime 
import calendar
from textblob import TextBlob
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 

def findDay(dt): 
    d_truncated = datetime.date(dt.year, dt.month, dt.day)
    return calendar.day_name[d_truncated.weekday()]

In [None]:
def cleaner(text):#to format the input text
    text=' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", text).split())
    text=text.lower()
    return text

def rate_words(text):#to find rate_positive_words ,polarity ,min/max_negative_polarity and ...
    text=cleaner(text)
    w_p=[];w_n=[];l1=[];l2=[]
    text=word_tokenize(text)
    for word in text:
        t=TextBlob(word).sentiment.polarity
        if(t>0):w_p.append(t)
        elif(t<0):w_n.append(t)
    le1=len(w_p);le2=len(w_n);let=len(text)
    if(w_p):l1=[le1/let,le1/(le2+le1),sum(w_p)/le1,min(w_p),max(w_p)]
    if(w_n):l2=[le2/let,le2/(le2+le1),sum(w_n)/le2,min(w_n),max(w_n)]
    return l1,l2

def token_counter(text):#to find n_tokens_content,non stop words and ...
    text=cleaner(text)
    text=word_tokenize(text)
    stop_words = set(stopwords.words('english')) 
    filtered_text = [w for w in text if not w in stop_words]
    n_non_stop_words=len(filtered_text)/len(text)
    n_unique_tokens=len(set(text))/len(text)
    n_non_stop_unique_tokens=len(set(filtered_text))/len(text)
    average = sum(len(word) for word in text) / len(text)
    return len(text),n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,average
    
def sentiment(text): # to find subjectivity and polaity 
    text=cleaner(text)
    analysis = TextBlob(text) 
    return (analysis.sentiment.polarity,analysis.sentiment.subjectivity)

### Using utility functions to convert data to UCI dataset format

In [41]:
dat_fr=[]
for ind in df.index: 
    dat={}
    tr,_,_,_,_=token_counter(df['Title'][ind])
    dat[' n_tokens_title']=tr
    
    a,b,c,d,e=token_counter(df['Text'][ind])
    dat[' n_tokens_content']=a;dat[' n_unique_tokens']=b
    dat[' n_non_stop_words']=c;dat[' n_non_stop_unique_tokens']=d;dat[' average_token_length']=e
    
    dat[' num_keywords']=len(df['keywords'][ind])
    
    cat=df['Category'][ind]
    if(cat=='lifestyle'):dat[' data_channel_is_lifestyle']=1
    if(cat=='business'):dat[' data_channel_is_bus']=1
    if(cat=='world'):dat[' data_channel_is_world']=1
    if(cat=='entertainment'):dat[' data_channel_is_entertainment']=1
        
    day=findDay(df['date'][ind])
    dat[' weekday_is_monday']=1 if(day[0]=='Monday') else 0
    dat[' weekday_is_tuesday']=1 if(day[0]=='Tuesday') else 0
    dat[' weekday_is_wednesday']=1 if(day[0]=='Wednesday') else 0
    dat[' weekday_is_thursday']=1 if(day[0]=='Thursday') else 0
    dat[' weekday_is_friday']=1 if(day[0]=='Friday') else 0
    dat[' weekday_is_saturday']=1 if(day[0]=='Saturday') else 0
    dat[' weekday_is_sunday']=1 if(day[0]=='Sunday') else 0
    dat[' is_weekend']=1 if(day[0]=='Saturday' or day[0]=='Sunday') else 0
   
    x,y=sentiment(df['Text'][ind])
    dat[' global_subjectivity']=y;dat[' global_sentiment_polarity']=x
    q,w=sentiment(df['Title'][ind])
    dat[' title_subjectivity']=w;dat[' title_sentiment_polarity']=q
    
    l1,l2=rate_words(df['Text'][ind])
    
    if(l1):
        dat[' global_rate_positive_words']=l1[0];dat[' rate_positive_words']=l1[1]
        dat[' avg_positive_polarity']=l1[2];dat[' min_positive_polarity']=l1[3]
        dat[' max_positive_polarity']=l1[4]
    
    if(l2):
        dat[' global_rate_negative_words']=l2[0];dat[' rate_negative_words']=l2[1]
        dat[' avg_negative_polarity']=l2[2];dat[' min_negative_polarity']=l2[3]
        dat[' max_negative_polarity']=l2[4]
    
    dat_fr.append(dat)

In [42]:
dat_fr=pd.DataFrame(dat_fr)
dat_fr.head()

Unnamed: 0,average_token_length,avg_negative_polarity,avg_positive_polarity,data_channel_is_bus,data_channel_is_entertainment,data_channel_is_lifestyle,data_channel_is_world,global_rate_negative_words,global_rate_positive_words,global_sentiment_polarity,...,rate_positive_words,title_sentiment_polarity,title_subjectivity,weekday_is_friday,weekday_is_monday,weekday_is_saturday,weekday_is_sunday,weekday_is_thursday,weekday_is_tuesday,weekday_is_wednesday
0,4.453368,-0.173333,0.305966,,,,1.0,0.012953,0.041451,0.1259,...,0.761905,0.368182,0.727273,0,0,0,0,0,1,0
1,4.989575,-0.20156,0.343895,,,,1.0,0.020048,0.028067,0.06122,...,0.583333,0.3,0.5,0,0,0,0,0,1,0
2,4.768786,-0.26707,0.317523,,,,1.0,0.029865,0.022158,-0.032316,...,0.425926,-0.25,0.25,0,0,0,0,0,1,0
3,4.944444,-0.270859,0.339461,,,,1.0,0.017504,0.028158,0.047909,...,0.616667,-0.05,0.0,0,0,0,0,0,1,0
4,4.85614,-0.251852,0.323636,,,,1.0,0.010526,0.038596,0.127475,...,0.785714,0.0,0.0,0,0,0,0,0,1,0


# Predictive model based on UCI dataset

In [44]:
database=pd.read_csv('UCI dataset\\OnlineNewsPopularity\\OnlineNewsPopularity.csv')
target=database[' shares']
l1=['url',' data_channel_is_tech',' data_channel_is_socmed',' num_imgs',' num_videos',' self_reference_min_shares',
    ' self_reference_max_shares',' self_reference_avg_sharess',' LDA_00',' LDA_01',' LDA_02',' LDA_03',' LDA_04',
    ' num_self_hrefs',' num_hrefs',' shares',' kw_min_max', ' kw_max_max', ' kw_avg_max', ' kw_min_avg',
    ' kw_max_avg', ' kw_avg_avg',' kw_min_min', ' kw_max_min', ' kw_avg_min',' abs_title_subjectivity', 
    ' abs_title_sentiment_polarity',' timedelta']
database=database.drop(columns=l1)#droping features which may not be significant in our case

database.to_csv('UCI_dataset.csv')
database.head()

Unnamed: 0,n_tokens_title,n_tokens_content,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,average_token_length,num_keywords,data_channel_is_lifestyle,data_channel_is_entertainment,data_channel_is_bus,...,rate_positive_words,rate_negative_words,avg_positive_polarity,min_positive_polarity,max_positive_polarity,avg_negative_polarity,min_negative_polarity,max_negative_polarity,title_subjectivity,title_sentiment_polarity
0,12.0,219.0,0.663594,1.0,0.815385,4.680365,5.0,0.0,1.0,0.0,...,0.769231,0.230769,0.378636,0.1,0.7,-0.35,-0.6,-0.2,0.5,-0.1875
1,9.0,255.0,0.604743,1.0,0.791946,4.913725,4.0,0.0,0.0,1.0,...,0.733333,0.266667,0.286915,0.033333,0.7,-0.11875,-0.125,-0.1,0.0,0.0
2,9.0,211.0,0.57513,1.0,0.663866,4.393365,6.0,0.0,0.0,1.0,...,0.857143,0.142857,0.495833,0.1,1.0,-0.466667,-0.8,-0.133333,0.0,0.0
3,9.0,531.0,0.503788,1.0,0.665635,4.404896,7.0,0.0,1.0,0.0,...,0.666667,0.333333,0.385965,0.136364,0.8,-0.369697,-0.6,-0.166667,0.0,0.0
4,13.0,1072.0,0.415646,1.0,0.54089,4.682836,7.0,0.0,0.0,0.0,...,0.860215,0.139785,0.411127,0.033333,1.0,-0.220192,-0.5,-0.05,0.454545,0.136364


In [57]:
from sklearn.model_selection import train_test_split
#splitting of dataset for training and validation
x_train, x_test, y_train, y_test = train_test_split(database,target, test_size=0.15, random_state=0)

from sklearn.ensemble import RandomForestRegressor
regr = RandomForestRegressor(random_state=0,n_estimators=200)
regr.fit(x_train,y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=200,
                      n_jobs=None, oob_score=False, random_state=0, verbose=0,
                      warm_start=False)

In [74]:
pred=regr.predict(x_test)
y_pred=regr.predict(x_train)
data={'Pred:':y_pred,'Actual:':y_train}
d1=pd.DataFrame(data)
d1.head()

Unnamed: 0,Pred:,Actual:
0,3094.45,2800
1,2373.42,1900
2,4334.785,2800
3,6639.385,7900
4,2217.815,1200


# Using model on scraped data

In [82]:
df=pd.read_csv("News.csv")
dat_fr=dat_fr.fillna(0)
prediction=regr.predict(dat_fr)
data={'Title':df['Title'],'Shares ':prediction}
d2=pd.DataFrame(data)
d2.head(100)

Unnamed: 0,Title,Shares
0,"New York doctor, who resumed work after Covid-...",28518.450
1,"Lifting lockdowns, European nations go their o...",23942.635
2,A look at past disappearances of North Korean ...,24962.290
3,"On Japan’s stretched frontline, doctors and nu...",22466.270
4,"US coronavirus deaths projected at over 74,000...",21554.260
5,"Conflict, disasters sparked 50.8 mn internal d...",19129.955
6,COVID-19: over 2 million people download conta...,19880.930
7,Health officials ready new guidelines as restr...,23286.390
8,Ohio’s mail-in primary tests voting during vir...,23683.815
9,Covid-19: UK sets up new insurance scheme for ...,25874.225


In [84]:
average=sum(target)/len(target)
print("Average shares from UCI dataset",average)
print("Average shares from scraped data",sum(prediction)/len(prediction))

Average shares from UCI dataset 3395.3801836343455
Average shares from scraped data 22937.23990625


### It is clearly seen the virality(avg) of the scraped data is much higher than UCI dataset indicating that our scraped data is biased (due to limitations of resources) ,data has been scraped from top pages from website resulting in higher share