In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

%matplotlib inline
from urllib.request import urlopen

In [3]:
urls_list=[ url links separated by commas] #list of links to work on

In [4]:
def datasetPrepare(urls_list):
    news_data = []
    for url in urls_list:
        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:55.0) Gecko/20100101 Firefox/55.0',}
        data = requests.get(url,headers=headers)
        soup = BeautifulSoup(data.content, 'html.parser')
        headlines = soup.find_all('h1', class_=['entry-title'])
        article = soup.find_all('div', class_=['td-post-content'])
        news_articles = [{'news_headline': headlines[0].text,
                  'news_article': article[0].text}]
        news_data.extend(news_articles) 
    df =  pd.DataFrame(news_data)
    df = df[['news_headline', 'news_article']]
    return df

In [5]:
news_df = datasetPrepare(urls_list)

In [7]:
news_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170 entries, 0 to 169
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   news_headline  170 non-null    object
 1   news_article   170 non-null    object
dtypes: object(2)
memory usage: 2.8+ KB


In [None]:
news_df.head(5)

# Text Wrangling and Pre-processing

In [17]:
import spacy
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
import re
import unicodedata

In [18]:
nlp = spacy.load('en_core_web_sm')

In [19]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\maheg\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [20]:
tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')
stopword_list.remove('no')
stopword_list.remove('not')

## Remove HTML tags

In [21]:
def strip_html_tags(text):
    soup = BeautifulSoup(text, "html.parser")
    stripped_text = soup.get_text()
    return stripped_text

strip_html_tags('<html><h2>Some important text</h2></html>')

'Some important text'

## Remove accented characters

In [22]:
def remove_accented_chars(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text

remove_accented_chars('Sómě Áccěntěd těxt')

'Some Accented text'

## Remove special characters

In [23]:
def remove_special_characters(text, remove_digits=False):
    pattern = r'[^a-zA-z0-9\s]' if not remove_digits else r'[^a-zA-z\s]'
    text = re.sub(pattern, '', text)
    return text

In [25]:
remove_special_characters("Well this was fun! What do you think? 123#@!", remove_digits=True)

'Well this was fun What do you think '

## Text lemmatization

In [24]:
def lemmatize_text(text):
    text = nlp(text)
    text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
    return text

In [26]:
lemmatize_text("My system keeps crashing! his crashed yesterday, ours crashes daily")

'my system keep crash ! his crashed yesterday , ours crash daily'

## Text stemming

In [27]:
def simple_stemmer(text):
    ps = nltk.porter.PorterStemmer()
    text = ' '.join([ps.stem(word) for word in text.split()])
    return text

simple_stemmer("My system keeps crashing his crashed yesterday, ours crashes daily")

'my system keep crash hi crash yesterday, our crash daili'

## Remove stopwords

In [28]:
def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text

remove_stopwords("The, and, if are stopwords, computer is not")

', , stopwords , computer not'

## Building a text normalizer

In [29]:
def normalize_corpus(corpus, html_stripping=True, contraction_expansion=True,
                     accented_char_removal=True, text_lower_case=True, 
                     text_lemmatization=True, special_char_removal=True, 
                     stopword_removal=True, remove_digits=True):
    
    normalized_corpus = []
    # normalize each document in the corpus
    for doc in corpus:
        # strip HTML
        if html_stripping:
            doc = strip_html_tags(doc)
        # remove accented characters
        if accented_char_removal:
            doc = remove_accented_chars(doc)
        # lowercase the text    
        if text_lower_case:
            doc = doc.lower()
        # remove extra newlines
        doc = re.sub(r'[\r|\n|\r\n]+', ' ',doc)
        # lemmatize text
        if text_lemmatization:
            doc = lemmatize_text(doc)
        # remove special characters and\or digits    
        if special_char_removal:
            # insert spaces between special characters to isolate them    
            special_char_pattern = re.compile(r'([{.(-)!}])')
            doc = special_char_pattern.sub(" \\1 ", doc)
            doc = remove_special_characters(doc, remove_digits=remove_digits)  
        # remove extra whitespace
        doc = re.sub(' +', ' ', doc)
        # remove stopwords
        if stopword_removal:
            doc = remove_stopwords(doc, is_lower_case=text_lower_case)
            
        normalized_corpus.append(doc)
        
    return normalized_corpus

In [39]:
news_df["URL_ID"]=[x for x in range(1,len(news_df["news_headline"])+1)]

In [46]:
news_df.columns

Index(['news_article', 'URL_ID', 'news_headline'], dtype='object')

In [48]:
temp_cols=news_df.columns.tolist()
temp_cols

['URL_ID', 'news_headline', 'news_article']

In [None]:
news_df=news_df[temp_cols]
news_df

## Pre-process and normalize news articles

In [52]:
news_df['full_text'] = news_df["news_headline"].map(str)+ '. ' + news_df["news_article"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  news_df['full_text'] = news_df["news_headline"].map(str)+ '. ' + news_df["news_article"]


In [None]:
news_df['clean_text'] = normalize_corpus(news_df['full_text'])
norm_corpus = list(news_df['clean_text'])
news_df.iloc[1][['full_text', 'clean_text']].to_dict()

In [58]:
news_df.iloc[1]

URL_ID                                                           2
news_headline    How does AI help to monitor Retail Shelf watches?
news_article     \nWith increasing computing power and more dat...
full_text        How does AI help to monitor Retail Shelf watch...
clean_text       ai help monitor retail shelf watch increase co...
Name: 1, dtype: object

In [59]:
news_df.to_csv('news.csv', index=False, encoding='utf-8')

In [3]:
news_df = pd.read_csv('news.csv')

In [None]:
news_df

In [None]:
news_df1=news_df.copy()
news_df1

In [32]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
import spacy
from textstat.textstat import textstatistics

In [33]:
def positive_score(text):
    return SentimentIntensityAnalyzer().polarity_scores(text)["pos"]
positive_score(news_df1["clean_text"][1])

0.221

In [34]:
def negative_score(text):
    return SentimentIntensityAnalyzer().polarity_scores(text)["neg"]
negative_score(news_df1["clean_text"][1])

0.046

In [83]:
TextBlob(news_df1["clean_text"][1]).sentiment

Sentiment(polarity=0.13297950348970758, subjectivity=0.4593453485290221)

In [46]:
def polarity(text):
    return TextBlob(text).sentiment[0]
polarity(news_df1["clean_text"][1])

0.13297950348970758

In [47]:
def subjectivity(text):
    return TextBlob(text).sentiment[1]
subjectivity(news_df1["clean_text"][1])

0.4593453485290221

In [48]:
def break_sentences(text):
	nlp = spacy.load('en_core_web_sm')
	doc = nlp(text)
	return list(doc.sents)

def word_count(text):
	sentences = break_sentences(text)
	words = 0
	for sentence in sentences:
		words += len([token for token in sentence])
	return words

def sentence_count(text):
	sentences = break_sentences(text)
	return len(sentences)

def avg_sentence_length(text):
    words = word_count(text)
    sentences = sentence_count(text)
    average_sentence_length = float(words / sentences)
    return average_sentence_length
avg_sentence_length((news_df1["clean_text"][1]))

195.5

In [50]:
def difficult_words(text):
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(text)
    # Find all words in the text
    words = []
    sentences = break_sentences(text)
    for sentence in sentences:
        words += [str(token) for token in sentence]
    diff_words_set = set()
    for word in words:
        syllable_count = syllables_count(word)
        if word not in nlp.Defaults.stop_words and syllable_count >= 2:
            diff_words_set.add(word)
    return len(diff_words_set)
difficult_words((news_df1["clean_text"][1]))

162

In [51]:
#Percentage of complex or difficult words
def per_comp_words(text):
    per_diff_words = (difficult_words(text) / word_count(text) * 100)
    return round((per_diff_words),2)
per_comp_words(news_df1["clean_text"][1])

41.43

In [52]:
def Fog_index(text):
    per_diff_words = (difficult_words(text) / word_count(text) * 100)
    grade = 0.4 * (avg_sentence_length(text) + per_diff_words)
    return round(grade,2)
Fog_index((news_df1["clean_text"][1]))  #fog index

94.77

In [53]:
def avg_num_of_words_per_sentence(text):
    return word_count(text)/sentence_count(text)
avg_num_of_words_per_sentence((news_df1["clean_text"][1]))

195.5

In [54]:
word_count((news_df1["clean_text"][1]))

391

In [55]:
import spacy
from textstat.textstat import textstatistics

def syllables_count(word):
    return textstatistics().syllable_count(word)

def syllable_per_word(text):
    syllable = syllables_count(text)
    words = word_count(text)
    ASPW = float(syllable) / float(words)
    return round(ASPW, 1)
syllable_per_word((news_df1["clean_text"][1]))

2.0

In [56]:
def Count_Personal_Pronouns(text):
    count=0
    for i in TextBlob(text).pos_tags:
        if i[-1]=="VBP":
            count+=1
    return count
Count_Personal_Pronouns(news_df1["clean_text"][1])

18

In [63]:
def avg_word_length(text):
    char_count=0
    for i in range(len(TextBlob(text).words)):
        char_count+=len(TextBlob(text).words[i])
    return round(char_count/(len(TextBlob(text).words)),2)
avg_word_length(news_df1["clean_text"][1])

6.43

In [None]:
news_df1["clean_text"]

In [64]:
scores = [{'Positive_score': positive_score(news_df1["clean_text"][i]),
           'Negative_score': negative_score(news_df1["clean_text"][i]),
           "Polarity_score": polarity(news_df1["clean_text"][i]),
           "Subjectivity_score": subjectivity(news_df1["clean_text"][i]),
           "Average_Sentence_Length": avg_sentence_length(news_df1["clean_text"][i]),
           "Percentage_of_Complex_words": per_comp_words(news_df1["clean_text"][i]),
           "Fog_Index": Fog_index(news_df1["clean_text"][i]),
           "Avg_num_of_words_per_Sentence": avg_num_of_words_per_sentence(news_df1["clean_text"][i]),
           "Complex_word_count": difficult_words(news_df1["clean_text"][i]),
           "Word_count": word_count(news_df1["clean_text"][i]),
           "Syllable_per_word": syllable_per_word(news_df1["clean_text"][i]),
           "Personal_Pronouns": Count_Personal_Pronouns(news_df1["clean_text"][i]),
           "Avg_word_Length": avg_word_length(news_df1["clean_text"][i])} for i in range(len(news_df1))]

In [66]:
Output=pd.DataFrame(scores)
Output

Unnamed: 0,Positive_score,Negative_score,Polarity_score,Subjectivity_score,Average_Sentence_Length,Percentage_of_Complex_words,Fog_Index,Avg_num_of_words_per_Sentence,Complex_word_count,Word_count,Syllable_per_word,Personal_Pronouns,Avg_word_Length
0,0.098,0.060,0.076840,0.481237,145.666667,37.99,73.46,145.666667,166,437,1.9,24,6.11
1,0.221,0.046,0.132980,0.459345,195.500000,41.43,94.77,195.500000,162,391,2.0,18,6.43
2,0.198,0.033,0.059490,0.583649,553.000000,35.62,235.45,553.000000,394,1106,2.0,74,6.37
3,0.161,0.006,0.038203,0.475654,264.000000,45.08,123.63,264.000000,119,264,2.1,14,6.66
4,0.196,0.039,0.014539,0.490402,118.666667,35.11,61.51,118.666667,250,712,2.0,57,6.23
...,...,...,...,...,...,...,...,...,...,...,...,...,...
165,0.139,0.050,0.098422,0.389233,426.500000,30.36,182.75,426.500000,259,853,2.0,41,6.51
166,0.143,0.053,0.045331,0.495914,153.500000,30.46,73.58,153.500000,187,614,2.1,17,6.63
167,0.141,0.154,-0.042580,0.508566,628.000000,30.73,263.49,628.000000,193,628,2.2,28,6.67
168,0.055,0.087,0.027413,0.492608,327.000000,34.86,144.74,327.000000,114,327,2.0,16,6.49


In [None]:
Final_Output=pd.concat([news_df,Output],axis=1)
Final_Output

In [70]:
Final_Output.to_csv('Final_Output_BC.csv', index=False, encoding='utf-8')