In [20]:
import requests
from bs4 import BeautifulSoup

In [21]:
def Class_exist_count(url,Class):
  response = requests.get(url)
  html_content = response.text
  soup = BeautifulSoup(html_content, 'html.parser')
  elements_with_target_class = soup.find_all(class_=Class)
  return len(elements_with_target_class)

In [22]:
def is_header_or_footer(element):
    header_tags = ['header']
    footer_tags = ['footer']
    header_classes = ['td-header-template-wrap','td-header-wrap']
    footer_classes = ['td-footer-template-wrap','td-footer-wrap']

    element_tag = element.name
    element_classes = element.get('class', [])

    if element_tag in header_tags or any(class_name in header_classes for class_name in element_classes):
        return True

    if element_tag in footer_tags or any(class_name in footer_classes for class_name in element_classes):
        return True

    return False

In [23]:
def extract_title(url):
    response = requests.get(url)

    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        title_element = soup.find('title')
        title = title_element.string if title_element and not is_header_or_footer(title_element) else "No title found"


        return title
    else:
        return None

In [24]:
def extract_article(url):
    response = requests.get(url)

    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        paragraphs = []
        for paragraph in soup.find_all('p'):
            if not is_header_or_footer(paragraph):
                paragraphs.append(paragraph.text)

        return '\n'.join(paragraphs)
    else:
        return None

In [26]:
import pandas as pd
import numpy  as np

In [27]:
import seaborn as sns

In [28]:
df = pd.read_excel('/content/drive/MyDrive/Black coffer/Input.xlsx')

In [29]:
df.head()

Unnamed: 0,URL_ID,URL
0,123.0,https://insights.blackcoffer.com/rise-of-telem...
1,321.0,https://insights.blackcoffer.com/rise-of-e-hea...
2,2345.0,https://insights.blackcoffer.com/rise-of-e-hea...
3,4321.0,https://insights.blackcoffer.com/rise-of-telem...
4,432.0,https://insights.blackcoffer.com/rise-of-telem...


In [30]:
df['head_count']=df['URL'].apply(lambda url: Class_exist_count(url,'td-header-template-wrap'))

In [31]:
df['head_count'].describe()

count    114.0
mean       1.0
std        0.0
min        1.0
25%        1.0
50%        1.0
75%        1.0
max        1.0
Name: head_count, dtype: float64

In [32]:
df['foot_count']=df['URL'].apply(lambda url: Class_exist_count(url,'td-footer-template-wrap'))

In [33]:
df['foot_count'].describe()

count    114.0
mean       1.0
std        0.0
min        1.0
25%        1.0
50%        1.0
75%        1.0
max        1.0
Name: foot_count, dtype: float64

In [34]:
df['title'] = df['URL'].apply(lambda url:extract_title(url))
df['article_text'] = df['URL'].apply(lambda url:extract_article(url))

In [None]:
df.head()

In [36]:
import os

In [37]:
txt_files = [file for file in os.listdir('/content/drive/MyDrive/Black coffer/StopWords') if file.endswith(".txt")]
stopwords_list = []
for file_name in txt_files:
    with open(os.path.join('/content/drive/MyDrive/Black coffer/StopWords', file_name), "r",encoding='latin-1') as file:
        stopwords = [word.strip() for word in file]
        stopwords_list.append(stopwords)

In [49]:
def filter_stopwords(text, stopwords):
    if (text== None):
      return None
    else:
      words = text.split()
      filtered_words = [word for word in words if word not in stopwords]
      return " ".join(filtered_words)

In [50]:
df['stop_filtered']=df['article_text'].apply(lambda txt: filter_stopwords(txt,stopwords_list))

In [54]:
def Calculate_score(txt,score_type):
  if (txt==None):
    return 0
  else:
    words = txt.split()
    with open(os.path.join('/content/drive/MyDrive/Black coffer/MasterDictionary', score_type + '.txt'),"r",encoding='latin-1') as file:
      dictionary = [word.strip() for word in file]
    score = 0
    for word in words:
       if word in dictionary:
        score = score + 1
    return score

In [55]:
df['POSITIVE SCORE']=df['stop_filtered'].apply(lambda txt:Calculate_score(txt,'positive-words'))
df['NEGATIVE SCORE']=df['stop_filtered'].apply(lambda txt:Calculate_score(txt,'negative-words'))

In [57]:
df['POSITIVE SCORE'].describe()

count    114.000000
mean      31.491228
std       18.211270
min        0.000000
25%       20.000000
50%       28.000000
75%       42.000000
max      103.000000
Name: POSITIVE SCORE, dtype: float64

In [58]:
df['NEGATIVE SCORE'].describe()

count    114.000000
mean      23.114035
std       17.237415
min        0.000000
25%        9.250000
50%       21.000000
75%       32.500000
max       79.000000
Name: NEGATIVE SCORE, dtype: float64

In [59]:
df.columns

Index(['URL_ID', 'URL', 'head_count', 'foot_count', 'title', 'article_text',
       'stop_filtered', 'POSITIVE SCORE', 'NEGATIVE SCORE'],
      dtype='object')

In [69]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('word_tokenize')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Error loading word_tokenize: Package 'word_tokenize' not
[nltk_data]     found in index


False

In [96]:
def nltk_filter(txt):
  txt = re.sub(r"[^a-zA-Z]", " ", txt)
  stop_words = set(stopwords.words('english'))
  words = word_tokenize(txt)
  filtered_words = [word for word in words if word.lower() not in stop_words]
  return filtered_words

In [95]:
def Total_words(txt):
   if (txt==None):
    return 0
   else:
    return len(nltk_filter(txt))

In [97]:
df['WORD COUNT'] = df['article_text'].apply(lambda txt:Total_words(txt))

In [76]:
df['POLARITY SCORE'] = df.apply(lambda df: (df['POSITIVE SCORE']-df['NEGATIVE SCORE'])/((df['POSITIVE SCORE']+df['NEGATIVE SCORE'])+0.000001),axis=1)
df['SUBJECTIVITY SCORE'] = df.apply(lambda df: (df['POSITIVE SCORE']+df['NEGATIVE SCORE'])/((df['WORD COUNT'])+0.000001),axis=1)

In [77]:
from nltk.tokenize import sent_tokenize

In [80]:
def avg_sen_len(txt):
  if (txt==None):
    return 0
  else:
    return sum(len(sentence) for sentence in sent_tokenize(txt))/len(sent_tokenize(txt))

In [81]:
def avg_words_per_sen(txt,wrd_count):
  if (txt==None):
    return 0
  else:
    return wrd_count/len(sent_tokenize(txt))

In [82]:
df['AVG SENTENCE LENGTH'] = df['article_text'].apply(lambda txt:avg_sen_len(txt))
df['AVG NUMBER OF WORDS PER SENTENCE'] = df.apply(lambda df:avg_words_per_sen(df['article_text'],df['WORD COUNT']),axis=1)

In [84]:
pip install pyphen

Collecting pyphen
  Downloading pyphen-0.14.0-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyphen
Successfully installed pyphen-0.14.0


In [85]:
import pyphen

In [86]:
def syllable_count(word):
    dictionary = pyphen.Pyphen(lang='en')
    if word.endswith("es") or word.endswith("ed"):
        return len(dictionary.inserted(word[:-2]).split('-'))
    else:
        return len(dictionary.inserted(word).split('-'))

In [87]:
def syllables_per_word(txt):
  if (txt==None):
    return 0
  else:
    words = nltk_filter(txt)
    syllable_counts = [syllable_count(word) for word in words]
    return sum(syllable_counts) / len(words)

In [88]:
df['SYLLABLE PER WORD'] = df['article_text'].apply(lambda txt: syllables_per_word(txt))


In [89]:
def complex_count(txt):
  if (txt==None):
    return 0
  else:
    words = nltk_filter(txt)
    count=0
    for word in words:
      if syllable_count(word) >= 2:
        count = count + 1
    return count

In [91]:
df['COMPLEX WORD COUNT'] = df['article_text'].apply(lambda txt: complex_count(txt))

In [93]:
df['PERCENTAGE OF COMPLEX WORDS'] = df.apply(lambda df:0 if df['WORD COUNT']==0 else df['COMPLEX WORD COUNT']/df['WORD COUNT'],axis=1)

In [94]:
df['FOG INDEX'] = df.apply(lambda df: 0.4 * (df['AVG SENTENCE LENGTH']+df['PERCENTAGE OF COMPLEX WORDS']),axis=1)

In [100]:
df['AVG WORD LENGTH'] = df.apply(lambda df :0 if df['WORD COUNT']==0 else sum(len(word) for word in nltk_filter(df['article_text']))/df['WORD COUNT'],axis=1)

In [103]:
def Personal_pronoun(txt):
  if (txt==None):
    return 0
  else:
    pattern = r'\b(I|we|my|ours|us)\b'
    matches = re.findall(pattern, txt, flags=re.IGNORECASE)
    filtered_matches = []
    for match in matches:
      if match == "US":
        continue
      filtered_matches.append(match)
    return len(filtered_matches)

In [104]:
df['PERSONAL PRONOUNS'] = df['article_text'].apply(lambda txt: Personal_pronoun(txt))

In [105]:
df.describe()

Unnamed: 0,URL_ID,head_count,foot_count,POSITIVE SCORE,NEGATIVE SCORE,WORD COUNT,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,AVG NUMBER OF WORDS PER SENTENCE,SYLLABLE PER WORD,COMPLEX WORD COUNT,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG WORD LENGTH,PERSONAL PRONOUNS
count,114.0,114.0,114.0,114.0,114.0,114.0,114.0,114.0,114.0,114.0,114.0,114.0,114.0,114.0,114.0,114.0
mean,26676.5,1.0,1.0,31.491228,23.114035,730.307018,0.186476,0.072436,159.283491,15.124626,1.778787,396.052632,0.526675,63.924066,6.429447,8.903509
std,15267.04828,0.0,0.0,18.21127,17.237415,331.354157,0.341241,0.024807,89.87802,8.322342,0.258575,187.68623,0.089018,35.955579,0.908722,8.698458
min,123.0,1.0,1.0,0.0,0.0,0.0,-0.487179,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,13630.65,1.0,1.0,20.0,9.25,514.0,-0.116157,0.060085,129.769072,11.927741,1.75224,265.25,0.500529,52.124016,6.36734,3.25
50%,26676.5,1.0,1.0,28.0,21.0,732.0,0.220636,0.071068,151.635529,14.264069,1.811903,379.5,0.535473,60.847262,6.546751,6.0
75%,39722.35,1.0,1.0,42.0,32.5,967.25,0.465385,0.08709,171.133333,16.328125,1.872673,525.75,0.568909,68.66354,6.709282,10.75
max,52768.2,1.0,1.0,103.0,79.0,2175.0,0.875,0.137771,972.333333,87.0,2.014925,989.0,0.650133,389.137165,7.204606,48.0


In [107]:
df_final = df[['URL_ID','URL','POSITIVE SCORE','NEGATIVE SCORE','POLARITY SCORE','SUBJECTIVITY SCORE','AVG SENTENCE LENGTH',
              'PERCENTAGE OF COMPLEX WORDS','FOG INDEX','AVG NUMBER OF WORDS PER SENTENCE','COMPLEX WORD COUNT','WORD COUNT','SYLLABLE PER WORD','PERSONAL PRONOUNS','AVG WORD LENGTH']]

In [109]:
df_final.to_excel('output.xlsx',index=False)