### Importing necessary packages

In [1]:
# !pip install -U spacy
# !python -m spacy download en_core_web_sm
# !pip install -U spacy_syllables
# !python -m spacy download en_core_web_md

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import os
import numpy as np
import spacy
from spacy_syllables import SpacySyllables

2023-08-03 10:09:43.488348: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


#### Defining path of the files

In [3]:
basepath = os.getcwd()
print(basepath)
output_file = pd.read_excel('Output Data Structure.xlsx')

/Users/hari/Documents/GitHub/Job-search-2023/BlackCoffer


#### Defing function for web scraping

In [4]:
def get_data(url, url_id):
    filename = basepath + "/Scraped_files/URL_" + str(url_id).rjust(3,'0') + ".txt"
    text_file = open(filename, "wt")
    strings_in_content = ""

    r = requests.get(url)
    htmlContent = r.content

    soup = BeautifulSoup(htmlContent, 'html.parser')
    title = soup.title
    strings_in_content = strings_in_content + title.string + "\n"

    for para in soup.find_all("p"):
        string2 = str(para.get_text())
        strings_in_content = strings_in_content + '\n' + string2

    n = text_file.write(strings_in_content)
    text_file.close()

#### Uncomment the below block to do web scraping

In [5]:
# input_file = pd.read_excel(basepath+'/Input.xlsx')
# url_id = input_file.iloc[:,0]
# url = input_file.iloc[:,1]
# for i in range (len(url_id)):
#     get_data(url[i],url_id[i])


In [6]:
nlp = spacy.load("en_core_web_sm")

In [7]:
from glob import glob
filenames = glob(basepath + "/Scraped_files/*.txt")
stopwords_files = glob(basepath + "/StopWords/*.txt")
# updated_files = glob(basepath + "/Updated_Files/*.txt")
number_of_files = len(filenames)
num_stopwords_files = len(stopwords_files)
# num_updated_files = len(updated_files)
print(number_of_files, num_stopwords_files)

114 7


### Importing text files and making dataframe for further calculations

##### Defining function to help in sentence calculation

In [8]:
# para = df_process["Content"][0]
# doc = nlp(para)
# len(list(doc.sents))

In [9]:
df_process = pd.DataFrame(columns=('URL_ID', 'Content', 'Num_sentence'))

for file in filenames:
    filenumber = file[-7:-4]
#     print(file)
#     print(filenumber)
    file_read = open(file, "r")
    lines = file_read.readlines()
    file_read.close()

    text = ''''''
    for index, line in enumerate(lines):
        lines[index] = line.strip()
        text = text + str(lines[index])
    doc = nlp(text)
    num_sent = len(list(doc.sents))
#     print(file, "    ", num_sent)
    df_process.loc[file] = [filenumber, text, num_sent]
df_process = df_process.sort_values(by=['URL_ID'])
df_process = df_process.reset_index()
df_process.head(2)

Unnamed: 0,index,URL_ID,Content,Num_sentence
0,/Users/hari/Documents/GitHub/Job-search-2023/B...,37,AI in healthcare to Improve Patient Outcomes |...,79
1,/Users/hari/Documents/GitHub/Job-search-2023/B...,38,What if the Creation is Taking Over the Creato...,83


### Extract stopwords from given files and store it to variable "temp_stopwrds"

In [10]:
temp_stopwrds = []
for file in stopwords_files:
    file_read = open(file, "r", encoding='latin-1')
    lines = file_read.readlines()
    file_read.close()
    for index, line in enumerate(lines):
        lines[index] = line.strip()
        temp_stopwrds.append(str(lines[index]))
temp_stopwrds = set(temp_stopwrds)

In [11]:
positive_words = []
neg_words = []

pos_file_read = open(basepath + "/MasterDictionary/positive-words.txt", "r", encoding='latin-1')
pos_lines = pos_file_read.readlines()
pos_file_read.close()
neg_file_read = open(basepath + "/MasterDictionary/negative-words.txt", "r", encoding='latin-1')
neg_lines = neg_file_read.readlines()
neg_file_read.close()
for index, line in enumerate(pos_lines):
    pos_lines[index] = line.strip()
    positive_words.append(str(pos_lines[index]))
positive_words = set(positive_words)
for index, line in enumerate(neg_lines):
    neg_lines[index] = line.strip()
    neg_words.append(str(neg_lines[index]))
neg_words = set(neg_words)

In [12]:
def char_counts(x):
    s = x.split()
    x = ''.join(s)
    return len(x)

In [13]:
nlp_syll = spacy.load('en_core_web_md')
syllables = SpacySyllables(nlp_syll)
nlp_syll.add_pipe('syllables', after='tagger')

def spacy_syllablize(word):
    token = nlp_syll(word)[0]
    return token._.syllables

def count_syllables(x):
    count = 0
    for test_word in (str(x).split()):
        try:
#             print("yes ", end = "")
            if len(spacy_syllablize(test_word)) > 2:
                count += 1
        except TypeError:
            pass
    return count

In [14]:
df_process["URL"] = output_file["URL"]
df_process['Content'] = df_process['Content'].apply(lambda x: ' '.join([t for t in x.split() if t not in temp_stopwrds]))
df_process['Content'] = df_process['Content'].apply(lambda x: re.sub('[^.,a-zA-Z0-9 \n\.]', '',x))
df_process["Word_count"] = df_process["Content"].apply(lambda x: len(str(x).split()))
df_process["Char_count"] = df_process["Content"].apply(lambda x: char_counts(x))
df_process["Avg_word_len"] = df_process["Char_count"] / df_process["Word_count"]
df_process['Pos_score'] = df_process['Content'].apply(lambda x: len(' '.join(t for t in x.split() if t in positive_words)))
df_process['Neg_score'] = df_process['Content'].apply(lambda x: len(' '.join(t for t in x.split() if t in neg_words)))
df_process['Polarity_score'] = (df_process['Pos_score'] - df_process['Neg_score']) / (df_process['Pos_score'] + df_process['Neg_score'] +0.000001)
df_process['Subjectivity_score'] = (df_process['Pos_score'] + df_process['Neg_score']) / (df_process['Word_count'] +0.000001)
df_process["Avg_sent_len"] = df_process["Word_count"] / df_process["Num_sentence"]
df_process["Complex_count"] = df_process["Content"].apply(lambda x: count_syllables(str(x)))


In [15]:
df_process["Perc_complex_wrds"] = df_process["Complex_count"] / df_process["Word_count"]
df_process["Fog_index"] = 0.4 * (df_process["Avg_sent_len"] + df_process["Perc_complex_wrds"])
df_process.columns

Index(['index', 'URL_ID', 'Content', 'Num_sentence', 'URL', 'Word_count',
       'Char_count', 'Avg_word_len', 'Pos_score', 'Neg_score',
       'Polarity_score', 'Subjectivity_score', 'Avg_sent_len', 'Complex_count',
       'Perc_complex_wrds', 'Fog_index'],
      dtype='object')

In [16]:
df_process = df_process.loc[:,['URL_ID', 'URL', 'Content', 'Num_sentence', 'Word_count', 'Char_count',
       'Avg_word_len', 'Pos_score', 'Neg_score', 'Polarity_score',
       'Subjectivity_score', 'Avg_sent_len', 'Complex_count',
       'Perc_complex_wrds', 'Fog_index']]
df_process.head(2)

Unnamed: 0,URL_ID,URL,Content,Num_sentence,Word_count,Char_count,Avg_word_len,Pos_score,Neg_score,Polarity_score,Subjectivity_score,Avg_sent_len,Complex_count,Perc_complex_wrds,Fog_index
0,37,https://insights.blackcoffer.com/ai-in-healthc...,healthcare Improve Patient Outcomes Blackcoff...,79,1144,8946,7.81993,511,204,0.429371,0.625,14.481013,389,0.340035,5.928419
1,38,https://insights.blackcoffer.com/what-if-the-c...,What Creation Taking Over Creator Blackcoffer...,83,789,5622,7.125475,371,213,0.270548,0.740177,9.506024,213,0.269962,3.910394


In [21]:
df_process_final = df_process.copy()
df_process_final = df_process_final.drop(["Content"], axis = 1)
df_process_final.head()

Unnamed: 0,URL_ID,URL,Num_sentence,Word_count,Char_count,Avg_word_len,Pos_score,Neg_score,Polarity_score,Subjectivity_score,Avg_sent_len,Complex_count,Perc_complex_wrds,Fog_index
0,37,https://insights.blackcoffer.com/ai-in-healthc...,79,1144,8946,7.81993,511,204,0.429371,0.625,14.481013,389,0.340035,5.928419
1,38,https://insights.blackcoffer.com/what-if-the-c...,83,789,5622,7.125475,371,213,0.270548,0.740177,9.506024,213,0.269962,3.910394
2,39,https://insights.blackcoffer.com/what-jobs-wil...,89,1030,7938,7.706796,570,242,0.403941,0.78835,11.573034,354,0.343689,4.766689
3,40,https://insights.blackcoffer.com/will-machine-...,97,846,6058,7.160757,396,123,0.526012,0.613475,8.721649,244,0.288416,3.604026
4,41,https://insights.blackcoffer.com/will-ai-repla...,87,986,7187,7.289047,395,146,0.460259,0.548682,11.333333,288,0.292089,4.650169


In [22]:
df_process_final.to_excel("Hari_Output_file.xlsx", index=False)