# Variables for Text Analysis

## Import Libraries

In [1]:
# importing necessary libraries
!pip install nltk
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
import nltk
nltk.download('punkt')
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import os
import string
from openpyxl import Workbook, load_workbook

Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m26.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting regex>=2021.8.3
  Downloading regex-2023.12.25-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (773 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m774.0/774.0 kB[0m [31m52.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: regex, nltk
Successfully installed nltk-3.8.1 regex-2023.12.25


[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


##### REQUIREMENTS: 
1. All webpages must be article/blog pages, and all of them must belong to the same company's website.
2. Upload the "Input_file" (an Excel file that contains the links to the company's webpages) into the environment.
3. Upload the positive-words and negative-words txt files into the “MastersDictionary” folder and the stop-words txt files into the “StopWords” folder inside the environment (use exactly these folder names, otherwise the files will not be found).
4. Set "num_of_files" to the number of webpage links that the Excel file contains.
5. Provide "folder_name" (make sure a folder with this name has been created in the environment first). Here, we use "extracted_articles" as the folder name.

## Create Functions

In [2]:
#function to load the input.xlsx file inside python environment

def store_articles(Input_file, num_of_files, folder_name):
    """Download the articles listed in ``Input_file`` and save each as a text file.

    Parameters
    ----------
    Input_file : str
        Path of an Excel workbook that has a ``URL`` column.
    num_of_files : int
        Number of rows, counted from the top of the sheet, to process.
    folder_name : str
        Directory that receives ``webpage0001.txt``, ``webpage0002.txt``, ...
        It is created when it does not exist yet.

    Pages whose request fails, or whose title/body element cannot be located,
    are reported on stdout and skipped; no file is written for them.
    """
    # make a dataframe where we load data present in input file
    input_df = pd.read_excel(Input_file)
    os.makedirs(folder_name, exist_ok=True)

    # Never index past the end of the sheet when num_of_files is too large.
    total = min(num_of_files, len(input_df))
    saved = 0

    for i in range(total):
        page_id = "{:04d}".format(i + 1)
        url = input_df.URL.iloc[i]

        # A timeout keeps one unresponsive server from hanging the whole run.
        try:
            page = requests.get(url, timeout=30)
        except requests.RequestException as exc:
            print(f'Webpage {page_id} could not be fetched: {exc}')
            continue

        # using BeautifulSoup for scraping the data
        soup = BeautifulSoup(page.content, 'html.parser')

        # change the classes according to the webpage; two page templates are tried
        title = soup.find('h1', attrs={'class': 'entry-title'})
        if title is None:
            title = soup.find('h1', attrs={'class': 'tdb-title-text'})

        content = soup.find('div', attrs={'class': 'td-post-content tagdiv-type'})
        if content is None:
            content = soup.find('div', attrs={'class': 'tdb-block-inner td-fix-index'})

        if title is None or content is None:
            print(f'Webpage {page_id} have 404 Error')
            continue

        f_title = title.text
        # Replace newlines with a space (not the empty string) so that the last
        # word of one paragraph is not glued to the first word of the next.
        f_content = content.text.replace('\n', ' ')

        # storing each article as "webpage0001.txt", "webpage0002.txt", ...
        name = folder_name + '/webpage' + page_id + '.txt'
        with open(name, "w") as file:
            file.write(f_title + "\n\n" + f_content)
        saved += 1

    # Report how many pages were actually written, not how many were requested.
    print(f"\n\n {saved} of {num_of_files} webpages loaded as txt in {folder_name}")

In [3]:
#functions to create a list of stop words
def make_stopword_list(stopword_file_list):
    """Read every ``StopWords/<name>.txt`` file and return all entries as one list.

    Each line is split on ``|`` and every resulting piece is whitespace-stripped
    and appended, so the text on both sides of a ``|`` ends up in the result.
    Blank lines contribute an empty string.
    """
    collected = []
    for base_name in stopword_file_list:
        path = 'StopWords/' + base_name + '.txt'
        with open(path, 'r', encoding='latin-1') as handle:
            for raw_line in handle:
                pieces = raw_line.strip().split('|')
                collected += [piece.strip() for piece in pieces]
    return collected

#functions to create a list of positive words
def make_pos_words_list(pos_file):
    """Return the lines of ``pos_file`` (latin-1) as a list, newline characters removed.

    Only ``'\\n'`` is removed; other surrounding whitespace is kept as-is.
    """
    with open(pos_file, 'r', encoding='latin-1') as handle:
        return [line.replace('\n', '') for line in handle.readlines()]

#functions to create a list of negative words
def make_neg_words_list(neg_file):
    """Return the lines of ``neg_file`` (latin-1) as a list, newline characters removed.

    Only ``'\\n'`` is removed; other surrounding whitespace is kept as-is.
    """
    entries = []
    with open(neg_file, 'r', encoding='latin-1') as handle:
        for raw_line in handle:
            entries.append(raw_line.replace('\n', ''))
    return entries

#functions to create a dictionary of positive words
def make_positive_dict(stopword_file_list, pos_file):
    """Build ``{1: word, 2: word, ...}`` from the positive words that are not stop words.

    Keys are consecutive integers starting at 1, in file order.
    """
    excluded = make_stopword_list(stopword_file_list)
    candidates = make_pos_words_list(pos_file)
    kept = [word for word in candidates if word not in excluded]
    return dict(enumerate(kept, start=1))

#functions to create a dictionary of negative words
def make_negative_dict(stopword_file_list, neg_file):
    """Build ``{1: word, 2: word, ...}`` from the negative words that are not stop words.

    Keys are consecutive integers starting at 1, in file order.
    """
    excluded = make_stopword_list(stopword_file_list)
    numbered = {}
    for candidate in make_neg_words_list(neg_file):
        if candidate in excluded:
            continue
        numbered[len(numbered) + 1] = candidate
    return numbered

#function to create a list of words(excluding stop words) in a webpage
def content_without_stop_words(content, stopword_files=None):
    """Tokenize ``content`` after stripping ASCII punctuation and stop words.

    Parameters
    ----------
    content : str
        Article text.
    stopword_files : list of str, optional
        Base names of the stop-word files passed to ``make_stopword_list``.
        When omitted, the module-level ``stopword_file_list`` is used, which
        keeps existing one-argument calls working.

    Returns
    -------
    list of str
        ``word_tokenize`` output for the remaining text.
    """
    if stopword_files is None:
        # Backward compatibility: fall back to the notebook-level setting.
        stopword_files = stopword_file_list

    # A set gives constant-time membership tests; the comparison stays
    # case-sensitive, exactly as before.
    stop_words = set(make_stopword_list(stopword_files))

    # removing all ASCII punctuation marks, then splitting on whitespace
    stripped = content.translate(str.maketrans('', '', string.punctuation))
    kept_words = [word for word in stripped.split() if word not in stop_words]

    # using nltk's word_tokenize on the filtered text
    return word_tokenize(' '.join(kept_words))

#functions to generate a positive score of a webpage
def positive_score(stopword_file_list, pos_file, content):
    """Count the tokens of ``content`` that appear in the positive-word dictionary.

    Parameters
    ----------
    stopword_file_list : list of str
        Stop-word file base names used to build the positive dictionary.
    pos_file : str
        Path of the positive-words file.
    content : str
        Article text.

    Returns
    -------
    int
        Number of matching tokens (repeated tokens are counted each time).

    Note: ``content_without_stop_words`` is called with ``content`` only, so the
    stop words used for tokenizing come from its own default, not from the
    ``stopword_file_list`` argument of this function.
    """
    # Materialise the dictionary values as a set once, so each token lookup is
    # constant time instead of a scan over every dictionary entry.
    positive_words = set(make_positive_dict(stopword_file_list, pos_file).values())
    tokens = content_without_stop_words(content)
    return sum(1 for token in tokens if token in positive_words)

#functions to generate a negative score of a webpage
def negative_score(stopword_file_list, neg_file, content):
    """Count the tokens of ``content`` that appear in the negative-word dictionary.

    Parameters
    ----------
    stopword_file_list : list of str
        Stop-word file base names used to build the negative dictionary.
    neg_file : str
        Path of the negative-words file.
    content : str
        Article text.

    Returns
    -------
    int
        Number of matching tokens, as a non-negative value (repeated tokens are
        counted each time).

    Note: ``content_without_stop_words`` is called with ``content`` only, so the
    stop words used for tokenizing come from its own default, not from the
    ``stopword_file_list`` argument of this function.
    """
    # Set lookup replaces a scan over every dictionary entry per token, and
    # counting upwards replaces the former "decrement, then multiply by -1".
    negative_words = set(make_negative_dict(stopword_file_list, neg_file).values())
    tokens = content_without_stop_words(content)
    return sum(1 for token in tokens if token in negative_words)

#function to count a numbers of words(including stop words) in a webpage
def count_content_words(content):
    """Count the tokens of ``content`` that are not pure ASCII punctuation.

    A token is skipped when every one of its characters is in
    ``string.punctuation``. The earlier ``token in string.punctuation`` test was
    a substring check, so multi-character punctuation tokens such as ``...``,
    ``--`` or the quote tokens ```` `` ```` and ``''`` were counted as words.

    Returns
    -------
    int
        Number of word tokens (stop words included).
    """
    punctuation_chars = set(string.punctuation)
    return sum(
        1
        for token in word_tokenize(content)
        # all() over an empty token is True, so empty tokens are skipped as well
        if not all(char in punctuation_chars for char in token)
    )

#function to count a numbers of sentences(including stop words) in a webpage
def count_content_sentences(content):
    """Return how many sentences ``nltk.sent_tokenize`` finds in ``content``."""
    return len(nltk.sent_tokenize(content))

#function to count a numbers of syllables(including stop words) in a word
#Note: as it specified "Complex words are words in the text that contain more than two syllables."
def count_syallbles(word):
    """Estimate the number of syllables in ``word`` with a vowel-group heuristic.

    Each run of consecutive vowels (``aeiouy``, case-insensitive) counts as one
    syllable. One syllable is then subtracted for a trailing ``es``/``ed``, or
    for a trailing ``e`` that is not part of ``le``/``ue``.

    The suffix rules are applied to the whole word. They were previously applied
    to the last loop character, so the ``es``/``ed`` rule never fired and the
    ``le``/``ue`` exception never applied.

    Returns
    -------
    int
        0 for an empty string or a lone "’", otherwise at least 1.
    """
    # An empty token has no characters to inspect; treat it like the stray
    # apostrophe token, which also carries no syllables.
    if not word or word == "’":
        return 0

    vowels = 'aeiouy'
    lowered = word.lower()

    count = 0
    previous_was_vowel = False
    for character in lowered:
        is_vowel = character in vowels
        # only the first vowel of a vowel run starts a new syllable
        if is_vowel and not previous_was_vowel:
            count += 1
        previous_was_vowel = is_vowel

    if lowered.endswith(('es', 'ed')):
        count -= 1
    elif lowered.endswith('e') and not lowered.endswith(('le', 'ue')):
        count -= 1

    # Ensure at least one syllable
    return max(count, 1)

#function to count a numbers of complex words(including stop words) in a webpage
def count_complex_words(content):
    """Count the tokens of ``content`` for which ``count_syallbles`` returns more than 2.

    Tokens for which ``token in string.punctuation`` is true are skipped first.
    """
    complex_total = 0
    for token in word_tokenize(content):
        if token in string.punctuation:
            continue
        if count_syallbles(token) > 2:
            complex_total += 1
    return complex_total

#Function to count the personal pronouns in a webpage
def count_personal_pronouns(content):
    """Count occurrences of the pronouns I, we, my, our, ours and us in ``content``.

    Matching is case-insensitive and on whole words. The spellings ``US`` and
    ``Us`` are excluded. Every such match is dropped; previously
    ``list.remove`` dropped only the first one, so a text that mentioned "US"
    several times still had the extra mentions counted as pronouns.

    Returns
    -------
    int
        Number of pronoun matches left after the exclusion.
    """
    # Regular expression pattern matching the personal pronouns as whole words
    pattern = r'\b(?:I|we|my|our(?:s)?|us)\b'

    # Find all matches of the pattern in the text
    matches = re.findall(pattern, content, flags=re.IGNORECASE)

    excluded_spellings = ('US', 'Us')
    return sum(1 for match in matches if match not in excluded_spellings)

#Function to calculate the average word length in a webpage
def average_word_length(content):
    """Return the mean token length of ``content`` after ASCII punctuation is removed.

    Returns 0 when no tokens remain.
    """
    # Strip punctuation before tokenizing
    cleaned = content.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(cleaned)

    if len(tokens) > 0:
        return sum(map(len, tokens)) / len(tokens)
    return 0

#Function to count the Syllable Per Word in a webpage
def Syllable_Count_Per_Word(content):
    """Return the average syllable count per word token of ``content``.

    ASCII punctuation is removed before tokenizing; purely numeric tokens and
    the lone "’" token are ignored. Every remaining token contributes to the
    average. The counts used to be stored in a dict keyed by the word, which
    silently collapsed repeated words into one entry.

    Returns
    -------
    float or int
        Mean of ``count_syallbles`` over the kept tokens, or 0 when no token is
        left (previously a ``ZeroDivisionError``).
    """
    text_without_punctuation = content.translate(str.maketrans('', '', string.punctuation))
    syllable_counts = [
        count_syallbles(word)
        for word in word_tokenize(text_without_punctuation)
        if not word.isdigit() and word != "’"
    ]

    if not syllable_counts:
        return 0
    return sum(syllable_counts) / len(syllable_counts)

#Function to calculate the average of words per sentences
def average_words_per_sentence(content):
    """Return the mean number of ``word_tokenize`` tokens per sentence of ``content``.

    Sentences come from ``sent_tokenize``; every token is counted, with no
    punctuation filtering. Returns 0 when there are no sentences.
    """
    sentences = sent_tokenize(content)
    if len(sentences) == 0:
        return 0

    token_total = sum(len(word_tokenize(sentence)) for sentence in sentences)
    return token_total / len(sentences)

#function to save file in Excel
def write_to_excel(column_names, row_values, filename='Output_Data_Structure.xlsx'):
    """Append one row to the active sheet of ``filename``, creating the file if needed.

    Parameters
    ----------
    column_names : list
        Header row; written only when the workbook is newly created.
    row_values : list
        Values appended as the next row.
    filename : str
        Path of the ``.xlsx`` file to load (or create) and save.
    """
    try:
        # Re-use the workbook when the file already exists
        wb = load_workbook(filename)
        needs_header = False
    except FileNotFoundError:
        wb = Workbook()
        needs_header = True

    ws = wb.active
    # The header decision is tied to workbook creation because openpyxl's
    # ``max_row`` reports 1 (not 0) for an empty sheet, so the former
    # ``ws.max_row == 0`` test never wrote the header row.
    if needs_header:
        ws.append(column_names)
    ws.append(row_values)
    wb.save(filename)

In [4]:
# function to calculate all values and store it in Excel file  as "Output_Data_Structure.xlsx"

def save_output(folder_name, num_of_files):
    """Compute the text-analysis variables for each saved article and append them to Excel.

    For ``i`` in ``1..num_of_files`` the file ``<folder_name>/webpageNNNN.txt`` is
    read, scored, and written as one row through ``write_to_excel`` (default
    output file). Articles whose text file is missing are reported and skipped.

    Parameters
    ----------
    folder_name : str
        Directory holding the ``webpageNNNN.txt`` files.
    num_of_files : int
        Number of article files to look for.

    Reads the module-level names ``stopword_file_list``, ``pos_file`` and
    ``neg_file``; they must be defined before this function is called.
    """
    column_names = ["Input_file_variables","POSITIVE_SCORE","NEGATIVE_SCORE","POLARITY_SCORE","SUBJECTIVITY_SCORE","AVG_SENTENCE_LENGTH","PERCENTAGE_OF_COMPLEX_WORDS","FOG_INDEX","AVG_NUMBER_OF_WORDS_PER_SENTENCE","COMPLEX_WORD_COUNT","WORD_COUNT","SYLLABLE_PER_WORD","PERSONAL_PRONOUNS","AVG_WORD_LENGTH"]

    for i in range(num_of_files):
        file_name = 'webpage' + "{:04d}".format(i + 1)
        content_file = folder_name + '/' + file_name + '.txt'

        # Only the article read is guarded: a missing dictionary or stop-word
        # file should surface as an error instead of being reported as a
        # missing article.
        try:
            with open(content_file, 'r') as article:
                # there is an anomaly : "’s"
                content = (article.read()).replace("’s", "")
        except FileNotFoundError:
            print("Input file not found:", file_name)
            continue

        # list of words (excluding stop words) in a webpage
        tokens = content_without_stop_words(content)

        # positive and negative scores
        pos_score = positive_score(stopword_file_list, pos_file, content)
        neg_score = negative_score(stopword_file_list, neg_file, content)

        # Polarity Score and Subjectivity Score (the small constant avoids division by zero)
        Polarity_Score = (pos_score - neg_score)/ ((pos_score + neg_score) + 0.000001)
        Subjectivity_Score = (pos_score + neg_score)/ (len(tokens) + 0.000001)

        # numbers of complex words, sentences and total words in a webpage
        complex_word_count = count_complex_words(content)
        content_sentences = count_content_sentences(content)
        content_words = count_content_words(content)

        # Average Sentence Length, Percentage of Complex words and Fog Index;
        # an article without sentences or words yields 0 instead of a ZeroDivisionError
        Average_Sentence_Length = content_words / content_sentences if content_sentences else 0
        Percentage_of_Complex_words = complex_word_count / content_words if content_words else 0
        Fog_Index = 0.4 * (Average_Sentence_Length + Percentage_of_Complex_words)

        # average words per sentence
        avg_words_per_sentence = average_words_per_sentence(content)

        # personal pronouns in the content
        personal_pronouns_count = count_personal_pronouns(content)

        # average word length in the content
        avg_word_length = average_word_length(content)

        # Syllable Count Per Word; skipped for an article without words
        Syllable_Per_Word = Syllable_Count_Per_Word(content) if content_words else 0

        # store the output in excel file as "Output_Data_Structure.xlsx"
        row_values = [ file_name , pos_score , neg_score , Polarity_Score , Subjectivity_Score , Average_Sentence_Length , Percentage_of_Complex_words , Fog_Index , avg_words_per_sentence , complex_word_count, content_words, Syllable_Per_Word , personal_pronouns_count , avg_word_length ]
        write_to_excel(column_names, row_values)

## Execute Program

##### Note: 
##### To avoid duplicate rows, first delete the output Excel file (if the program has already been executed once), then run the program again.

In [5]:
#main function

if __name__ == "__main__" :
    # --- Run configuration ---
    Input_file = 'Input.xlsx'
    folder_name = 'extracted_articles'
    num_of_files = 100

    # Dictionary and stop-word inputs. The analysis functions read these three
    # names at module level, so they must keep exactly these names.
    pos_file = 'MastersDictionary/positive-words.txt'
    neg_file = 'MastersDictionary/negative-words.txt'
    stopword_file_list = [
        'StopWords_Auditor',
        'StopWords_Currencies',
        'StopWords_DatesandNumbers',
        'StopWords_Generic',
        'StopWords_GenericLong',
        'StopWords_Geographic',
        'StopWords_Names',
    ]

    # Step 1: download each article into folder_name as webpage0001.txt, webpage0002.txt, ...
    store_articles(Input_file, num_of_files, folder_name)

    # Step 2: compute the variables and save them to Output_Data_Structure.xlsx (default argument)
    save_output(folder_name, num_of_files)

Webpage 0036 have 404 Error
Webpage 0049 have 404 Error


 All 100 webpages loaded as txt in extracted_articles
Input file not found: webpage0036
Input file not found: webpage0049


# Result output
* Running the code generates an Excel file named "Output_Data_Structure.xlsx", which can then be downloaded to the local system.

* For the text analysis, the following variables are computed:
1. All input variables in “Input.xlsx”
2. POSITIVE SCORE
3. NEGATIVE SCORE
4. POLARITY SCORE
5. SUBJECTIVITY SCORE
6. AVG SENTENCE LENGTH
7. PERCENTAGE OF COMPLEX WORDS
8. FOG INDEX
9. AVG NUMBER OF WORDS PER SENTENCE
10. COMPLEX WORD COUNT
11. WORD COUNT
12. SYLLABLE PER WORD
13. PERSONAL PRONOUNS
14. AVG WORD LENGTH