In [2]:
import numpy as np
import pandas as pd
import os
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
import syllapy

## Positive and Negative Import

In [3]:
with open('../MasterDictionary/positive-words.txt', 'r') as file:
            # Read the contents of the file
            text = file.read()
            # Split the text into words
            p_words = text.split()
    
with open('../MasterDictionary/negative-words.txt', 'r') as file:
            # Read the contents of the file
            text = file.read()
            # Split the text into words
            n_words = text.split()

## Stopwords Import

In [4]:
all_stopword_filepath = os.listdir('../StopWords/')
# Create an empty list to store the words
words = []
for file in all_stopword_filepath:
    filepath = os.path.join('../StopWords/',file)
    with open(filepath, 'r') as file:
        text = file.read()
        word= text.split()
        # Loop through each line in the file
        for line in word:
            if line=='|':continue
            words.append(line.strip())


In [5]:
stopword_from_drive=[]
for i in words:
    stopword_from_drive.append(i.lower())

# Data Analysis

## Stopword Removal Function and Punctuation

In [6]:
##Remove Stopword from the text ,with reference to stopwords given in drive
def remove_stopword_drive(text):
    new_text=[]
    token_word = word_tokenize(text)
    for i in token_word:
        if i in stopword_from_drive:
            pass
        else:
            new_text.append(i)
    return new_text

##Remove Stopword with reference to nltk datasets
def remove_stopword_nltk(text):
    new_text=[]
    for i in text:
        if i in stopwords.words('english'):
            pass
        else:
            new_text.append(i)
    return new_text

def Remove_punctuation(text):
    new_text=[]
    exclude = string.punctuation
    for i in text:
        if i in exclude:
            pass
        else:
            new_text.append(i)
    return new_text

## Score- Positive,Negative,Polarity,Subjectivity

In [7]:
def positive_score(tokenize_text):
    positive_val = 0  # initiate with 0
    for word in tokenize_text:
        if word in p_words:    
            positive_val += 1
    return positive_val

def negative_score(tokenize_text):
    negative_val = 0  # initiate with 0
    for word in tokenize_text:
        if word in n_words:     
            negative_val -= 1
    return negative_val

def polarity_score(tokenize_text):
    positive_value = positive_score(tokenize_text)
    negative_value = negative_score(tokenize_text)
    polarity_score = (positive_value - negative_value) / ((positive_value + negative_value) + 0.000001)
    return polarity_score

def subjectivity_score(tokenize_text):
    positive_value = positive_score(tokenize_text)
    negative_value = negative_score(tokenize_text)
    subjectivity_score = (positive_value + negative_value) / (len(tokenize_text) + 0.000001)
    return subjectivity_score

## Average Sentence Length and Word length

In [8]:
def Avg_Sentence_length(text,no_words):
    sentences = re.split(r'[.?\n]', text)
    ##For removing ' '
    sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
    Avg_sen_length = len(no_words)/(len(sentences))
    return Avg_sen_length

def Avg_Word_Sentence(text,no_words):
    sentences = re.split(r'[.?\n]', text)
    ##For removing ' '
    sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
    Avg_sen_length = len(no_words)/(len(sentences))
    return Avg_sen_length
 

## Complex words and Fog Index

In [9]:
def count_complex_words(text):
    complex_word_count = 0
    for word in text:
        # Calculate the number of syllables in the word
        syllable_count = syllapy.count(word)
        # Define a threshold for what constitutes a complex word
        # For example, words with more then 2 syllables may be considered complex
        if syllable_count > 2:
            complex_word_count += 1

    return complex_word_count

def percentage_complex_words(text):
    return (count_complex_words(text)/len(text))

def fog_index(avg_sentence_length,percentage_complex_no):
    return 0.4*(avg_sentence_length+percentage_complex_no)

## Syllables per word

In [10]:
def num_syllables(word):
    vowel='aeiou'
    num_vowel = 0
    for i in word:
        if i in vowel:
            num_vowel+=1
    # Substracting the Addition of vowel due to ed and es
    if word.endswith('ed') or word.endswith('es'):
        num_vowel -=1 
    return num_vowel

def count_syllables_per_word(text):
    ##Creating list of vowel per word
    counts = [num_syllables(word) for word in text]
    return counts

## Personal Pronouns

In [11]:
def count_personal_pronous(whole_text):
    pattern = re.compile(r'(?<!\bUS\b)\b(I|we|my|ours|us)\b', re.IGNORECASE)
    matches = pattern.findall(whole_text)
    return len(matches)

## Average Word Length

In [12]:
def avg_word_length(text):
    total_char = 0
    total_word = len(text)

    for word in text:
        total_char+=len(word)
    return total_char/total_word

# Results

In [13]:
input = pd.read_excel('../Input.xlsx')
URLS= input['URL']
URLS_ID = input['URL_ID']

Data = [
    'URL_ID','URL','POSITIVE SCORE','NEGATIVE SCORE','POLARITY SCORE','SUBJECTIVITY SCORE','AVG SENTENCE LENGTH',
    'PERCENTAGE OF COMPLEX WORDS','FOG INDEX','AVG NUMBER OF WORDS PER SENTENCE','COMPLEX WORD COUNT',
    'WORD COUNT','SYLLABLE PER WORD','PERSONAL PRONOUNS','AVG WORD LENGTH'
]
df=pd.DataFrame(columns=Data)

file = os.listdir('./Text_File/')
mark = -1
for i in file:
    mark+=1
    ## To fix the numbering due to page not found during web scraping
    if URLS_ID[mark] == 'blackassign0036' or URLS_ID[mark] == 'blackassign0049':
        mark+=1
    file_path = os.path.join('./Text_File/',i)
    with open(file_path,encoding='utf-8') as f:
        text_for_pronouns= f.read().strip() # Creating new text for pronous count (with stopwords)
        text = text_for_pronouns.lower() # Implementing lower beacause to remove stopwords and punctuation
        tokenize_1 = remove_stopword_drive(text) #tokenize word after removing stopword (comparing with drive)
        tokenize_2 = remove_stopword_nltk(tokenize_1) #tokenize word after removing stopword(comparing with nltk data)
        tokenize_3 = Remove_punctuation(tokenize_2) #tokenize word after removing punctuation
        
        dict ={
            'URL_ID': [URLS_ID[mark]],
            'URL': [URLS[mark]],
            'POSITIVE SCORE' : [positive_score(tokenize_3)] ,
            'NEGATIVE SCORE' : [negative_score(tokenize_3)],
            'POLARITY SCORE' : [polarity_score(tokenize_3)],
            'SUBJECTIVITY SCORE' : [subjectivity_score(tokenize_3)],
            'AVG SENTENCE LENGTH' : [Avg_Sentence_length(text,tokenize_3)],
            'PERCENTAGE OF COMPLEX WORDS' : [percentage_complex_words(tokenize_3)],
            'FOG INDEX' : [fog_index(Avg_Sentence_length(text,tokenize_3),percentage_complex_words(tokenize_3))],
            'AVG NUMBER OF WORDS PER SENTENCE' : [Avg_Word_Sentence(text,tokenize_3)],
            'COMPLEX WORD COUNT' : [count_complex_words(tokenize_3)],
            'WORD COUNT' : [len(tokenize_3)],
            'SYLLABLE PER WORD' : [count_syllables_per_word(tokenize_3)],
            'PERSONAL PRONOUNS' : [count_personal_pronous(text)],
            'AVG WORD LENGTH' : [avg_word_length(tokenize_3)]
        }

        df2 = pd.DataFrame(dict)
        df = pd.concat([df,df2])

  df = pd.concat([df,df2])


In [14]:
df

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,blackassign0001,https://insights.blackcoffer.com/rising-it-cit...,26,-6,1.600000,0.041237,5.511364,0.323711,2.334030,5.511364,157,485,"[2, 2, 3, 4, 5, 2, 0, 2, 4, 4, 3, 3, 2, 2, 4, ...",12,6.800000
0,blackassign0002,https://insights.blackcoffer.com/rising-it-cit...,53,-31,3.818182,0.032211,8.130952,0.478770,3.443889,8.130952,327,683,"[2, 2, 3, 4, 5, 2, 3, 2, 4, 5, 0, 2, 4, 3, 3, ...",6,7.682284
0,blackassign0003,https://insights.blackcoffer.com/internet-dema...,36,-23,4.538461,0.020934,9.553846,0.552335,4.042472,9.553846,343,621,"[3, 2, 0, 5, 6, 2, 0, 0, 5, 2, 5, 1, 2, 3, 4, ...",13,8.330113
0,blackassign0004,https://insights.blackcoffer.com/rise-of-cyber...,37,-74,-3.000000,-0.061770,9.359375,0.522538,3.952765,9.359375,313,599,"[2, 3, 2, 3, 3, 2, 1, 5, 6, 1, 2, 2, 5, 3, 2, ...",5,8.095159
0,blackassign0005,https://insights.blackcoffer.com/ott-platform-...,19,-8,2.454545,0.032070,8.365854,0.457726,3.529432,8.365854,157,343,"[2, 2, 5, 2, 3, 0, 2, 2, 3, 5, 1, 5, 2, 4, 2, ...",6,7.930029
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,blackassign0096,https://insights.blackcoffer.com/what-is-the-r...,26,-54,-2.857143,-0.051471,9.890909,0.430147,4.128422,9.890909,234,544,"[2, 2, 5, 4, 4, 3, 2, 3, 1, 2, 3, 3, 2, 3, 2, ...",4,7.380515
0,blackassign0097,https://insights.blackcoffer.com/impact-of-cov...,22,-35,-4.384616,-0.031785,10.225000,0.371638,4.238655,10.225000,152,409,"[2, 2, 3, 3, 2, 3, 3, 2, 0, 2, 1, 1, 3, 1, 2, ...",7,6.735941
0,blackassign0098,https://insights.blackcoffer.com/contribution-...,5,-3,3.999998,0.009479,6.806452,0.450237,2.902675,6.806452,95,211,"[5, 3, 3, 1, 5, 3, 3, 2, 1, 1, 3, 3, 3, 1, 3, ...",0,7.469194
0,blackassign0099,https://insights.blackcoffer.com/how-covid-19-...,11,-3,1.750000,0.034483,7.733333,0.301724,3.214023,7.733333,70,232,"[2, 3, 2, 3, 1, 0, 3, 2, 3, 2, 0, 4, 2, 3, 2, ...",4,6.681034


## Saving File

In [70]:
df.to_excel('../Output_File.xlsx', index=False)