<h4 style="color: #00698f; font-weight: bold; text-align: center;"> <span style="color: #2ecc71;">DATA</span> <span style="color: #ff9900;">EXTRACTION</span> <span style="color: #3498db;">AND</span> <span style="color: #8e44ad;">NLP</span> </h4>

### Importing the Required Libraries

In [41]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
# Load NLTK resources (you might need to download NLTK resources)
nltk.download('punkt')
nltk.download('vader_lexicon')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lokesh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\lokesh\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lokesh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Read the Data

In [42]:
data=pd.read_excel("C:\\Users\\lokesh\\Downloads\\Input.xlsx")
data

Unnamed: 0,URL_ID,URL
0,blackassign0001,https://insights.blackcoffer.com/rising-it-cit...
1,blackassign0002,https://insights.blackcoffer.com/rising-it-cit...
2,blackassign0003,https://insights.blackcoffer.com/internet-dema...
3,blackassign0004,https://insights.blackcoffer.com/rise-of-cyber...
4,blackassign0005,https://insights.blackcoffer.com/ott-platform-...
...,...,...
95,blackassign0096,https://insights.blackcoffer.com/what-is-the-r...
96,blackassign0097,https://insights.blackcoffer.com/impact-of-cov...
97,blackassign0098,https://insights.blackcoffer.com/contribution-...
98,blackassign0099,https://insights.blackcoffer.com/how-covid-19-...


### Data Extraction and save the data

In [43]:
%%time
# This magic funtion is used to prin the execution of the time
# Function to extract the article title and text from a URLs given in Input.xlxs file and extracting each articles into txt file

def extract_article_text(url):
    try:
        response=requests.get(url)
        soup=BeautifulSoup(response.text,'html.parser')
        
        # Find and remove unwanted elements (e.g., header, footer, etc.)
        for element in soup(["header","footer"]):
            element.decompose()
        
        # Extract article title and text
        article_title=soup.find('title').text.strip()
        article_text=""
        
        # Extract text from <div class="td-post-content tagdiv-type">
        article_div=soup.find('div',class_='td-post-content tagdiv-type')
        if article_div:
            article_text=article_div.get_text()
        return article_title,article_text
    
    except Exception:
        print(f"Error while extracting article from {url}: {Exception}")
        return None,None

# Function to save the article title and text to a text file
def save_article_to_file(url_id,article_title,article_text):
    if not os.path.exists("articles"):
        os.mkdir("articles")
    with open(f"articles/{url_id}.txt", "w", encoding="utf-8") as file:
        file.write(f"Title:{article_title}\n\n")
        file.write(article_text)
def main():
    input_file="C:\\Users\\lokesh\\Downloads\\Input.xlsx"
    df=pd.read_excel(input_file)
    for index, row in df.iterrows():
        url_id=row["URL_ID"]
        url=row["URL"]
 
        # Extract article title and text4
        article_title,article_text=extract_article_text(url)

        # Check if extraction was successful
        if article_title and article_text:
            save_article_to_file(url_id,article_title,article_text)
            print(f"Article {url_id} extracted and saved successfully.")
        else:
            print(f"Failed to extract article {url_id}.")

if __name__ == "__main__":
    main()

Article blackassign0001 extracted and saved successfully.
Article blackassign0002 extracted and saved successfully.
Article blackassign0003 extracted and saved successfully.
Article blackassign0004 extracted and saved successfully.
Article blackassign0005 extracted and saved successfully.
Article blackassign0006 extracted and saved successfully.
Article blackassign0007 extracted and saved successfully.
Article blackassign0008 extracted and saved successfully.
Article blackassign0009 extracted and saved successfully.
Article blackassign0010 extracted and saved successfully.
Article blackassign0011 extracted and saved successfully.
Article blackassign0012 extracted and saved successfully.
Article blackassign0013 extracted and saved successfully.
Failed to extract article blackassign0014.
Article blackassign0015 extracted and saved successfully.
Article blackassign0016 extracted and saved successfully.
Article blackassign0017 extracted and saved successfully.
Article blackassign0018 extra

### Data Analysis

In [44]:
# Function to load positive and negative dictionaries from files
def load_dictionaries(positive_dict_file,negative_dict_file):
    with open(positive_dict_file,'r') as file:
        positive_words=set(file.read().splitlines())
    with open(negative_dict_file,'r') as file:
        negative_words=set(file.read().splitlines())
    return positive_words,negative_words

# Function to perform sentiment analysis and calculate scores
def calculate_sentiment_scores(text, positive_words,negative_words):
    sia=SentimentIntensityAnalyzer()
    tokens=word_tokenize(text)
    positive_score=0
    negative_score=0
    for word in tokens:
        # Remove punctuation and convert to lowercase
        word=word.lower()
        if word.isalpha():
            # Check if the word is in the positive dictionary
            if word in positive_words:
                positive_score+=1
            # Check if the word is in the negative dictionary
            if word in negative_words:
                negative_score+=1
    
    # Calculate sentiment analysis metrics
    polarity_score=(positive_score-negative_score)/((positive_score+negative_score)+0.000001)
    subjectivity_score=(positive_score+negative_score)/(len(tokens)+0.000001)
    return positive_score,negative_score,polarity_score,subjectivity_score

def main():
    input_data_file="C:\\Users\\lokesh\\Downloads\\20211030 Test Assignment-20240604T071155Z-001\\20211030 Test Assignment\\Output Data Structure.xlsx"
    positive_dict_file="C:\\Users\\lokesh\\Downloads\\20211030 Test Assignment-20240604T071155Z-001\\20211030 Test Assignment\\MasterDictionary\\positive-words.txt"
    negative_dict_file="C:\\Users\\lokesh\\Downloads\\20211030 Test Assignment-20240604T071155Z-001\\20211030 Test Assignment\\MasterDictionary\\negative-words.txt"
    articles_dir="articles"
    
    # Load dictionaries
    positive_words,negative_words=load_dictionaries(positive_dict_file,negative_dict_file)

    # Read output data structure Excel file
    output_data=pd.read_excel(input_data_file)
    results=[]
    for index,row in output_data.iterrows():
        url_id=row["URL_ID"]
        url=row["URL"]
        article_file=os.path.join(articles_dir,f"{url_id}.txt")
        if os.path.exists(article_file):
            # Read article text from file
            with open(article_file,'r',encoding='utf-8') as article:
                article_text=article.read()
            # Perform sentiment analysis
            positive_score, negative_score, polarity_score, subjectivity_score = calculate_sentiment_scores(article_text, positive_words, negative_words)
            results.append({
                "URL_ID":url_id,
                "URL":url,
                "Positive_Score":positive_score,
                "Negative_Score":negative_score,
                "Polarity_Score":polarity_score,
                "Subjectivity_Score":subjectivity_score
            })
    
    # Create DataFrame from results
    result_df=pd.DataFrame(results)
    
    # Save results to Excel
    result_df.to_excel("sentiment_analysis_results.xlsx",index=False)

if __name__ == "__main__":
    main()

### Sentiment Analysis data

In [45]:
sentiment_analysis=pd.read_excel("sentiment_analysis_results.xlsx")
sentiment_analysis

Unnamed: 0,URL_ID,URL,Positive_Score,Negative_Score,Polarity_Score,Subjectivity_Score
0,blackassign0001,https://insights.blackcoffer.com/rising-it-cit...,44,6,0.760000,0.035765
1,blackassign0002,https://insights.blackcoffer.com/rising-it-cit...,64,31,0.347368,0.055458
2,blackassign0003,https://insights.blackcoffer.com/internet-dema...,40,24,0.250000,0.051780
3,blackassign0004,https://insights.blackcoffer.com/rise-of-cyber...,39,75,-0.315789,0.092457
4,blackassign0005,https://insights.blackcoffer.com/ott-platform-...,24,8,0.500000,0.041078
...,...,...,...,...,...,...
84,blackassign0094,https://insights.blackcoffer.com/gaming-disord...,34,50,-0.190476,0.064319
85,blackassign0095,https://insights.blackcoffer.com/what-is-the-r...,13,25,-0.315789,0.051007
86,blackassign0096,https://insights.blackcoffer.com/what-is-the-r...,30,57,-0.310345,0.070789
87,blackassign0097,https://insights.blackcoffer.com/impact-of-cov...,32,35,-0.044776,0.054828


### Text Analysis

In [46]:
# Function to calculate average sentence length
def calculate_avg_sentence_length(sentences):
    total_words=sum(len(word_tokenize(sentence)) for sentence in sentences)
    total_sentences=len(sentences)
    return total_words/total_sentences

# Function to calculate percentage of complex words
def calculate_percentage_complex_words(text):
    words=word_tokenize(text)
    complex_words=[word for word in words if len(word)>2]
    return len(complex_words)/len(words)

# Function to calculate fog index
def calculate_fog_index(avg_sentence_length,percentage_complex_words):
    return 0.4*(avg_sentence_length+percentage_complex_words)

# Function to calculate average number of words per sentence
def calculate_avg_words_per_sentence(words,sentences):
    return len(words)/len(sentences)

# Function to calculate complex word count
def calculate_complex_word_count(text):
    words=word_tokenize(text)
    complex_words=[word for word in words if len(word)>2]
    return len(complex_words)

# Function to calculate word count
def calculate_word_count(text):
    words=word_tokenize(text)
    stop_words=set(stopwords.words("english"))
    cleaned_words=[word for word in words if word not in stop_words and word.isalpha()]
    return len(cleaned_words)

# Function to count syllables in a word
def count_syllables(word):
    vowels="AEIOUaeiou"
    count=0
    if word[-1] in ['E','eE'] and word[-2:]!='LE' and word[-2:]!='le':
        word=word[:-1]
    for index,letter in enumerate(word):
        if index==0 and letter in vowels:
            count+=1
        elif letter in vowels and word[index-1] not in vowels:
            count+=1
    return count

# Function to calculate syllable count per word
def calculate_syllable_count_per_word(text):
    words=word_tokenize(text)
    syllable_count=sum(count_syllables(word) for word in words)
    return syllable_count/len(words)

# Function to calculate personal pronoun count
def calculate_personal_pronouns(text):
    pronouns=["I","we","my","ours","us"]
    pattern=r'\b(?:'+'|'.join(pronouns)+r')\b'
    matches=re.findall(pattern,text)
    return len(matches)

# Function to calculate average word length
def calculate_avg_word_length(text):
    words=word_tokenize(text)
    total_characters=sum(len(word) for word in words)
    return total_characters/len(words)

def main():
    output_data_file="C:\\Users\\lokesh\\Downloads\\20211030 Test Assignment-20240604T071155Z-001\\20211030 Test Assignment\\Output Data Structure.xlsx"
    articles_dir="articles"
    
    # Read output data structure Excel file
    output_data=pd.read_excel(output_data_file)
    results_=[]
    for index,row in output_data.iterrows():
        url_id=row["URL_ID"]
        article_file=os.path.join(articles_dir,f"{url_id}.txt")
        if os.path.exists(article_file):
           
            # Read article text from file
            with open(article_file,'r',encoding='utf-8') as article:
                article_text=article.read()
            # Tokenize sentences for text analysis
            sentences=sent_tokenize(article_text)
            words=word_tokenize(article_text)
        
            # Calculate text analysis metrics
            avg_sentence_length=calculate_avg_sentence_length(sentences)
            percentage_complex_words=calculate_percentage_complex_words(article_text)
            fog_index=calculate_fog_index(avg_sentence_length, percentage_complex_words)
            avg_words_per_sentence=calculate_avg_words_per_sentence(words, sentences)
            complex_word_count=calculate_complex_word_count(article_text)
            word_count=calculate_word_count(article_text)
            syllable_count_per_word=calculate_syllable_count_per_word(article_text)
            personal_pronoun_count=calculate_personal_pronouns(article_text)
            avg_word_length=calculate_avg_word_length(article_text)
            results_.append({
                "URL_ID":url_id,
                "Avg_Sentence_Length":avg_sentence_length,
                "Percentage_Complex_Words":percentage_complex_words,
                "Fog_Index":fog_index,
                "Avg_Words_Per_Sentence":avg_words_per_sentence,
                "Complex_Word_Count":complex_word_count,
                "Word_Count":word_count,
                "Syllable_Count_Per_Word":syllable_count_per_word,
                "Personal_Pronoun_Count":personal_pronoun_count,
                "Avg_Word_Length":avg_word_length
            })
    
    # Create DataFrame from results
    result_df2=pd.DataFrame(results_)
    
    # Save results to Excel
    result_df2.to_excel("text_analysis_results.xlsx",index=False)

if __name__ == "__main__":
    main()

### Text Analysis data

In [47]:
text_analysis=pd.read_excel("text_analysis_results.xlsx")
text_analysis

Unnamed: 0,URL_ID,Avg_Sentence_Length,Percentage_Complex_Words,Fog_Index,Avg_Words_Per_Sentence,Complex_Word_Count,Word_Count,Syllable_Count_Per_Word,Personal_Pronoun_Count,Avg_Word_Length
0,blackassign0001,17.696203,0.671674,7.347151,17.696203,939,733,1.413448,6,4.195994
1,blackassign0002,21.412500,0.717455,8.851982,21.412500,1229,896,1.653824,3,4.872154
2,blackassign0003,21.684211,0.735437,8.967859,21.684211,909,696,1.875405,13,5.492718
3,blackassign0004,23.711538,0.716139,9.771071,23.711538,883,690,1.742903,4,5.298459
4,blackassign0005,19.475000,0.752246,8.090899,19.475000,586,426,1.658537,6,5.077022
...,...,...,...,...,...,...,...,...,...,...
84,blackassign0094,20.406250,0.702910,8.443664,20.406250,918,666,1.475498,11,4.376723
85,blackassign0095,20.694444,0.693960,8.555362,20.694444,517,374,1.477852,4,4.495302
86,blackassign0096,24.580000,0.739626,10.127850,24.580000,909,657,1.661513,2,4.943857
87,blackassign0097,31.333333,0.711948,12.818112,31.333333,870,539,1.450082,5,4.327332


### Merge the Sentiment Analysis and Text Analysis data Based on the URL ID

In [48]:
merged_df=pd.merge(sentiment_analysis, text_analysis, on='URL_ID')
merged_df

Unnamed: 0,URL_ID,URL,Positive_Score,Negative_Score,Polarity_Score,Subjectivity_Score,Avg_Sentence_Length,Percentage_Complex_Words,Fog_Index,Avg_Words_Per_Sentence,Complex_Word_Count,Word_Count,Syllable_Count_Per_Word,Personal_Pronoun_Count,Avg_Word_Length
0,blackassign0001,https://insights.blackcoffer.com/rising-it-cit...,44,6,0.760000,0.035765,17.696203,0.671674,7.347151,17.696203,939,733,1.413448,6,4.195994
1,blackassign0002,https://insights.blackcoffer.com/rising-it-cit...,64,31,0.347368,0.055458,21.412500,0.717455,8.851982,21.412500,1229,896,1.653824,3,4.872154
2,blackassign0003,https://insights.blackcoffer.com/internet-dema...,40,24,0.250000,0.051780,21.684211,0.735437,8.967859,21.684211,909,696,1.875405,13,5.492718
3,blackassign0004,https://insights.blackcoffer.com/rise-of-cyber...,39,75,-0.315789,0.092457,23.711538,0.716139,9.771071,23.711538,883,690,1.742903,4,5.298459
4,blackassign0005,https://insights.blackcoffer.com/ott-platform-...,24,8,0.500000,0.041078,19.475000,0.752246,8.090899,19.475000,586,426,1.658537,6,5.077022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84,blackassign0094,https://insights.blackcoffer.com/gaming-disord...,34,50,-0.190476,0.064319,20.406250,0.702910,8.443664,20.406250,918,666,1.475498,11,4.376723
85,blackassign0095,https://insights.blackcoffer.com/what-is-the-r...,13,25,-0.315789,0.051007,20.694444,0.693960,8.555362,20.694444,517,374,1.477852,4,4.495302
86,blackassign0096,https://insights.blackcoffer.com/what-is-the-r...,30,57,-0.310345,0.070789,24.580000,0.739626,10.127850,24.580000,909,657,1.661513,2,4.943857
87,blackassign0097,https://insights.blackcoffer.com/impact-of-cov...,32,35,-0.044776,0.054828,31.333333,0.711948,12.818112,31.333333,870,539,1.450082,5,4.327332


### Final Output

In [49]:
merged_df.to_excel("OutputDataStructure.xlsx")
output=pd.read_excel("OutputDataStructure.xlsx")
output

Unnamed: 0.1,Unnamed: 0,URL_ID,URL,Positive_Score,Negative_Score,Polarity_Score,Subjectivity_Score,Avg_Sentence_Length,Percentage_Complex_Words,Fog_Index,Avg_Words_Per_Sentence,Complex_Word_Count,Word_Count,Syllable_Count_Per_Word,Personal_Pronoun_Count,Avg_Word_Length
0,0,blackassign0001,https://insights.blackcoffer.com/rising-it-cit...,44,6,0.760000,0.035765,17.696203,0.671674,7.347151,17.696203,939,733,1.413448,6,4.195994
1,1,blackassign0002,https://insights.blackcoffer.com/rising-it-cit...,64,31,0.347368,0.055458,21.412500,0.717455,8.851982,21.412500,1229,896,1.653824,3,4.872154
2,2,blackassign0003,https://insights.blackcoffer.com/internet-dema...,40,24,0.250000,0.051780,21.684211,0.735437,8.967859,21.684211,909,696,1.875405,13,5.492718
3,3,blackassign0004,https://insights.blackcoffer.com/rise-of-cyber...,39,75,-0.315789,0.092457,23.711538,0.716139,9.771071,23.711538,883,690,1.742903,4,5.298459
4,4,blackassign0005,https://insights.blackcoffer.com/ott-platform-...,24,8,0.500000,0.041078,19.475000,0.752246,8.090899,19.475000,586,426,1.658537,6,5.077022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84,84,blackassign0094,https://insights.blackcoffer.com/gaming-disord...,34,50,-0.190476,0.064319,20.406250,0.702910,8.443664,20.406250,918,666,1.475498,11,4.376723
85,85,blackassign0095,https://insights.blackcoffer.com/what-is-the-r...,13,25,-0.315789,0.051007,20.694444,0.693960,8.555362,20.694444,517,374,1.477852,4,4.495302
86,86,blackassign0096,https://insights.blackcoffer.com/what-is-the-r...,30,57,-0.310345,0.070789,24.580000,0.739626,10.127850,24.580000,909,657,1.661513,2,4.943857
87,87,blackassign0097,https://insights.blackcoffer.com/impact-of-cov...,32,35,-0.044776,0.054828,31.333333,0.711948,12.818112,31.333333,870,539,1.450082,5,4.327332
