# Data Extraction and NLP

In [1]:
#importing libraries

import pandas as pd
from bs4 import BeautifulSoup
import requests
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.tokenize import RegexpTokenizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textblob import TextBlob


In [2]:
# Fetch the NLTK resources this notebook relies on:
#   - 'punkt'         : models behind word_tokenize / sent_tokenize
#   - 'stopwords'     : the stopword corpus imported above
#   - 'vader_lexicon' : loaded by SentimentIntensityAnalyzer() when it is
#                       constructed; without it the analysis cell raises
#                       LookupError on a machine that has never fetched it
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Mohit\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mohit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Step 1: Data Input
# Load the input spreadsheet into a DataFrame; the cells below read its
# 'URL' and 'URL_ID' columns.

input_path = r"C:\Users\Mohit\Desktop\blackcoffer\Input.xlsx"
input_df = pd.read_excel(input_path)


In [4]:
# Step 2: Crawl and Extract Text
#
# For every row of input_df, download the page at row['URL'], extract the
# <title> text and the text of every <p> element, and save them to
# "<URL_ID>.txt" in the current working directory (title on the first line,
# paragraph text after it).
#
# Pages that cannot be fetched (connection error, timeout, or an HTTP error
# status such as 404) are reported and skipped: no text file is written for
# them, and their URL_ID is recorded in failed_url_ids.

# requests applies no timeout by default, so a stalled server would otherwise
# block this cell forever.
REQUEST_TIMEOUT_SECONDS = 30

failed_url_ids = []

for index, row in input_df.iterrows():
    url = row['URL']
    url_id = row['URL_ID']

    # Make a request to the URL; treat HTTP error statuses as failures so an
    # error page is never stored as if it were the article.
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()
    except requests.RequestException as error:
        print(f"Skipping {url_id} ({url}): {error}")
        failed_url_ids.append(url_id)
        continue

    # Parse HTML content using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract article title and text
    title = soup.title.text if soup.title else ""
    article_text = " ".join([p.text for p in soup.find_all('p')])

    # Save extracted article text in a text file
    with open(f"{url_id}.txt", 'w', encoding='utf-8') as file:
        file.write(f"{title}\n{article_text}")

In [5]:
# Step 3: Text Analysis
#
# For every URL_ID in input_df, read "<URL_ID>.txt" from the current working
# directory, compute sentiment and readability variables, and write one row
# per article to Output.xlsx. Articles whose text file is missing are reported
# and left out of the output.

VOWELS = "aeiouy"


def _count_syllables(word):
    """Approximate the number of syllables in `word`.

    Counts runs of consecutive vowels (a, e, i, o, u, y) and drops one for a
    trailing "es", "ed", or silent "e" (but not "le"), never going below the
    count of 1 for a word that has any vowel. This is a heuristic, not a
    dictionary lookup, so individual words can be off by one.
    """
    lowered = word.lower()
    count = 0
    previous_was_vowel = False
    for ch in lowered:
        is_vowel = ch in VOWELS
        if is_vowel and not previous_was_vowel:
            count += 1
        previous_was_vowel = is_vowel

    silent_ending = lowered.endswith(("es", "ed")) or (
        lowered.endswith("e") and not lowered.endswith("le")
    )
    if count > 1 and silent_ending:
        count -= 1
    return count


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


# Built once: the analyzer does not depend on the article being scored.
sia = SentimentIntensityAnalyzer()

output_data = []

for index, row in input_df.iterrows():
    url_id = row['URL_ID']

    # Read the extracted text; skip this article if no file exists for it
    try:
        with open(f"{url_id}.txt", 'r', encoding='utf-8') as file:
            article_content = file.read()
    except FileNotFoundError:
        print(f"Skipping {url_id}: no extracted text file found")
        continue

    # Tokenize, then keep only tokens containing a letter or digit so that
    # punctuation marks such as "," and "." are not counted as words
    tokens = nltk.word_tokenize(article_content)
    words = [token for token in tokens if any(ch.isalnum() for ch in token)]
    sentences = nltk.sent_tokenize(article_content)

    # Variable 1: Positive and Negative Scores (VADER 'pos' / 'neg' values)
    sentiment_scores = sia.polarity_scores(article_content)
    positive_score = sentiment_scores['pos']
    negative_score = sentiment_scores['neg']

    # Variable 2: Polarity Score (the small constant avoids division by zero)
    polarity_score = (positive_score - negative_score) / (positive_score + negative_score + 0.000001)

    # Variable 3: Subjectivity Score
    # NOTE(review): 'pos'/'neg' come straight from VADER rather than being
    # word counts, so dividing by the word count yields very small values --
    # confirm this is the intended definition.
    subjectivity_score = (positive_score + negative_score) / (len(words) + 0.000001)

    # Variable 4: Average Sentence Length (words per sentence; 0.0 for empty text)
    avg_sentence_length = _safe_ratio(len(words), len(sentences))

    # Variable 5: Percentage of Complex Words (more than 2 syllables is complex)
    complex_words = [word for word in words if _count_syllables(word) > 2]
    percentage_complex_words = _safe_ratio(len(complex_words), len(words))

    # Variable 6: Fog Index
    # NOTE(review): percentage_complex_words is a 0-1 fraction here; the
    # classic Gunning fog formula uses a 0-100 percentage -- confirm which
    # scale the expected output uses.
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)

    # Collect this article's results
    output_data.append([url_id, positive_score, negative_score, polarity_score, subjectivity_score,
                        avg_sentence_length, percentage_complex_words, fog_index])


# Create DataFrame for the output
columns = ['URL_ID', 'POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE', 'SUBJECTIVITY SCORE',
           'AVG SENTENCE LENGTH', 'PERCENTAGE OF COMPLEX WORDS', 'FOG INDEX']

output_df = pd.DataFrame(output_data, columns=columns)

# Save output to Excel file
output_df.to_excel(r"C:\Users\Mohit\Desktop\blackcoffer\Output.xlsx", index=False)