<a href="https://colab.research.google.com/github/Matlup45/Text-Analysis/blob/main/Text_Extraction_%26_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Extracting the data from given URLs**

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

def extract_article_content(url):
    """Download a page and extract the title and paragraph text of its <article>.

    Args:
        url: Address of the page to fetch.

    Returns:
        A ``(title, text)`` tuple. ``title`` is the stripped text of the first
        ``<h1>`` inside the ``<article>`` element (``None`` when there is no
        ``<h1>``); ``text`` is the stripped text of every ``<p>`` inside the
        ``<article>``, joined with single spaces. ``(None, None)`` is returned
        when the request raises, the status code is not 200, or the page has
        no ``<article>`` element.
    """
    try:
        # A timeout keeps one unresponsive host from hanging the whole run
        response = requests.get(url, timeout=30)
    except requests.RequestException as error:
        # Connection errors, invalid URLs and timeouts are reported like any
        # other failed download instead of aborting the caller's loop
        print(f"Error: {error}")
        return None, None

    # Check if the request was successful
    if response.status_code != 200:
        # If the request was unsuccessful, print the status code
        print(f"Error: {response.status_code}")
        return None, None

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the parent container of the article content; find() returns None
    # when the page has no <article>, which must not be dereferenced
    parent_container = soup.find('article')
    if parent_container is None:
        print(f"Error: no <article> element found at {url}")
        return None, None

    # Extract the article title from the parent container
    title_element = parent_container.find('h1')
    title = title_element.get_text().strip() if title_element else None

    # Extract the article text from the parent container
    text_elements = parent_container.find_all('p')
    text = ' '.join(element.get_text().strip() for element in text_elements)

    return title, text

# Load the spreadsheet that lists every article ('URL_ID' and 'URL' columns are read below)
data = pd.read_excel('urls.xlsx')

# Walk the spreadsheet row by row and save each article as "<URL_ID>.txt"
for _, row in data.iterrows():
    file_id, url = row['URL_ID'], row['URL']

    # Download and parse the article
    title, text = extract_article_content(url)

    # A missing/empty title or body counts as a failed extraction
    if not (title and text):
        print(f"Extraction failed for File ID '{file_id}'.")
        continue

    file_name = f"{file_id}.txt"
    with open(file_name, "w", encoding="utf-8") as file:
        file.writelines([f"Title: {title}\n", f"Text: {text}\n"])

    print(f"Extraction successful. The text for File ID '{file_id}' has been saved to {file_name}.")

**Step 2: Analysing the data**

In [None]:
import os
import pandas as pd
import nltk
nltk.download('punkt')

# Folder path containing the text files
# NOTE(review): 'Folder location' looks like a placeholder - point it at the
# directory that holds the extracted .txt files before running.
folder_path = r'Folder location'

# Initializing an empty list to store the analysis results
results = []


def is_complex_word(word):
    """Return True when *word* is complex, i.e. has six or more characters."""
    return len(word) >= 6


for filename in os.listdir(folder_path):
    if filename.endswith('.txt'):

        # Reading the text file
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()

        # Tokenize the text into sentences
        sentences = nltk.sent_tokenize(text)

        # Split the text into words
        words = nltk.word_tokenize(text)

        num_sentences = len(sentences)
        num_words = len(words)

        # Calculate average sentence length; an empty file has no sentences,
        # so report 0.0 instead of raising ZeroDivisionError
        average_sentence_length = num_words / num_sentences if num_sentences else 0.0

        # Count the number of complex words
        num_complex_words = sum(1 for word in words if is_complex_word(word))

        # Calculating the percentage of complex words (0.0 for an empty file)
        percentage_complex_words = (num_complex_words / num_words) * 100 if num_words else 0.0

        # Calculating the Gunning Fog index
        fog_index = 0.4 * (average_sentence_length + percentage_complex_words)

        # Store the analysis results for the current text file
        results.append({
            'File': filename,
            'Average Sentence Length': average_sentence_length,
            'Percentage of Complex Words': percentage_complex_words,
            'Fog Index': fog_index
        })

df = pd.DataFrame(results)

# Save the DataFrame to an Excel file
output_file_path = 'readability_analysis.xlsx'
df.to_excel(output_file_path, index=False)

print(f"Readability analysis saved to '{output_file_path}'")

**step 3: calculating Average Number of Words Per Sentence**

In [None]:
import os
import nltk
import pandas as pd

nltk.download('punkt')

# Directory holding the extracted article text files
folder_path = r'C:\Users\91976\Blackcoffer_assignment'

# One row per processed file
results = []

for filename in os.listdir(folder_path):
    if filename.endswith('.txt'):
        # Reading the text file
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()

        # Tokenize the text into sentences
        sentences = nltk.sent_tokenize(text)

        # Split the text into words
        words = nltk.word_tokenize(text)

        # Calculate average number of words per sentence; a file without any
        # sentence (e.g. an empty file) reports 0.0 instead of raising
        # ZeroDivisionError and aborting the whole run
        num_sentences = len(sentences)
        num_words = len(words)
        average_words_per_sentence = num_words / num_sentences if num_sentences else 0.0

        # Store the analysis results for the current text file
        results.append({
            'File': filename,
            'Average Words Per Sentence': average_words_per_sentence
        })

df = pd.DataFrame(results)

# Saving the DataFrame to an Excel file
output_file_path = 'average_words_per_sentence.xlsx'
df.to_excel(output_file_path, index=False)

print(f"Average words per sentence analysis saved to '{output_file_path}'")

**step 4: calculating Syllable Count Per Word**

In [None]:
import os
import pandas as pd

# Rows (one dict per file) that end up in the Excel report
results = []

# Running total of syllables across every processed file
total_syllable_count = 0

# Directory holding the extracted article text files
folder_path = r'C:\Users\91976\Blackcoffer_assignment'

# Function to count the number of syllables in a word
def count_syllables(word):
    """Estimate the number of syllables in *word* by counting vowel groups.

    A trailing "es" or "ed" is dropped first, then every run of consecutive
    vowels (a, e, i, o, u) counts as one syllable. Matching is
    case-insensitive for both the suffix and the vowels.

    Args:
        word: The word to analyse.

    Returns:
        The number of vowel groups found. A word that contains a vowel never
        reports fewer than one syllable, even when its only vowel sits in the
        dropped suffix (e.g. "red", "yes"). Words without vowels return 0.
    """
    vowels = 'aeiou'
    exceptions = ('es', 'ed')

    # Lower-case once so the suffix check agrees with the vowel check
    lowered = word.lower()

    # Remove a trailing "es" or "ed" from the word (at most one suffix)
    stem = lowered
    for exception in exceptions:
        if lowered.endswith(exception):
            stem = lowered[:-len(exception)]
            break

    # Count runs of consecutive vowels
    syllable_count = 0
    prev_char_vowel = False
    for char in stem:
        is_vowel = char in vowels
        if is_vowel and not prev_char_vowel:
            syllable_count += 1
        prev_char_vowel = is_vowel

    # Stripping the suffix may have removed the word's only vowel
    if syllable_count == 0 and any(char in vowels for char in lowered):
        return 1

    return syllable_count

# Visit every .txt file in the folder and tally its syllables
for filename in os.listdir(folder_path):
    if not filename.endswith('.txt'):
        continue

    # Load the whole file into memory
    with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
        text = file.read()

    # Whitespace-separated tokens are treated as words
    words = text.split()

    # Syllable estimate for each word, plus the total for this file
    syllable_counts = list(map(count_syllables, words))
    file_syllable_total = sum(syllable_counts)

    # Keep the grand total across all files up to date
    total_syllable_count += file_syllable_total

    # Record this file's row for the report
    results.append({
        'File': filename,
        'Total Syllable Count': file_syllable_total
    })

# Turn the collected rows into a table and write it out as an Excel file
df = pd.DataFrame(results)
output_file_path = 'syllable_count.xlsx'
df.to_excel(output_file_path, index=False)

# Report the grand total and where the per-file table was written
print(f"Total count of syllables: {total_syllable_count}")
print(f"Syllable count analysis saved to '{output_file_path}'")

**step 5: calculating Complex Word Count**

In [None]:
import os
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import pandas as pd

# Function to count the number of syllables in a word
def count_syllables(word):
    """Estimate the number of syllables in *word* by counting vowel groups.

    A trailing "es" or "ed" is dropped first, then every run of consecutive
    vowels (a, e, i, o, u) counts as one syllable. Matching is
    case-insensitive for both the suffix and the vowels.

    Args:
        word: The word to analyse.

    Returns:
        The number of vowel groups found. A word that contains a vowel never
        reports fewer than one syllable, even when its only vowel sits in the
        dropped suffix (e.g. "red", "yes"). Words without vowels return 0.
    """
    vowels = 'aeiou'
    exceptions = ('es', 'ed')

    # Lower-case once so the suffix check agrees with the vowel check
    lowered = word.lower()

    # Remove a trailing "es" or "ed" from the word (at most one suffix)
    stem = lowered
    for exception in exceptions:
        if lowered.endswith(exception):
            stem = lowered[:-len(exception)]
            break

    # Count runs of consecutive vowels
    syllable_count = 0
    prev_char_vowel = False
    for char in stem:
        is_vowel = char in vowels
        if is_vowel and not prev_char_vowel:
            syllable_count += 1
        prev_char_vowel = is_vowel

    # Stripping the suffix may have removed the word's only vowel
    if syllable_count == 0 and any(char in vowels for char in lowered):
        return 1

    return syllable_count

# stopwords.words() and word_tokenize() raise LookupError unless their NLTK
# data is installed; no earlier step downloads the stop-word corpus, so fetch
# what this step needs here.
import nltk

nltk.download('punkt')
nltk.download('stopwords')

# Folder path containing the text files
folder_path = r'C:\Users\91976\Blackcoffer_Assignment'

# Set of stop words
stop_words = set(stopwords.words('english'))

# Translation table that deletes every ASCII punctuation character; built once
# instead of once per word
punctuation_table = str.maketrans('', '', string.punctuation)

# Create a list to store the results
results = []

# Process each text file in the folder
for filename in os.listdir(folder_path):
    if filename.endswith('.txt'):
        # Read the text file
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()

        # Tokenize the text into words
        words = word_tokenize(text)

        # Remove punctuation from each word and convert to lowercase
        cleaned_words = [word.lower().translate(punctuation_table) for word in words]

        # Remove stop words and tokens that were nothing but punctuation
        # (those are empty strings after the step above)
        cleaned_words = [word for word in cleaned_words if word and word not in stop_words]

        # Find complex words with more than two syllables
        complex_words = [word for word in cleaned_words if count_syllables(word) > 2]

        # Count the number of complex words
        complex_word_count = len(complex_words)

        # Append the results to the list
        results.append({'Filename': filename, 'Complex Word Count': complex_word_count})

# Convert the results to a DataFrame
df = pd.DataFrame(results)

# Save the DataFrame to an Excel file
output_file = 'complex_word_count.xlsx'
df.to_excel(output_file, index=False)

print(f"Complex word count for each file saved to {output_file}")

**step 6: calculating Word Count**

In [None]:
import os
import string
from nltk.corpus import stopwords
import pandas as pd
# This step calls nltk.download() but its import list never imports nltk, so
# it only worked when an earlier cell had already done so.
import nltk

nltk.download('stopwords')

folder_path = r'C:\Users\91976\Blackcoffer_assignment'

# Initialize a variable to store the total count of cleaned words
total_cleaned_word_count = 0

# Set of stop words
stop_words = set(stopwords.words('english'))

# Translation table that deletes every ASCII punctuation character; built once
# instead of once per word
punctuation_table = str.maketrans('', '', string.punctuation)

results = []

for filename in os.listdir(folder_path):
    if filename.endswith('.txt'):
        # Reading the text file
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()

        # Tokenize the text into words
        words = text.split()

        # Removing punctuation from each word and convert to lowercase
        cleaned_words = [word.lower().translate(punctuation_table) for word in words]

        # Removing stop words, and tokens that consisted only of punctuation:
        # those are empty strings now and must not be counted as words
        cleaned_words = [word for word in cleaned_words if word and word not in stop_words]

        # Counting the total number of cleaned words
        cleaned_word_count = len(cleaned_words)
        total_cleaned_word_count += cleaned_word_count

        # Store the analysis results for the current text file
        file_results = {
            'File': filename,
            'Cleaned Word Count': cleaned_word_count
        }
        results.append(file_results)

df = pd.DataFrame(results)

# Saving the DataFrame to an Excel file
output_file_path = 'cleaned_word_count.xlsx'
df.to_excel(output_file_path, index=False)

print(f"Total count of cleaned words: {total_cleaned_word_count}")
print(f"Cleaned word count analysis saved to '{output_file_path}'")

**step 7: calculating Personal Pronouns**

In [None]:
import os
import re
import pandas as pd

folder_path = r'C:\Users\91976\Blackcoffer_assignment'

results = []

# Define the personal pronouns pattern using regex.
# The search is case-insensitive, so the leading lookahead switches case
# sensitivity off locally ((?-i:...)) to reject exactly the upper-case word
# "US" (the country abbreviation) while still accepting "us" and "Us".
pronouns_pattern = r'\b(?!(?-i:US)\b)(I|we|my|ours|us)\b'
pronouns_regex = re.compile(pronouns_pattern, flags=re.IGNORECASE)

for filename in os.listdir(folder_path):
    if filename.endswith('.txt'):
        # Reading the text file
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()

        # Find all matches of personal pronouns in the text
        matches = pronouns_regex.findall(text)

        # Count the number of personal pronouns
        personal_pronoun_count = len(matches)

        # Store the analysis results for the current text file
        results.append({
            'File': filename,
            'Personal Pronoun Count': personal_pronoun_count
        })

df = pd.DataFrame(results)

# Saving the DataFrame to an Excel file
output_file_path = 'personal_pronoun_count.xlsx'
df.to_excel(output_file_path, index=False)

print(f"Personal pronoun count analysis saved to '{output_file_path}'")

**step 8: calculating Average Word Length**

In [None]:
import os
import pandas as pd

# Directory holding the extracted article text files
folder_path = r'C:\Users\91976\Blackcoffer_assignment'

# One row per processed file
results = []

for filename in os.listdir(folder_path):
    if filename.endswith('.txt'):
        # Reading the text file
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()

        # Tokenize the text into words (whitespace-separated tokens)
        words = text.split()

        # Calculate the total number of words
        total_words = len(words)

        # Calculate the sum of the total number of characters in each word
        total_characters = sum(len(word) for word in words)

        # Calculate the average word length; an empty or whitespace-only file
        # has no words, so report 0.0 instead of raising ZeroDivisionError
        average_word_length = total_characters / total_words if total_words else 0.0

        # Store the analysis results for the current text file
        results.append({
            'File': filename,
            'Average Word Length': average_word_length
        })

df = pd.DataFrame(results)

# Saving the DataFrame to an Excel file
output_file_path = 'average_word_length.xlsx'
df.to_excel(output_file_path, index=False)

print(f"Average word length analysis saved to '{output_file_path}'")

**Step 9: Sentiment Analysis**

**9.1: Cleaning the data**

In [None]:
import nltk
from nltk.tokenize import word_tokenize
import os

# Instead of the for loop, you can give the location of an extracted file directly with: input_file_path = r'folder_location'

# Location of the stopwords file
stopwords_file_path = 'stopwords.txt'

# Read the stop-word list once, before the loop, into a set for fast lookups.
# Entries are stripped and lower-cased because tokens are compared in lower
# case below. The name deliberately differs from `stopwords` so it does not
# shadow the nltk.corpus.stopwords import used by other steps.
with open(stopwords_file_path, 'r', encoding='utf-8') as file:
    custom_stopwords = {line.strip().lower() for line in file if line.strip()}

# Clean every extracted article "<i>.txt" in place
for i in range(37, 151):
    # Path to the input text file
    input_file_path = f'{i}''.txt'

    # Not every ID has a file (an extraction may have failed); skip the
    # missing ones instead of aborting the whole run
    if not os.path.isfile(input_file_path):
        print(f"Skipping '{input_file_path}': file not found.")
        continue

    # Reading the input text file
    with open(input_file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    # Tokenize the text into individual words
    tokens = word_tokenize(text)

    # Removing stopwords from the tokens
    filtered_tokens = [token for token in tokens if token.lower() not in custom_stopwords]

    # Join the filtered tokens back into a single string
    cleaned_text = ' '.join(filtered_tokens)

    # Rewriting the input text file with the cleaned text
    with open(input_file_path, 'w', encoding='utf-8') as file:
        file.write(cleaned_text)

    print("Text file cleaned and rewritten successfully!")

**9.2: Extracting derived variables**

In [None]:
    # Computes positive/negative word counts plus polarity and subjectivity
    # scores for one text file.
    # NOTE(review): this cell is indented one level and reads `i`, so it
    # presumably belongs inside the `for i in range(37, 151)` loop of step
    # 9.1 - confirm before running it on its own.
    input_file_path = 'file.txt'

    # Path to the positive words file
    positive_words_file = 'positive-words.txt'

    # Path to the negative words file
    negative_words_file = 'negative-words.txt'

    # Reading words from positive word file (a set gives constant-time lookups)
    with open(positive_words_file, 'r', encoding='latin-1') as file:
        positive_words = set(file.read().splitlines())

    # Reading words from negative word file
    with open(negative_words_file, 'r', encoding='latin-1') as file:
        negative_words = set(file.read().splitlines())

    # Reading input text file
    with open(input_file_path, 'r', encoding='latin-1') as file:
        text = file.read()

    # Whitespace-separated tokens of the text
    words = text.split()

    # Initialize counters
    positive_count = 0
    negative_count = 0

    # A word found in both lists is counted as positive only
    for word in words:
        lowered_word = word.lower()
        if lowered_word in positive_words:
            positive_count += 1
        elif lowered_word in negative_words:
            negative_count += 1

    numerator = positive_count - negative_count
    denominator = positive_count + negative_count

    # The small constant belongs in the divisor: it is what keeps the division
    # defined when no positive or negative word was found
    Polarity_Score = numerator / (denominator + 0.000001)

    # Share of sentiment-bearing words among all words of this text (the
    # divisor was previously hard-coded to 1232)
    Subjectivity_Score = denominator / (len(words) + 0.000001)

    # Create a dictionary of positive and negative word counts
    word_counts = {'Positive': positive_count, 'Negative': negative_count}

    # Print the word counts
    print(f'{i}''Word Counts:')
    for word, count in word_counts.items():
        print(f"{word}: {count}")

    print(Polarity_Score)
    print(Subjectivity_Score)
