In [8]:
import streamlit as st
import pandas as pd
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
import re

# Download NLTK resources (if not already downloaded)
nltk.download('punkt')

# Function to extract article text from URL
def extract_article_text(url):
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        # Extract article text
        article_text = ""
        for paragraph in soup.find_all('p'):
            article_text += paragraph.get_text() + "\n"
        return article_text.strip()
    except Exception as e:
        return f"Error occurred while extracting text from {url}: {e}"

# Function to compute variables
def compute_variables(text):
    # Tokenize text
    tokens = word_tokenize(text.lower())

    # Compute word count
    word_count = len(tokens)

    # Compute average sentence length
    sentences = sent_tokenize(text)
    avg_sentence_length = sum(len(word_tokenize(sentence)) for sentence in sentences) / len(sentences)

    # Compute percentage of complex words
    complex_words = [word for word in tokens if len(word) > 7]  # Assuming complex words are longer than 7 characters
    percentage_complex_words = (len(complex_words) / word_count) * 100

    # Compute fog index
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)

    # Compute average number of words per sentence
    avg_words_per_sentence = word_count / len(sentences)

    # Compute complex word count
    complex_word_count = len(complex_words)

    # Compute syllable per word
    syllable_count = sum(len(re.findall(r'[aeiouy]+', word)) for word in tokens)
    syllable_per_word = syllable_count / word_count

    return (word_count, avg_sentence_length, percentage_complex_words, fog_index,
            avg_words_per_sentence, complex_word_count, syllable_per_word)

# Streamlit app
def main():
    st.title("Text Analysis Web App")

    # Input data
    st.header("Input Data")
    uploaded_file = st.file_uploader("Upload input Excel file", type=["xlsx"])
    if uploaded_file is not None:
        input_df = pd.read_excel(uploaded_file)
        st.write(input_df)

        # Compute variables for each URL
        output_data = []
        for index, row in input_df.iterrows():
            url_id = row['URL_ID']
            url = row['URL']
            article_text = extract_article_text(url)
            if article_text:
                variables = compute_variables(article_text)
                output_data.append([url_id, url] + list(variables))

        # Output data
        st.header("Output Data")
        output_df = pd.DataFrame(output_data, columns=['URL_ID', 'URL', 'WORD COUNT', 'AVG SENTENCE LENGTH', 
                                                        'PERCENTAGE OF COMPLEX WORDS', 'FOG INDEX', 
                                                        'AVG NUMBER OF WORDS PER SENTENCE', 'COMPLEX WORD COUNT', 
                                                        'SYLLABLE PER WORD'])
        st.write(output_df)

        # Download output Excel file
        st.markdown(get_excel_output(output_df), unsafe_allow_html=True)

# Function to generate a download link for the output Excel file
def get_excel_output(df):
    output = df.to_excel(index=False)
    b64 = base64.b64encode(output.encode()).decode()
    href = f'<a href="data:file/xlsx;base64,{b64}" download="output.xlsx">Download Output Excel File</a>'
    return href

if __name__ == "__main__":
    main()


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lokes\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
