In [1]:
import re
from collections import Counter

# Sample text (two sentences; the first line ends with a space before the newline)
text = (
    "Text mining is the process of extracting meaningful information from text data. \n"
    "It involves transforming unstructured data into structured data for analysis."
)

# Step 1: Preprocess the text -- lowercase it, then drop every character that is
# neither a word character nor whitespace (i.e. the punctuation).
text_cleaned = re.compile(r'[^\w\s]').sub('', text.lower())

# Step 2: Tokenize the text by splitting on runs of whitespace
tokens = text_cleaned.split()
print("Tokens:", tokens)

# Step 3: Remove stop words using a small hand-written set
stop_words = {
    "is", "the", "of", "from",
    "it", "for", "and", "into",
}
filtered_tokens = [token for token in tokens if token not in stop_words]
print("Filtered Tokens (No Stop Words):", filtered_tokens)

# Step 4: Count how often each remaining word occurs
word_counts = Counter()
word_counts.update(filtered_tokens)
print("Word Frequencies:", word_counts)

# Step 5: Identify the five most frequent words
most_common_words = word_counts.most_common(5)
print("Most Common Words:", most_common_words)


Tokens: ['text', 'mining', 'is', 'the', 'process', 'of', 'extracting', 'meaningful', 'information', 'from', 'text', 'data', 'it', 'involves', 'transforming', 'unstructured', 'data', 'into', 'structured', 'data', 'for', 'analysis']
Filtered Tokens (No Stop Words): ['text', 'mining', 'process', 'extracting', 'meaningful', 'information', 'text', 'data', 'involves', 'transforming', 'unstructured', 'data', 'structured', 'data', 'analysis']
Word Frequencies: Counter({'data': 3, 'text': 2, 'mining': 1, 'process': 1, 'extracting': 1, 'meaningful': 1, 'information': 1, 'involves': 1, 'transforming': 1, 'unstructured': 1, 'structured': 1, 'analysis': 1})
Most Common Words: [('data', 3), ('text', 2), ('mining', 1), ('process', 1), ('extracting', 1)]


In [1]:
# Required Libraries
import pandas as pd
import requests
from bs4 import BeautifulSoup
import spacy

# Load SpaCy Model for NLP
# The "en_core_web_sm" pipeline is loaded once at module level and bound to
# `nlp`; process_text() calls this object on each text it analyses, so the
# load is not repeated per call.
# NOTE(review): presumably the model package must be installed separately from
# spaCy itself -- confirm for the target environment.
nlp = spacy.load("en_core_web_sm")

# Function for Data Extraction
def extract_data(url=None, file_path=None, timeout=10):
    """Return raw text taken from a web page or from a local UTF-8 file.

    Args:
        url: Address of the page to download. When truthy it takes precedence
            over ``file_path``.
        file_path: Path of a local text file, read as UTF-8. Only used when
            ``url`` is falsy.
        timeout: Seconds to wait for the server, forwarded to
            ``requests.get``. Without a timeout a stalled server would block
            the call indefinitely.

    Returns:
        str: For a URL, the visible text of the page with text fragments
        joined by newlines; for a file, its full contents.

    Raises:
        ValueError: If neither ``url`` nor ``file_path`` is provided.
        requests.HTTPError: If the server answers with a 4xx/5xx status, so an
            error page is never silently returned as if it were content.
        OSError: If the local file cannot be opened.
    """
    if url:
        # Web scraping
        response = requests.get(url, timeout=timeout)
        # Fail loudly on HTTP errors instead of parsing the error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup.get_text(separator="\n")

    if file_path:
        # Reading a local file
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()

    raise ValueError("Provide a URL or file path for data extraction!")

# Function for NLP Processing
def process_text(text):
    """Analyse ``text`` with the module-level ``nlp`` pipeline.

    Args:
        text: The string handed to ``nlp``.

    Returns:
        tuple: ``(entities, word_freq)`` where ``entities`` is a list of
        ``(ent.text, ent.label_)`` pairs in the order of ``doc.ents`` and
        ``word_freq`` is a dict mapping each lowercased token text to its
        number of occurrences. Only tokens with ``is_alpha`` true and
        ``is_stop`` false are counted.
    """
    doc = nlp(text)

    # Collect every recognised entity together with its label.
    entities = []
    for ent in doc.ents:
        entities.append((ent.text, ent.label_))

    # Tally alphabetic, non-stop-word tokens case-insensitively.
    word_freq = {}
    for token in doc:
        if not token.is_alpha or token.is_stop:
            continue
        key = token.text.lower()
        word_freq[key] = word_freq.get(key, 0) + 1

    return entities, word_freq

# Main Script
if __name__ == "__main__":
    # Choose the input source. To scrape a web page instead of reading a file:
    #     url = "https://en.wikipedia.org/wiki/Natural_language_processing"
    #     text = extract_data(url=url)
    file_path = "example.txt"  # Replace with a valid file path
    text = extract_data(file_path=file_path)

    # Run the NLP pass: named entities plus a word -> count mapping.
    named_entities, word_frequency = process_text(text)

    # Report each entity together with its label, one per line.
    print("Named Entities:")
    for entity, label in named_entities:
        print(f"{entity} ({label})")

    # Tabulate the counts, order them from most to least frequent, and show
    # the first ten rows.
    print("\nWord Frequency:")
    word_freq_df = pd.DataFrame(word_frequency.items(), columns=["Word", "Frequency"])
    word_freq_df = word_freq_df.sort_values(by="Frequency", ascending=False)
    print(word_freq_df.head(10))


ModuleNotFoundError: No module named 'nltk'