In [1]:
import requests
from bs4 import BeautifulSoup
import re
import certifi

# Set up a session with certificate verification using certifi
session = requests.Session()
session.verify = certifi.where()

urls = [
    "https://www.edx.org/course/data-science-machine-learning",
    "https://en.wikipedia.org/wiki/Engineering",
    "http://my.clevelandclinic.org/research",
    "https://en.wikipedia.org/wiki/Data_mining",
    "https://en.wikipedia.org/wiki/Data_mining#Data_mining",
]

# Function to save the text content of a web page to a text file
def save_webpage_text(url, output_file):
    try:
        if "eecs.csuohio.edu" in url:
            response = session.get(url, verify=False)
        else:
            response = session.get(url)  # Verify other URLs
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            for script in soup(['script', 'style']):
                script.extract()
            text_content = soup.get_text()
            text_content = re.sub(r'\s+', ' ', text_content).strip()
            with open(output_file, 'w', encoding='utf-8') as file:
                file.write(text_content)
            print(f"Saved content from {url} to {output_file}")
        else:
            print(f"Failed to retrieve content from {url}")
    except Exception as e:
        print(f"An error occurred: {e}")

# Loop through the URLs and save content to text files
for index, url in enumerate(urls):
    output_file = f"webpage_{index}.txt"
    save_webpage_text(url, output_file)




Saved content from https://www.edx.org/course/data-science-machine-learning to webpage_0.txt
Saved content from https://en.wikipedia.org/wiki/Engineering to webpage_1.txt
Saved content from http://my.clevelandclinic.org/research to webpage_2.txt
Saved content from https://en.wikipedia.org/wiki/Data_mining to webpage_3.txt
Saved content from https://en.wikipedia.org/wiki/Data_mining#Data_mining to webpage_4.txt


2. Preprocessing


In [2]:
import requests
from bs4 import BeautifulSoup
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk
nltk.download('stopwords')

# Define a function to preprocess text
def preprocess_text(text):
    # Remove special symbols, punctuation, and question marks
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)

    # Convert text to lowercase
    text = text.lower()

    # Tokenize the text
    words = text.split()

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]

    # Apply stemming using the Porter Stemmer
    stemmer = PorterStemmer()
    words = [stemmer.stem(word) for word in words]

    # Join the preprocessed words back into a text
    preprocessed_text = " ".join(words)

    return preprocessed_text

# Function to save the preprocessed text content of a web page to a text file
def save_preprocessed_webpage_text(url, output_file):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            # Parse HTML content
            soup = BeautifulSoup(response.text, 'html.parser')
            # Remove scripts and style elements from the parsed content
            for script in soup(['script', 'style']):
                script.extract()
            # Extract and clean the text content
            text_content = soup.get_text()
            # Remove extra spaces and newline characters
            text_content = re.sub(r'\s+', ' ', text_content).strip()
            # Preprocess the text
            preprocessed_text = preprocess_text(text_content)
            # Save the preprocessed text to a text file
            with open(output_file, 'w', encoding='utf-8') as file:
                file.write(preprocessed_text)
            print(f"Saved preprocessed content from {url} to {output_file}")
        else:
            print(f"Failed to retrieve content from {url}")
    except Exception as e:
        print(f"An error occurred: {e}")

# Loop through the URLs and save preprocessed content to text files
for index, url in enumerate(urls):
    output_file = f"preprocessed_webpage_{index}.txt"
    save_preprocessed_webpage_text(url, output_file)



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Saved preprocessed content from https://www.edx.org/course/data-science-machine-learning to preprocessed_webpage_0.txt
Saved preprocessed content from https://en.wikipedia.org/wiki/Engineering to preprocessed_webpage_1.txt
Saved preprocessed content from http://my.clevelandclinic.org/research to preprocessed_webpage_2.txt
Saved preprocessed content from https://en.wikipedia.org/wiki/Data_mining to preprocessed_webpage_3.txt
Saved preprocessed content from https://en.wikipedia.org/wiki/Data_mining#Data_mining to preprocessed_webpage_4.txt


 Count Term Frequency

In [3]:
pip install nltk




In [4]:
import collections
from nltk.util import ngrams

# Function for preprocessing and counting term frequencies
def preprocess_and_count_term_frequencies(text):
    # Preprocessing
    # Remove special symbols, punctuation, and question marks
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)
    # Convert text to lowercase
    text = text.lower()
    # Tokenize the text
    words = text.split()
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    # Apply stemming using the Porter Stemmer
    stemmer = PorterStemmer()
    words = [stemmer.stem(word) for word in words]

    # Term frequency counting
    # Tokenize the preprocessed text into terms (unigrams) and phrases (bi-grams and tri-grams)
    tokens = text.split()
    unigrams = tokens
    bigrams = list(ngrams(tokens, 2))
    trigrams = list(ngrams(tokens, 3))

    # Count the term frequencies
    term_frequencies = {
        "unigrams": collections.Counter(unigrams),
        "bigrams": collections.Counter(bigrams),
        "trigrams": collections.Counter(trigrams),
    }

    return term_frequencies

# Read the preprocessed text files and process them
for index, url in enumerate(urls):
    preprocessed_text = ""  # Initialize an empty string to store preprocessed text

    # Read the preprocessed text from the corresponding file
    with open(f"preprocessed_webpage_{index}.txt", 'r', encoding='utf-8') as file:
        preprocessed_text = file.read()

    # Call the preprocessing and term frequency counting function
    term_frequencies = preprocess_and_count_term_frequencies(preprocessed_text)

    # Print or store the term frequencies as needed
    print(f"Term frequencies for {url}:")
    print("Unigrams:", term_frequencies["unigrams"])
    print("Bigrams:", term_frequencies["bigrams"])
    print("Trigrams:", term_frequencies["trigrams"])


Term frequencies for https://www.edx.org/course/data-science-machine-learning:
Unigrams: Counter({'data': 22, 'cours': 18, 'learn': 17, 'edx': 16, 'scienc': 14, 'machin': 14, 'harvardx': 10, 'boot': 9, 'research': 8, 'recommend': 7, 'use': 7, 'system': 6, 'algorithm': 6, 'certif': 6, 'popular': 5, 'offer': 5, 'program': 5, 'inform': 5, 'movi': 4, 'oct': 4, 'build': 4, 'includ': 4, 'also': 4, 'take': 4, 'may': 4, 'onlin': 4, 'learner': 4, 'person': 4, 'us': 4, 'new': 3, 'receiv': 3, 'youll': 3, 'comput': 3, 'predict': 3, 'servic': 3, 'speech': 3, 'analysi': 3, 'train': 3, 'part': 3, 'ask': 3, 'code': 3, 'statement': 3, 'honor': 3, 'polici': 3, 'share': 3, 'identifi': 3, 'harvard': 3, 'region': 3, 'busi': 3, 'softwar': 3, 'search': 2, 'analysisharvardx': 2, 'learningdata': 2, 'one': 2, 'hour': 2, 'per': 2, 'enrol': 2, 'dec': 2, 'relat': 2, 'enrollenrol': 2, 'start': 2, 'process': 2, 'recognit': 2, 'princip': 2, 'compon': 2, 'regular': 2, 'overtrain': 2, 'avoid': 2, 'crossvalid': 2, 'skil

[link text](https://) Creating Document Vectors


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.feature_extraction.text import TfidfVectorizer
import os

# Define the 7 topics
topics = ["research", "data", "mining", "analytics", "data mining", "machine learning", "deep learning"]

# Initialize a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(vocabulary=topics)

# Create an empty list to store document vectors
document_vectors = []

# Iterate through the preprocessed text of each document
for index, url in enumerate(urls):
    preprocessed_text = ""  # Initialize an empty string to store preprocessed text

    # Define the filename for the preprocessed text
    preprocessed_filename = f"preprocessed_webpage_{index}.txt"

    # Check if the file exists
    if os.path.exists(preprocessed_filename):
        # Read the preprocessed text from the file
        with open(preprocessed_filename, 'r', encoding='utf-8') as file:
            preprocessed_text = file.read()

        # Calculate TF-IDF scores for the document
        tfidf_scores = tfidf_vectorizer.fit_transform([preprocessed_text]).toarray()[0]

        # Create a dictionary where keys are topics and values are TF-IDF scores
        document_vector = {topic: tfidf_scores[i] for i, topic in enumerate(topics)}

        # Append the document vector to the list
        document_vectors.append(document_vector)

# Print or store the document vectors as needed
for i, doc_vector in enumerate(document_vectors):
    print(f"Document {i + 1} Vector:")
    for topic, score in doc_vector.items():
        print(f"{topic}: {score}")




Document 1 Vector:
research: 0.3417430630867044
data: 0.939793423488437
mining: 0.0
analytics: 0.0
data mining: 0.0
machine learning: 0.0
deep learning: 0.0
Document 2 Vector:
research: 0.9889363528682975
data: 0.14834045293024462
mining: 0.0
analytics: 0.0
data mining: 0.0
machine learning: 0.0
deep learning: 0.0
Document 3 Vector:
research: 0.9996714309094813
data: 0.0256326007925508
mining: 0.0
analytics: 0.0
data mining: 0.0
machine learning: 0.0
deep learning: 0.0
Document 4 Vector:
research: 0.05418182247688608
data: 0.998493585645472
mining: 0.0038701301769204343
analytics: 0.007740260353840869
data mining: 0.0
machine learning: 0.0
deep learning: 0.0
Document 5 Vector:
research: 0.05418182247688608
data: 0.998493585645472
mining: 0.0038701301769204343
analytics: 0.007740260353840869
data mining: 0.0
machine learning: 0.0
deep learning: 0.0


 Topic Words and Phrases

In [6]:
# Define the topics
topics = ["research", "data", "mining", "analytics", "data mining", "machine learning", "deep learning"]

# Create a dictionary to store relevant terms or phrases for each topic
topic_terms = {topic: [] for topic in topics}

# Iterate through the available preprocessed text files
for index, url in enumerate(urls):
    preprocessed_file = f"preprocessed_webpage_{index}.txt"

    # Check if the preprocessed file exists
    if os.path.exists(preprocessed_file):
        # Read the preprocessed text from the file
        with open(preprocessed_file, 'r', encoding='utf-8') as file:
            preprocessed_text = file.read()

        # Tokenize the preprocessed text
        tokens = preprocessed_text.split()

        # Iterate through the topics
        for topic in topics:
            # Define a list of terms or phrases associated with the current topic
            # Adjust these lists as per your domain or dataset
            topic_keywords = {
                "research": ["research", "study", "investigation", "survey"],
                "data": ["data", "information", "dataset", "statistics"],
                "mining": ["mining", "extraction", "knowledge discovery"],
                "analytics": ["analytics", "analysis", "statistical analysis"],
                "data mining": ["data mining", "pattern recognition", "association rules"],
                "machine learning": ["machine learning", "algorithms", "deep learning"],
                "deep learning": ["deep learning", "neural networks", "artificial intelligence"]
            }

            # Check if any of the topic keywords are present in the text
            for keyword in topic_keywords[topic]:
                if keyword in tokens:
                    topic_terms[topic].append(keyword)

# Print or store the relevant terms or phrases for each topic
for topic, terms in topic_terms.items():
    print(f"Relevant terms for {topic}: {', '.join(terms)}")


Relevant terms for research: research, research, survey, research, research, survey, research, survey
Relevant terms for data: data, dataset, data, data, data, dataset, data, dataset
Relevant terms for mining: 
Relevant terms for analytics: 
Relevant terms for data mining: 
Relevant terms for machine learning: 
Relevant terms for deep learning: 


Part 2: Data Transformation for Topic Analysis of Documents (Webpages)


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import os
import numpy as np

# Define the 7 topics listed
topics = ["research", "data", "mining", "analytics", "data mining", "machine learning", "deep learning"]

# Initialize a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(vocabulary=topics)

# Create an empty list to store document vectors
document_vectors = []

# Iterate through the preprocessed text of each document
for index, url in enumerate(urls):
    preprocessed_text = ""  # Initialize an empty string to store preprocessed text

    # Define the filename for the preprocessed text
    preprocessed_filename = f"preprocessed_webpage_{index}.txt"

    # Check if the file exists
    if os.path.exists(preprocessed_filename):
        # Read the preprocessed text from the file
        with open(preprocessed_filename, 'r', encoding='utf-8') as file:
            preprocessed_text = file.read()

        # Calculate TF-IDF scores for the document
        tfidf_scores = tfidf_vectorizer.fit_transform([preprocessed_text]).toarray()[0]

        # Append the document vector to the list
        document_vectors.append(tfidf_scores)

# Normalize each document vector
normalized_vectors = [doc / np.linalg.norm(doc) if np.linalg.norm(doc) != 0 else doc for doc in document_vectors]

cosine_similarity_matrix = cosine_similarity(normalized_vectors, normalized_vectors)

print("      doc1     doc2     doc3     doc4      ")
for i, row in enumerate(cosine_similarity_matrix):
    doc_name = f"doc{i + 1}"
    row_str = " ".join([f"{score:.4f}" for score in row])
    print(f"{doc_name} {row_str}")


      doc1     doc2     doc3     doc4      
doc1 1.0000 0.4774 0.3657 0.9569 0.9569
doc2 0.4774 1.0000 0.9924 0.2017 0.2017
doc3 0.3657 0.9924 1.0000 0.0798 0.0798
doc4 0.9569 0.2017 0.0798 1.0000 1.0000
doc5 0.9569 0.2017 0.0798 1.0000 1.0000
