# Testing site for scraping

In [None]:
import requests
from bs4 import BeautifulSoup as bs

In [None]:
domain = 'https://www.theoi.com/'

In [None]:
URL = 'https://olympioi.com/monsters'

In [None]:
# Request the page and report whether the server answered with HTTP 200.
response = requests.get(URL)
if response.status_code != 200:
    print("Failed to fetch the webpage. Status code:", response.status_code)
else:
    print("Successfully fetched webpage!")

In [None]:
# Parse the downloaded HTML with the built-in parser and pretty-print its <body>.
soup = bs(markup=response.text, features='html.parser')
print(soup.body.prettify())

In [None]:
# Collect the href target of every anchor tag on the page that carries one.
links = []
for anchor in soup.find_all("a"):
    if 'href' in anchor.attrs:
        links.append(anchor['href'])
links

In [None]:
from urllib.parse import urljoin

# Turn the scraped hrefs into absolute URLs, then de-duplicate and sort them.
# Relative hrefs are resolved against URL (the page they were scraped from),
# not `domain`: `domain` points at theoi.com while the page lives on
# olympioi.com, so joining with it sent every relative link to the wrong site.
# De-duplication happens after resolving so that a relative and an absolute
# spelling of the same target collapse into one entry.
monster_links = sorted({urljoin(URL, link) for link in links})

In [None]:
len(monster_links)

# Scraping

In [None]:
from bs4 import BeautifulSoup as bs
from urllib.parse import urljoin
import requests


def extract_links(URL, timeout=30):
    """Download a page and return the links found on it as absolute URLs.

    Args:
        URL: Address of the page to fetch. It is also used as the base when
            resolving relative hrefs.
        timeout: Seconds to wait for the server before requests gives up.

    Returns:
        A sorted list of unique absolute URLs taken from every <a href=...>
        on the page, or an empty list when the server did not answer with
        HTTP 200.
    """
    # Without a timeout a stalled server would block the notebook forever.
    response = requests.get(URL, timeout=timeout)

    if response.status_code != 200:
        print("Failed to fetch the webpage. Status code:", response.status_code)
        # Do not harvest links from an error page.
        return []
    print("Successfully fetched webpage!")

    # Parse the HTML
    soup = bs(response.text, 'html.parser')

    # Every <a> tag in the document that has an href attribute
    hrefs = [anchor['href'] for anchor in soup.find_all('a') if 'href' in anchor.attrs]

    # Resolve relative URLs against the fetched page, then de-duplicate and sort
    return sorted({urljoin(URL, href) for href in hrefs})


In [None]:
URL = 'https://www.theoi.com/greek-mythology/greek-gods.html'
theoi = extract_links(URL)


In [None]:
URL = 'https://olympioi.com/monsters'
olympioi = extract_links(URL)

In [None]:
# Keep each collected URL once, in sorted order, and only if it contains "Olympios".
olympioi = [url for url in sorted(set(olympioi)) if "Olympios" in url]

In [None]:
# Combine the links gathered from the two sites into one list to scrape:
# `theoi` (collected from the Greek gods index page) followed by `olympioi`
# (collected from the monsters page).
links = theoi + olympioi

In [None]:
# One [title, url, main_text] entry per successfully scraped page.
data = []
# Browser-like User-Agent header sent with every request.
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36"
}
# Seconds to wait for a server before giving up on a link.
REQUEST_TIMEOUT = 30

# Scrape the title and the <main> paragraph text of every collected link.
for link in links:
    try:
        response = requests.get(link, headers=header, timeout=REQUEST_TIMEOUT)
        # Treat 4xx/5xx answers as failures so that error pages are not
        # stored as if they were real content.
        response.raise_for_status()
        soup = bs(response.content, 'html.parser')

        # Extracting titles
        title_tag = soup.find('title')
        title = title_tag.get_text(strip=True) if title_tag else "No Title"

        # Extracting content within <main>
        main_content = soup.find('main')
        if main_content:
            # A " " separator keeps text from adjacent inline tags apart
            # (strip=True with the default separator glues them together),
            # and paragraphs are joined with a newline so the last word of
            # one paragraph is not fused to the first word of the next.
            main_text = "\n".join(
                p.get_text(" ", strip=True) for p in main_content.find_all('p')
            )
        else:
            main_text = "No Main Content"

        # Append to data
        data.append([title, link, main_text])
        print(f"Scraped: {title} | URL: {link}")
    except Exception as e:
        # Best-effort crawl: report the failure and carry on with the next link.
        print(f"Error scraping {link}: {e}")


# File saving

In [None]:
import os
import re
import shutil

# Folder (relative to the working directory) that receives one text file per page.
folder_name = "Greek-Mythology"
os.makedirs(folder_name, exist_ok=True)

# Lower-cased file names already written in this run, used to avoid overwrites.
used_names = set()

# Create one file per scraped entry in the folder.
for title, url, info in data:
    # The text before the first " -" of the page title is the base file name.
    # Path separators, reserved characters and control characters are replaced
    # so the title can neither break open() nor escape the target folder.
    base_name = title.split(" -")[0].strip()
    base_name = re.sub(r'[\\/:*?"<>|\x00-\x1f]', "_", base_name).strip(" .") or "untitled"

    # Different pages can share a title; add a numeric suffix instead of
    # silently overwriting the earlier file.
    file_name = f"{base_name}.txt"
    suffix = 2
    while file_name.lower() in used_names:
        file_name = f"{base_name}_{suffix}.txt"
        suffix += 1
    used_names.add(file_name.lower())
    file_path = os.path.join(folder_name, file_name)

    # Create formatted content ("Info:" sits on its own line, followed by the text)
    content = f"Title: {title}\nURL: {url}\nInfo:\n{info}"

    # Writing to a text file
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(content)

    print(f"Created file: {file_path}")


# Bundle the folder into <folder_name>.zip next to it.
shutil.make_archive(folder_name, 'zip', folder_name)


# Imported only here so the files above are still written when the Colab
# helper is unavailable.
from google.colab import files
files.download(f"{folder_name}.zip")


# Loading Data from Saved Dataset

In [None]:
from google.colab import drive
import os
import pandas as pd

def extract_info_from_drive_files(folder_path):
    """Mount Google Drive and load the info text of every .txt file in a folder.

    For each file, the returned text is everything after the first line that
    is exactly "Info:", with surrounding whitespace stripped.

    Args:
        folder_path: Directory containing the saved .txt files.

    Returns:
        A list with one string per readable file, ordered by file name.
        Files without an "Info:" line are reported and skipped.
    """
    drive.mount('/content/drive')

    data = []  # Initialize a data array

    # os.listdir returns names in arbitrary order; sorting keeps the position
    # of each document in the result stable between runs.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.txt'):
            continue
        file_path = os.path.join(folder_path, filename)

        with open(file_path, 'r', encoding='utf-8') as file:
            lines = file.read().split('\n')

        try:
            info_index = lines.index('Info:')
        except ValueError:
            # One malformed file should not abort loading the whole dataset.
            print(f"Skipping {file_path}: no 'Info:' line found")
            continue

        data.append('\n'.join(lines[info_index + 1:]).strip())

    return data



folder_path = '/content/drive/MyDrive/Greek Mythology'
data = extract_info_from_drive_files(folder_path)

In [None]:
len(data)

# Stemming and Lemmatization

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import re

In [None]:
# Download the NLTK resources named below, one after another.
for resource in ('punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

In [None]:
# Initializing tools
# English stop words, stored as a set for the membership test in the stop-word filter.
stop_words = set(stopwords.words('english'))
# Stemmer used to build `stemmed_words`.
ps = PorterStemmer()
# Lemmatizer used to build `lemmatized_words`.
lemmatizer = WordNetLemmatizer()

In [None]:
cleaned_text = []
for text in data:
    # Replace every character that is not an ASCII letter or whitespace with a
    # space. Substituting a space rather than nothing keeps neighbouring words
    # apart ("sea-monster" -> "sea monster", "end.Start" -> "end Start")
    # instead of fusing them into one bogus token.
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    # Collapse the resulting runs of whitespace into single spaces.
    cleaned_text.append(re.sub(r'\s+', ' ', text).strip())


In [None]:
cleaned_text[8]

In [None]:
sentences, words = [], []

# Sentence tokenization runs on the raw documents in `data`: `cleaned_text`
# has already lost all punctuation, so sent_tokenize would find no sentence
# boundaries there and hand back each whole document as one "sentence".
for raw_text in data:
    for sentence in sent_tokenize(raw_text):
        # Clean each sentence after splitting: letters and whitespace only,
        # runs of whitespace collapsed.
        sentence = re.sub(r'[^a-zA-Z\s]', ' ', sentence)
        sentence = re.sub(r'\s+', ' ', sentence).strip()
        if sentence:
            sentences.append(sentence)

# Word tokenization on the cleaned documents
for text in cleaned_text:
    words.extend(word_tokenize(text))

In [None]:
len(words)

In [None]:
# Drop every token whose lower-cased form is an English stop word.
filtered_words = []
for token in words:
    if token.lower() in stop_words:
        continue
    filtered_words.append(token)

In [None]:
len(filtered_words)

In [None]:
# Stemming (PorterStemmer.stem lowercases its input by default)
stemmed_words = [ps.stem(word) for word in filtered_words]

# Lemmatization. WordNetLemmatizer does not lowercase its input, so tokens are
# lowercased first. Otherwise "Gods" and "gods" would count as two different
# lemmas while the stemmer merges them, and the unique-word and
# lexical-diversity comparison between the two methods would partly measure
# letter case.
lemmatized_words = [lemmatizer.lemmatize(word.lower()) for word in filtered_words]

In [None]:
import matplotlib.pyplot as plt
from collections import Counter

import matplotlib.pyplot as plt
from collections import Counter

def plot_frequencies(words, top_n=100):
    """Draw a bar chart of the most frequent items in *words*.

    Args:
        words: Iterable of hashable tokens to count.
        top_n: How many of the most common tokens to show (default 100).

    Returns:
        None. The chart is displayed with ``plt.show()``. When there is
        nothing to plot, a short message is printed and no figure is created.
    """
    # Count occurrences and keep the `top_n` most common (word, count) pairs.
    top_words = Counter(words).most_common(top_n)

    # zip(*[]) cannot be unpacked into two names, so bail out early on empty input.
    if not top_words:
        print("No words to plot.")
        return

    # Split the pairs into parallel sequences for the x labels and bar heights.
    labels, frequencies = zip(*top_words)

    # Wide figure so that many x labels stay legible.
    plt.figure(figsize=(20, 6))
    plt.bar(labels, frequencies, color='skyblue')

    plt.xlabel('Words')
    plt.ylabel('Frequency')
    # Report the number of bars actually drawn, which can be below top_n.
    plt.title(f'Top {len(top_words)} Most Frequent Words')

    # Rotate the labels so neighbouring words do not overlap.
    plt.xticks(rotation=45, ha='right')

    plt.tight_layout()
    plt.show()


In [None]:
plot_frequencies(stemmed_words)

In [None]:
plot_frequencies(lemmatized_words)

In [None]:
# len() of every entry in `sentences`, in the same order.
sequence_lengths = list(map(len, sentences))



In [None]:
# Histogram showing how the values in `sequence_lengths` are distributed.
plt.figure(figsize=(8, 6))

hist_style = {'color': 'skyblue', 'edgecolor': 'black'}
plt.hist(sequence_lengths, **hist_style)

plt.title('Distribution of Sequence Lengths')
plt.xlabel('Sequence Length')
plt.ylabel('Frequency')

plt.show()

In [None]:
# Mean sequence length, truncated to an integer. An empty list reports 0
# instead of raising ZeroDivisionError, matching the guard used for the
# lexical-diversity calculation.
average_length = int(sum(sequence_lengths) / len(sequence_lengths)) if sequence_lengths else 0
print(f'Average length of sequences : {average_length}')

In [None]:
print(f"No of unique words : {len(set(stemmed_words))}")

In [None]:
print(f"No of unique words : {len(set(lemmatized_words))}")

In [None]:
# Lexical diversity of the stemmed tokens: distinct stems / total stems
# (0 when there are no tokens at all).
total_words = len(stemmed_words)
unique_words = set(stemmed_words)
if total_words > 0:
    lexical_diversity = len(unique_words) / total_words
else:
    lexical_diversity = 0

print(f"Lexical Diversity: {lexical_diversity:.2f}")

In [None]:
# Lexical diversity of the lemmatized tokens: distinct lemmas / total lemmas
# (0 when there are no tokens at all).
total_words = len(lemmatized_words)
unique_words = set(lemmatized_words)
if total_words > 0:
    lexical_diversity = len(unique_words) / total_words
else:
    lexical_diversity = 0

print(f"Lexical Diversity: {lexical_diversity:.2f}")

# Observations

- **Higher Unique Words and Lexical Diversity for Lemmatization:**

  Lemmatization maps each word to a dictionary form and preserves meaningful distinctions, so it retains more unique lemmas than the cruder process of stemming.
  Example: the Porter stemmer reduces both "universe" and "university" to the same stem "univers", whereas lemmatization keeps them as two distinct lemmas.
-  **Lower Unique Words and Lexical Diversity for Stemming:**

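  Stemming aggressively strips word endings without considering meaning, which leads to overgeneralization: distinct words collapse into the same stem, lowering both the number of unique tokens and the lexical diversity.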