<a href="https://colab.research.google.com/github/HarshithReddy01/Algorithms-Practice/blob/master/Week10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
from bs4 import BeautifulSoup
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
import re

In [None]:
# Article 1: Artificial Intelligence - https://en.wikipedia.org/wiki/Artificial_intelligence

# Ensure the punkt_tab tokenizer data and the stopwords corpus are available,
# downloading each one only when the local lookup raises LookupError.
for _resource_path, _package_name in (
    ('tokenizers/punkt_tab', 'punkt_tab'),
    ('corpora/stopwords', 'stopwords'),
):
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        nltk.download(_package_name)

def get_article_text(url):
    """Download a web page and return its main text as one cleaned string.

    The page is requested with a browser-like User-Agent and a 10 second
    timeout. Script/style/navigation/table elements are removed, then the
    text of the first container found among ``div#mw-content-text``,
    ``<main>`` and ``<article>`` is extracted (the whole document is used
    when none of them exists).

    Args:
        url: Address of the page to download.

    Returns:
        The extracted text with whitespace runs collapsed to single spaces,
        or ``None`` when the request or the parsing fails (the error is
        printed rather than raised).
    """
    try:
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
        response = requests.get(url, timeout=10, headers=headers)
        # Fail on HTTP error statuses (404, 429, 5xx, ...) so that an error
        # page is never parsed and scored as if it were the article.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Drop elements that never carry article prose.
        for element in soup(["script", "style", "nav", "footer", "header", "aside", "table"]):
            element.decompose()

        main_content = soup.find('div', {'id': 'mw-content-text'}) or soup.find('main') or soup.find('article')
        if main_content:
            # Remove elements whose class name contains ref/reference/citation.
            for ref in main_content.find_all(['sup', 'span', 'div'], class_=re.compile(r'ref|reference|citation')):
                ref.decompose()

            text = main_content.get_text()
        else:
            text = soup.get_text()

        text = re.sub(r'\s+', ' ', text)
        # Strip numeric footnote markers such as "[12]".
        text = re.sub(r'\[\d+\]', '', text)
        text = re.sub(r'Jump to.*?Edit', '', text)

        return text.strip()
    except Exception as e:
        # Best-effort boundary: callers treat None as "article unavailable".
        print(f"Error fetching article: {e}")
        return None

def preprocess_text(text):
    """Normalise ``text`` for vectorisation.

    Whitespace runs become a single space, the text is lower-cased, every
    character that is neither a word character nor whitespace is removed,
    and leading/trailing whitespace is stripped.
    """
    single_spaced = re.sub(r'\s+', ' ', text)
    lowered = single_spaced.lower()
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    return without_punctuation.strip()

def get_top_words_tfidf(text, n=5):
    """Rank single words of ``text`` by mean TF-IDF weight across sentences.

    Every sentence with more than five whitespace-separated tokens is
    treated as one document.

    Args:
        text: Raw article text.
        n: Maximum number of (word, score) pairs to return.

    Returns:
        Up to ``n`` ``(word, mean_score)`` pairs, highest score first. An
        empty list is returned when fewer than two sentences qualify or when
        the vectoriser raises ``ValueError``.
    """
    documents = [
        preprocess_text(candidate)
        for candidate in nltk.sent_tokenize(text)
        if len(candidate.split()) > 5
    ]
    if len(documents) < 2:
        return []

    tfidf = TfidfVectorizer(
        max_features=1000,
        stop_words='english',
        ngram_range=(1, 1),
        min_df=2,
    )

    try:
        weights = tfidf.fit_transform(documents)
        vocabulary = tfidf.get_feature_names_out()

        # Column-wise mean: one averaged weight per vocabulary entry.
        averaged = weights.toarray().mean(axis=0)

        ranked = sorted(zip(vocabulary, averaged), key=lambda pair: pair[1], reverse=True)
        return ranked[:n]
    except ValueError:
        return []

def get_top_sentences_tfidf(text, n=5):
    """Rank the sentences of ``text`` by the sum of their TF-IDF weights.

    Sentences with eight or fewer whitespace-separated tokens, and sentences
    containing one of the boilerplate markers ("jump to", "edit",
    "retrieved", "archived") as a whole word, are ignored. Unigrams and
    bigrams are both used as features.

    Args:
        text: Raw article text.
        n: Maximum number of (sentence, score) pairs to return.

    Returns:
        Up to ``n`` ``(original_sentence, score)`` pairs, highest score
        first. An empty list is returned when fewer than two sentences
        qualify or when the vectoriser raises ``ValueError``.
    """
    # Match the markers as whole words only: a plain substring test for
    # 'edit' would also throw away sentences containing "credit", "editor",
    # "edition" and similar ordinary words.
    boilerplate = re.compile(r'\b(?:jump to|edit|retrieved|archived)\b')

    sentences = [
        s for s in nltk.sent_tokenize(text)
        if len(s.split()) > 8 and not boilerplate.search(s.lower())
    ]

    if len(sentences) < 2:
        return []

    processed_sentences = [preprocess_text(sent) for sent in sentences]

    vectorizer = TfidfVectorizer(
        max_features=1000,
        stop_words='english',
        ngram_range=(1, 2),
        min_df=1
    )

    try:
        tfidf_matrix = vectorizer.fit_transform(processed_sentences)
        # Row-wise sum: one score per sentence.
        sentence_scores = np.sum(tfidf_matrix.toarray(), axis=1)

        # Pair scores with the original (unprocessed) sentences for display.
        sentence_score_pairs = sorted(
            zip(sentences, sentence_scores),
            key=lambda pair: pair[1],
            reverse=True,
        )

        return sentence_score_pairs[:n]
    except ValueError:
        return []

def analyze_article(url):
    """Fetch the article at ``url``, print its top words and sentences, and return them.

    Args:
        url: Address of the article to analyse.

    Returns:
        A dict with the keys ``'url'``, ``'top_words'``, ``'top_sentences'``
        and ``'text'``, or ``None`` when no text could be fetched or the text
        is shorter than 1000 characters.
    """
    print(f"\nAnalyzing article: {url}")

    article_text = get_article_text(url)
    # A failed download and a too-short page are handled the same way.
    if not (article_text and len(article_text) >= 1000):
        return None

    print(f"Article length: {len(article_text)} characters")

    ranked_words = get_top_words_tfidf(article_text)
    print("\nTop 5 words:")
    for term, weight in ranked_words:
        print(f"  {term}: {weight:.4f}")

    ranked_sentences = get_top_sentences_tfidf(article_text)
    print("\nTop 5 sentences:")
    for position, (sent, weight) in enumerate(ranked_sentences, start=1):
        # Only the first 120 characters are shown to keep the listing compact.
        print(f"  {position}. {sent[:120]}... (Score: {weight:.4f})")

    report = {}
    report['url'] = url
    report['top_words'] = ranked_words
    report['top_sentences'] = ranked_sentences
    report['text'] = article_text
    return report

# Bundled fallback text about artificial intelligence; it is analysed in
# place of the live article when the download is missing or too short.
sample_ai_article = """
Artificial intelligence (AI) is intelligence demonstrated by machines, in contrast to the natural intelligence displayed by humans and animals. Leading AI textbooks define the field as the study of "intelligent agents": any device that perceives its environment and takes actions that maximize its chance of successfully achieving its goals. Colloquially, the term "artificial intelligence" is often used to describe machines that mimic "cognitive" functions that humans associate with the human mind, such as "learning" and "problem solving". As machines become increasingly capable, tasks considered to require "intelligence" are often removed from the definition of AI, a phenomenon known as the AI effect. A quip in Tesler's Theorem says "AI is whatever hasn't been done yet." For instance, optical character recognition is frequently excluded from things considered to be AI, having become a routine technology. Modern machine learning techniques are based on artificial neural networks, particularly deep neural networks. Machine learning is a method of data analysis that automates analytical model building. It is a branch of artificial intelligence based on the idea that systems can learn from data, identify patterns and make decisions with minimal human intervention. The iterative aspect of machine learning is important because as models are exposed to new data, they are able to independently adapt. They learn from previous computations to produce reliable, repeatable decisions and results. While many machine learning algorithms have been around for a long time, the ability to automatically apply complex mathematical calculations to big data over and over, faster and faster is a recent development. Artificial intelligence research is defined as the study of intelligent agents. Any device that perceives its environment and takes actions that maximize its chance of successfully achieving its goals. 
A more elaborate definition characterizes AI as a system's ability to correctly interpret external data, to learn from such data, and to use those learnings to achieve specific goals and tasks through flexible adaptation.
"""

url = "https://en.wikipedia.org/wiki/Artificial_intelligence"

print("Article 1: Artificial Intelligence")

# Prefer the live article; when it is missing or its text is not longer than
# 2000 characters, analyse the bundled sample text instead.
result = analyze_article(url)
if not (result and len(result.get('text', '')) > 2000):
    print(f"\nUsing sample article for {url}")
    print(f"Article length: {len(sample_ai_article)} characters")

    top_words = get_top_words_tfidf(sample_ai_article)
    print("\nTop 5 words:")
    for word, score in top_words:
        print(f"  {word}: {score:.4f}")

    top_sentences = get_top_sentences_tfidf(sample_ai_article)
    print("\nTop 5 sentences:")
    for j, (sentence, score) in enumerate(top_sentences, start=1):
        print(f"  {j}. {sentence[:120]}... (Score: {score:.4f})")

    final_result = dict(url=url, top_words=top_words, top_sentences=top_sentences)
else:
    final_result = result

# --- Plain-text summary of the selected result ---
print("\nSUMMARY")

print(f"\nArticle 1: {final_result['url']}")
print("Top 5 Words:")
for word, score in final_result['top_words']:
    print(f"  {word}: {score:.4f}")

print("Top 5 Sentences:")
for j, (sentence, score) in enumerate(final_result['top_sentences'], start=1):
    print(f"  {j}. {sentence[:100]}... (Score: {score:.4f})")

# --- Tabular summary: word rows first, then sentence rows (cut to 80 chars) ---
summary_data = [
    {'Type': 'Word', 'Content': term, 'TF-IDF Score': weight}
    for term, weight in final_result['top_words']
]
summary_data.extend(
    {'Type': 'Sentence', 'Content': sent[:80] + '...', 'TF-IDF Score': weight}
    for sent, weight in final_result['top_sentences']
)

df = pd.DataFrame(summary_data)
print("\nSUMMARY TABLE")
print(df.to_string(index=False))

# --- Final listing of the same words and sentences ---
print("\nINFO : ")
print(f"Article 1: {final_result['url']}")
print("\nTop 5 words with TF-IDF scores:")
for word, score in final_result['top_words']:
    print(f"  {word}: {score:.4f}")

print("\nTop 5 sentences with TF-IDF scores:")
for j, (sentence, score) in enumerate(final_result['top_sentences'], start=1):
    print(f"  {j}. {sentence[:100]}... (Score: {score:.4f})")


Article 1: Artificial Intelligence

Analyzing article: https://en.wikipedia.org/wiki/Artificial_intelligence
Article length: 95213 characters

Top 5 words:
  ai: 0.0543
  intelligence: 0.0299
  artificial: 0.0232
  learning: 0.0221
  used: 0.0194

Top 5 sentences:
  1. See also Artificial consciousness – Field in cognitive science Artificial intelligence and elections – Use and impact of... (Score: 7.1275)
  2. Applications Main article: Applications of artificial intelligenceAI and machine learning technology is used in most of ... (Score: 5.8458)
  3. High-profile applications of AI include advanced web search engines (e.g., Google Search); recommendation systems (used ... (Score: 5.7188)
  4. Hardware and software Main articles: Programming languages for artificial intelligence and Hardware for artificial intel... (Score: 5.7132)
  5. The sudden success of deep learning in 2012–2015 did not occur because of some new discovery or theoretical breakthrough... (Score: 5.3778)

SUMMARY



In [None]:
# Article 2: Machine Learning - https://en.wikipedia.org/wiki/Machine_learning
# Check for the punkt_tab tokenizer data and the stopwords corpus; a resource
# is downloaded only when nltk.data.find raises LookupError for it.
_required_nltk_resources = {
    'tokenizers/punkt_tab': 'punkt_tab',
    'corpora/stopwords': 'stopwords',
}
for _lookup_path in _required_nltk_resources:
    try:
        nltk.data.find(_lookup_path)
    except LookupError:
        nltk.download(_required_nltk_resources[_lookup_path])

def get_article_text(url):
    """Download a web page and return its main text as one cleaned string.

    The page is requested with a browser-like User-Agent and a 10 second
    timeout. Script/style/navigation/table elements are removed, then the
    text of the first container found among ``div#mw-content-text``,
    ``<main>`` and ``<article>`` is extracted (the whole document is used
    when none of them exists).

    Args:
        url: Address of the page to download.

    Returns:
        The extracted text with whitespace runs collapsed to single spaces,
        or ``None`` when the request or the parsing fails (the error is
        printed rather than raised).
    """
    try:
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
        response = requests.get(url, timeout=10, headers=headers)
        # Fail on HTTP error statuses (404, 429, 5xx, ...) so that an error
        # page is never parsed and scored as if it were the article.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Drop elements that never carry article prose.
        for element in soup(["script", "style", "nav", "footer", "header", "aside", "table"]):
            element.decompose()

        main_content = soup.find('div', {'id': 'mw-content-text'}) or soup.find('main') or soup.find('article')
        if main_content:
            # Remove elements whose class name contains ref/reference/citation.
            for ref in main_content.find_all(['sup', 'span', 'div'], class_=re.compile(r'ref|reference|citation')):
                ref.decompose()

            text = main_content.get_text()
        else:
            text = soup.get_text()

        text = re.sub(r'\s+', ' ', text)
        # Strip numeric footnote markers such as "[12]".
        text = re.sub(r'\[\d+\]', '', text)
        text = re.sub(r'Jump to.*?Edit', '', text)

        return text.strip()
    except Exception as e:
        # Best-effort boundary: callers treat None as "article unavailable".
        print(f"Error fetching article: {e}")
        return None

def preprocess_text(text):
    """Return ``text`` lower-cased, punctuation-free and whitespace-normalised.

    Whitespace runs are collapsed to one space first; afterwards every
    character that is neither a word character nor whitespace is dropped and
    the ends are stripped.
    """
    collapsed = re.sub(r'\s+', ' ', text)
    return re.sub(r'[^\w\s]', '', collapsed.lower()).strip()

def get_top_words_tfidf(text, n=5):
    """Return the ``n`` words of ``text`` with the highest mean TF-IDF weight.

    Sentences with more than five whitespace-separated tokens each serve as
    one document for the vectoriser.

    Args:
        text: Raw article text.
        n: Maximum number of (word, score) pairs to return.

    Returns:
        Up to ``n`` ``(word, mean_score)`` pairs sorted by descending score,
        or an empty list when fewer than two sentences qualify or the
        vectoriser raises ``ValueError``.
    """
    long_enough = []
    for sent in nltk.sent_tokenize(text):
        if len(sent.split()) > 5:
            long_enough.append(sent)

    if len(long_enough) < 2:
        return []

    corpus = list(map(preprocess_text, long_enough))

    model = TfidfVectorizer(
        max_features=1000,
        stop_words='english',
        ngram_range=(1, 1),
        min_df=2,
    )

    try:
        matrix = model.fit_transform(corpus)
        terms = model.get_feature_names_out()

        # Average every vocabulary column over all sentences.
        column_means = np.mean(matrix.toarray(), axis=0)

        scored = sorted(zip(terms, column_means), key=lambda item: item[1], reverse=True)
        return scored[:n]
    except ValueError:
        return []

def get_top_sentences_tfidf(text, n=5):
    """Rank the sentences of ``text`` by the sum of their TF-IDF weights.

    Sentences with eight or fewer whitespace-separated tokens, and sentences
    containing one of the boilerplate markers ("jump to", "edit",
    "retrieved", "archived") as a whole word, are ignored. Unigrams and
    bigrams are both used as features.

    Args:
        text: Raw article text.
        n: Maximum number of (sentence, score) pairs to return.

    Returns:
        Up to ``n`` ``(original_sentence, score)`` pairs, highest score
        first. An empty list is returned when fewer than two sentences
        qualify or when the vectoriser raises ``ValueError``.
    """
    # Match the markers as whole words only: a plain substring test for
    # 'edit' would also throw away sentences containing "credit", "editor",
    # "edition" and similar ordinary words.
    boilerplate = re.compile(r'\b(?:jump to|edit|retrieved|archived)\b')

    sentences = [
        s for s in nltk.sent_tokenize(text)
        if len(s.split()) > 8 and not boilerplate.search(s.lower())
    ]

    if len(sentences) < 2:
        return []

    processed_sentences = [preprocess_text(sent) for sent in sentences]

    vectorizer = TfidfVectorizer(
        max_features=1000,
        stop_words='english',
        ngram_range=(1, 2),
        min_df=1
    )

    try:
        tfidf_matrix = vectorizer.fit_transform(processed_sentences)
        # Row-wise sum: one score per sentence.
        sentence_scores = np.sum(tfidf_matrix.toarray(), axis=1)

        # Pair scores with the original (unprocessed) sentences for display.
        sentence_score_pairs = sorted(
            zip(sentences, sentence_scores),
            key=lambda pair: pair[1],
            reverse=True,
        )

        return sentence_score_pairs[:n]
    except ValueError:
        return []

def analyze_article(url):
    """Download ``url``, print its top TF-IDF words and sentences, and return them.

    Args:
        url: Address of the article to analyse.

    Returns:
        ``None`` when no text was obtained or it has fewer than 1000
        characters; otherwise a dict holding ``'url'``, ``'top_words'``,
        ``'top_sentences'`` and the full ``'text'``.
    """
    print(f"\nAnalyzing article: {url}")

    body = get_article_text(url)
    if not body:
        return None
    if len(body) < 1000:
        return None

    print(f"Article length: {len(body)} characters")

    word_ranking = get_top_words_tfidf(body)
    print("\nTop 5 words:")
    for token, value in word_ranking:
        print(f"  {token}: {value:.4f}")

    sentence_ranking = get_top_sentences_tfidf(body)
    print("\nTop 5 sentences:")
    rank = 0
    for line, value in sentence_ranking:
        rank += 1
        # Sentences are previewed at 120 characters.
        print(f"  {rank}. {line[:120]}... (Score: {value:.4f})")

    return dict(
        url=url,
        top_words=word_ranking,
        top_sentences=sentence_ranking,
        text=body,
    )

# Bundled fallback text about machine learning; it is analysed in place of
# the live article when the download is missing or too short.
sample_ml_article = """
Machine learning (ML) is a subset of artificial intelligence (AI) that provides systems the ability to automatically learn and improve from experience without being explicitly programmed. Machine learning focuses on the development of computer programs that can access data and use it to learn for themselves. The process of learning begins with observations or data, such as examples, direct experience, or instruction, in order to look for patterns in data and make better decisions in the future based on the examples that we provide. The primary aim is to allow the computers to learn automatically without human intervention or assistance and adjust actions accordingly. Machine learning algorithms build a mathematical model based on training data, in order to make predictions or decisions without being explicitly programmed to do so. Machine learning algorithms are used in a wide variety of applications, such as in medicine, email filtering, speech recognition, and computer vision, where it is difficult or unfeasible to develop conventional algorithms to perform the needed tasks. Machine learning is closely related to computational statistics, which focuses on making predictions using computers. The study of mathematical optimization delivers methods, theory and application domains to the field of machine learning. Data mining is a related field of study, focusing on exploratory data analysis through unsupervised learning. Machine learning involves computers discovering how they can perform tasks without being explicitly programmed to do so. It involves computers learning from data provided so that they carry out certain tasks. For simple tasks assigned to computers, it is possible to program algorithms telling the machine how to execute all steps required to solve the problem at hand. For more advanced tasks, it can be challenging for a human to manually create the needed algorithms. 
In practice, it can turn out to be more effective to help the machine develop its own algorithm, rather than having human programmers specify every needed step.
"""

url = "https://en.wikipedia.org/wiki/Machine_learning"

print("Article 2: Machine Learning")

# Prefer the live article; when it is missing or its text is not longer than
# 2000 characters, analyse the bundled sample text instead.
result = analyze_article(url)
if not (result and len(result.get('text', '')) > 2000):
    print(f"\nUsing sample article for {url}")
    print(f"Article length: {len(sample_ml_article)} characters")

    top_words = get_top_words_tfidf(sample_ml_article)
    print("\nTop 5 words:")
    for word, score in top_words:
        print(f"  {word}: {score:.4f}")

    top_sentences = get_top_sentences_tfidf(sample_ml_article)
    print("\nTop 5 sentences:")
    for j, (sentence, score) in enumerate(top_sentences, start=1):
        print(f"  {j}. {sentence[:120]}... (Score: {score:.4f})")

    final_result = dict(url=url, top_words=top_words, top_sentences=top_sentences)
else:
    final_result = result

# --- Plain-text summary of the selected result ---
print("\nSUMMARY")

print(f"\nArticle 2: {final_result['url']}")
print("Top 5 Words:")
for word, score in final_result['top_words']:
    print(f"  {word}: {score:.4f}")

print("Top 5 Sentences:")
for j, (sentence, score) in enumerate(final_result['top_sentences'], start=1):
    print(f"  {j}. {sentence[:100]}... (Score: {score:.4f})")

# --- Tabular summary: word rows first, then sentence rows (cut to 80 chars) ---
summary_data = [
    {'Type': 'Word', 'Content': term, 'TF-IDF Score': weight}
    for term, weight in final_result['top_words']
]
summary_data.extend(
    {'Type': 'Sentence', 'Content': sent[:80] + '...', 'TF-IDF Score': weight}
    for sent, weight in final_result['top_sentences']
)

df = pd.DataFrame(summary_data)
print("\nSUMMARY TABLE")
print(df.to_string(index=False))

# --- Final listing of the same words and sentences ---
print("\nINFO : ")
print(f"Article 2: {final_result['url']}")
print("\nTop 5 words with TF-IDF scores:")
for word, score in final_result['top_words']:
    print(f"  {word}: {score:.4f}")

print("\nTop 5 sentences with TF-IDF scores:")
for j, (sentence, score) in enumerate(final_result['top_sentences'], start=1):
    print(f"  {j}. {sentence[:100]}... (Score: {score:.4f})")


Article 2: Machine Learning

Analyzing article: https://en.wikipedia.org/wiki/Machine_learning
Article length: 67924 characters

Top 5 words:
  learning: 0.0696
  machine: 0.0462
  data: 0.0440
  model: 0.0277
  algorithms: 0.0263

Top 5 sentences:
  1. Machine learning approaches are traditionally divided into three broad categories, which correspond to learning paradigm... (Score: 5.0715)
  2. When dealing with non-linear problems, go-to models include polynomial regression (for example, used for trendline fitti... (Score: 4.9857)
  3. Supervised anomaly detection techniques require a data set that has been labelled as "normal" and "abnormal" and involve... (Score: 4.9582)
  4. Much of the confusion between these two research communities (which do often have separate conferences and separate jour... (Score: 4.9285)
  5. Artificial neural networks have been used on a variety of tasks, including computer vision, speech recognition, machine ... (Score: 4.8287)

SUMMARY

Article 2: https