In [8]:
import requests
# Function Description -- This function retrieves the top Wikipedia URLs related to a given topic.
# The user can optionally specify a category to filter the search results and the number of URLs to be returned.

# Input -- topic (str): The main topic for which Wikipedia URLs are to be retrieved.
#          category (str, optional): An optional category to filter the search results. If provided, only URLs with titles containing the category will be included.
#          num_urls (int): The number of Wikipedia URLs to be extracted (default is 5).

# Output -- Top 5 Wiki URLs related to user given topic
def get_top_urls(topic, category=None, num_urls=5):
    base_url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "format": "json",
        "list": "search",
        "srsearch": topic,
        "srprop": "title",
    }

    response = requests.get(base_url, params=params)
    data = response.json()

    urls = []
    for result in data.get('query', {}).get('search', []):
        title = result['title']
        article_url = f"https://en.wikipedia.org/wiki/{title.replace(' ', '_')}"
        if not category or category.lower() in title.lower():
            urls.append(article_url)

    return urls[:num_urls]

# Global variables to store the topic and top URLs
topic = ""
top_urls = []

def main():
    global topic, top_urls
    topic = input("Enter a topic: ")
    category = input("Enter a category (optional): ")
    top_urls = get_top_urls(topic, category)

    print(f"Top {len(top_urls)} URLs related to '{topic}' ({category if category else 'any category'}):")
    for i, url in enumerate(top_urls, start=1):
        print(f"{i}. {url}")

if __name__ == "__main__":
    main()


Enter a topic: IPL
Enter a category (optional): 
Top 5 URLs related to 'IPL' (any category):
1. https://en.wikipedia.org/wiki/Indian_Premier_League
2. https://en.wikipedia.org/wiki/2024_Indian_Premier_League
3. https://en.wikipedia.org/wiki/Delhi_Capitals
4. https://en.wikipedia.org/wiki/Kolkata_Knight_Riders
5. https://en.wikipedia.org/wiki/Chennai_Super_Kings


In [11]:
import requests
from bs4 import BeautifulSoup

google_urls=[]

def extract_summary_from_wikipedia(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    paragraphs = soup.find_all('p')
    summary = ' '.join(p.get_text() for p in paragraphs[:3])  # Extract the first 3 paragraphs as summary
    return summary

def get_top_google_urls(query, num_urls=5):
    base_url = "https://www.google.com/search"
    params = {
        "q": query,
        "num": num_urls
    }
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36"
    }

    try:
        response = requests.get(base_url, params=params, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        search_results = soup.find_all('div', class_='tF2Cxc')
        urls = [result.a['href'] for result in search_results if result.a]
        return urls
    except requests.RequestException as e:
        print("Error fetching data from Google Search:", e)
        return []

def main(topic, wikipedia_url, google_urls):
    print("Extracting summary from the Wikipedia article...")
    wikipedia_summary = extract_summary_from_wikipedia(wikipedia_url)

    print("Generating Google search query...")
    google_query = f"{topic} {wikipedia_summary}"

    print("Fetching additional URLs from Google...")
    new_google_urls = get_top_google_urls(google_query)
    google_urls.extend(new_google_urls)

    print(f"Top {len(new_google_urls)} URLs related to the Wikipedia article:")
    for i, url in enumerate(new_google_urls, start=1):
        print(f"{i}. {url}")

if __name__ == "__main__":
    # Provide the topic and Wikipedia URL stored from the previous cell
    topic = "Enter your topic here"
    wikipedia_urls = top_urls  # Assuming top_urls is a list of URLs
    for url in wikipedia_urls:
        main(topic, url, google_urls)


Extracting summary from the Wikipedia article...
Generating Google search query...
Fetching additional URLs from Google...
Top 5 URLs related to the Wikipedia article:
1. https://en.wikipedia.org/wiki/Indian_Premier_League
2. https://www.linkedin.com/posts/umarjothi-m-a53263240_the-indian-premier-league-ipl-also-known-activity-7179463474501103617-gZ-s
3. https://www.chegg.com/homework-help/questions-and-answers/indian-premier-league-ipl-twenty20-cricket-format-league-india-league-founded-board-contro-q107480072
4. https://sportssquare.substack.com/p/what-is-the-ipl
5. https://en.wikipedia.org/wiki/2023_Indian_Premier_League
Extracting summary from the Wikipedia article...
Generating Google search query...
Fetching additional URLs from Google...
Top 5 URLs related to the Wikipedia article:
1. https://en.wikipedia.org/wiki/2024_Indian_Premier_League
2. https://ambitiousbaba.com/17th-indian-premier-league-2024-full-details/
3. https://www.iplt20.com/
4. https://en.wikipedia.org/wiki/India

In [12]:
import requests
from bs4 import BeautifulSoup
import nltk
from collections import Counter

# Download NLTK resources (if not already downloaded)
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

# Function Description
# This function extracts keywords from a given text by performing tokenization, part-of-speech tagging, and named entity recognition (NER). It returns a list of named entities containing six or fewer words.

# Input
       # text (str): The input text from which keywords are to be extracted.
# Output
       # list: A list of named entities containing six or fewer words.

def extract_keywords_from_text(text):
    # Tokenize the text
    tokens = nltk.word_tokenize(text)

    # Perform part-of-speech tagging
    pos_tags = nltk.pos_tag(tokens)

    # Perform named entity recognition (NER)
    named_entities = nltk.ne_chunk(pos_tags)

    # Extract named entities of type 'NE' with less than or equal to 6 words
    named_entities_list = []
    for chunk in named_entities:
        if hasattr(chunk, 'label'):
            entity = ' '.join(c[0] for c in chunk)
            if len(entity.split()) <= 6:  # Discard entities with more than 6 words
                named_entities_list.append(entity)

    return named_entities_list


# Function Description
# This function extracts keywords from the content of a list of URLs. For each URL, it retrieves the text, processes it to extract keywords, and aggregates the keywords from all URLs, ensuring uniqueness.

# Input
       # urls (list): A list of URLs from which keywords are to be extracted.
# Output
       # list: A list of unique keywords extracted from the content of the URLs.

def extract_keywords_from_urls(urls):
    all_keywords = []
    for url in urls:
        try:
            response = requests.get(url)
            soup = BeautifulSoup(response.content, 'html.parser')
            text = ' '.join([element.get_text() for element in soup.find_all(text=True)])
            keywords = extract_keywords_from_text(text)
            all_keywords.extend(keywords)
        except Exception as e:
            print(f"Error processing URL '{url}':", e)

    # Remove duplicates while preserving order
    unique_keywords = list(set(all_keywords))
    return unique_keywords


# Input: List of Google URLs
urls = google_urls



# Extract keywords from the provided URLs
unique_keywords = extract_keywords_from_urls(urls)

# Print or save unique keywords
print("Unique Keywords:", unique_keywords)

# Write unique keywords to a file
output_file = "unique_keywords.txt"
with open(output_file, 'w') as f:
    for keyword in unique_keywords:
        f.write(keyword + '\n')

print(f"Unique keywords saved to {output_file}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
  text = ' '.join([element.get_text() for element in soup.find_all(text=True)])


Unique Keywords: ['Q0т', '�h���3�', 'Zyt', 'ŢBg', 'ٿ', 'Statistics Total', 'ODIs', 'Wikipedia Disclaimers Code', 'Moody', 'Gurnoor Brar', 'Svenska', 'UZAY', 'EꩬQ', 'K_', 'Stanford Super Series', 'League T10 Indian Street', '搪kHy���Z', '�wiS�U�0F', 'NA', 'VENUES', 'Unique', '=\x12����U\x12', 'x2G', 'GPV', 'Cricket Development', 'Mutum', 'Billion IPL Cricket', 'INbA', 'Mohit Chauhan', 'PBԷI', 'T_e', 'EIU', 'Dxp', 'gWG', 'Slow', 'DIs', 'Gfϋ3��Ns', 'Sam Billings', '��רv8\x07�5n', 'Gp', 'WSR', 'haK', 'jXet', 'xP', 'Alex Wharf', 'PtY', 'Delhi Capitals Ricky Ponting', 'yXN', 'EO', 'Nz9uY', 'Efn_h', 'Images Summary', '�5���Q', 'JIW', 'Seasons Season', 'hRGM', 'Bha', 'MYg', 'Blood', 'Cape Town Knight Riders', 'Star India', 'NYR', ',���n\x10r�KwZP\x0f', 'RKa4', 'ҁRCצ', 'L3', 'VFƏdw', 'Yܩzd', 'NY', '���h���01�ͺ', 'QѭGuP', 'Wt', 'P �g\x15oK��d', 'cAI', '\x15�\x1aY�3�ȸ\x00', 'J0i1', 'Becomes Fastest Indian', 'F3GZ', 'uD', 'úE', 's8H', 'tߞT', 'Of', 'p2sQ', 'zQE', '���C��dOF��', '�\x0fi�\x13�', 'F4HP

In [13]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import AgglomerativeClustering
import sqlite3
try:
    with open("unique_keywords.txt", "r", encoding="utf-8") as file:
        keywords = file.readlines()
except UnicodeDecodeError as e:
    print("Unicode decoding error:", e)
    # Handle the error appropriately, such as specifying a different encoding or logging the error
except FileNotFoundError as e:
    print("File not found:", e)
    # Handle the file not found error
except Exception as e:
    print("An error occurred:", e)
    # Handle other types of exceptions

# Load keywords from file


# Preprocess keywords: Remove newline characters
keywords = [keyword.strip() for keyword in keywords]

# Vectorize keywords using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(keywords)

# Cluster keywords using hierarchical clustering
num_clusters = 30  # You can adjust this based on your preference
agg_cluster = AgglomerativeClustering(n_clusters=num_clusters)
agg_cluster.fit(X.toarray())

# Create a dictionary to map clusters to keywords
cluster_keywords = {}
for i, label in enumerate(agg_cluster.labels_):
    cluster_keywords.setdefault(label, []).append(keywords[i])

# Print clusters and their keywords
for cluster, keywords in cluster_keywords.items():
    print(f"Cluster {cluster + 1}:")
    print(", ".join(keywords))
    print()


# Function Description
# This function generates representative names for clusters of keywords by computing the sum of TF-IDF scores for each term in the cluster and selecting the top N words with the highest scores.

# Input
#      vectorizer: The fitted vectorizer (e.g., TfidfVectorizer) used to transform the keywords.
#      cluster_keywords (dict): A dictionary where keys are cluster identifiers and values are lists of keywords belonging to each cluster.

# Output
#      dict: A dictionary where keys are cluster identifiers and values are representative names for each cluster, formed by the top N words with the highest TF-IDF scores.

def get_cluster_names(vectorizer, cluster_keywords):
    feature_names = vectorizer.get_feature_names_out()
    cluster_names = {}

    for cluster, keywords in cluster_keywords.items():
        # Vectorize keywords in the cluster
        cluster_X = vectorizer.transform(keywords)
        # Sum TF-IDF scores for each term in the cluster
        tfidf_sum = cluster_X.sum(axis=0).A1  # Convert to 1D array
        # Get indices of the highest TF-IDF scores
        sorted_indices = np.argsort(tfidf_sum)[::-1]
        # Select top N words to represent the cluster
        top_n_words = [feature_names[index] for index in sorted_indices[:3]]
        cluster_names[cluster] = " ".join(top_n_words)

    return cluster_names

# Get cluster names
cluster_names = get_cluster_names(vectorizer, cluster_keywords)

# Print clusters with their names and keywords
conn = sqlite3.connect('clusters2.db')
cursor = conn.cursor()

# Create tables for each cluster with their names
for cluster, keywords in cluster_keywords.items():
    cluster_name = cluster_names[cluster].replace(' ', '_').replace(':', '_')  # Replace invalid characters
    table_name = f"Cluster_{cluster_name}"

    # Create table with keywords and cluster name
    cursor.execute(f"CREATE TABLE IF NOT EXISTS {table_name} (Keyword TEXT)")

    # Insert keywords into the table
    for keyword in keywords:
        cursor.execute(f"INSERT INTO {table_name} (Keyword) VALUES (?)", (keyword,))

# Commit changes and close connection
conn.commit()
conn.close()

print("Clusters and keywords saved to SQLite database.")

Cluster 1:
Q0т, �h���3�, Zyt, ŢBg, ٿ, Statistics Total, ODIs, Wikipedia Disclaimers Code, Moody, Gurnoor Brar, Svenska, UZAY, EꩬQ, K_, 搪kHy���Z, �wiS�U�0F, NA, VENUES, Unique, =����U, x2G, GPV, Mutum, INbA, PBԷI, T_e, EIU, Dxp, gWG, Slow, DIs, Gfϋ3��Ns, Sam Billings, ��רv8�5n, Gp, WSR, haK, jXet, xP, Alex Wharf, PtY, Delhi Capitals Ricky Ponting, yXN, EO, Nz9uY, Efn_h, Images Summary, �5���Q, JIW, Seasons Season, hRGM, Bha, MYg, Blood, NYR, ,���nr�KwZP, RKa4, ҁRCצ, L3, VFƏdw, Yܩzd, NY, ���h���01�ͺ, QѭGuP, Wt, P �goK��d, cAI, �Y�3�ȸ , J0i1, Becomes Fastest Indian, F3GZ, uD, úE, s8H, tߞT, Of, p2sQ, zQE, ���C��dOF��, �i��, F4HP, ݺ, PWrYX, QNy, Lok Sabha, NOT, Swedish, United Arab Emirates, Amit, Check, hzlkF, aE, Kyle, TvT, nUSf, ��Oe5���, pFI, Kapil Dev, KFC Twenty20, D0X, QOm, sQ9, OB���n�xw, LNn, �gR�y�, Ť7, Jeetbo, pG, Ɇ�F�S� /+, gQRp, Harriet Tubman, qBݗ, sIPC, gޘT, TNPSC, Oakland Raiders, nA, Z=�����, ƹX, IEo, њыVڟ, ԟgcȊ0, ڪdY, cUq, Ko, HJ7, ��=�����T, Vfn, H0r, Ӕ, A

In [14]:
import requests
import sqlite3

def get_top_urls(topic, category=None, num_urls=1):
    base_url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "format": "json",
        "list": "search",
        "srsearch": topic,
        "srprop": "title",
    }

    response = requests.get(base_url, params=params)
    data = response.json()

    urls = []
    for result in data.get('query', {}).get('search', []):
        title = result['title']
        article_url = f"https://en.wikipedia.org/wiki/{title.replace(' ', '_')}"
        if not category or category.lower() in title.lower():
            urls.append(article_url)

    return urls[:num_urls]

# Connect to SQLite database
conn_clusters = sqlite3.connect('clusters2.db')
cursor_clusters = conn_clusters.cursor()

# Retrieve all table names in the clusters database
cursor_clusters.execute("SELECT name FROM sqlite_master WHERE type='table';")
tables = cursor_clusters.fetchall()

# Display the list of available cluster tables to the user
print("Available cluster tables:")
for idx, table in enumerate(tables):
    print(f"{idx + 1}. {table[0]}")

# Prompt the user to enter the indices of the cluster tables they want to process
selected_indices = input("Enter the indices of the cluster tables you want to process, separated by commas: ").split(',')
selected_cluster_tables = [tables[int(index.strip()) - 1][0] for index in selected_indices]

print("Selected cluster tables:")
print(selected_cluster_tables)

# Save the selected table names to a text file
with open("selected_tables.txt", "w") as file:
    for table in selected_cluster_tables:
        file.write(table + "\n")

# Connect to SQLite database for storing keyword URLs
conn_keyword_urls = sqlite3.connect('keyword_urls1.db')
cursor_keyword_urls = conn_keyword_urls.cursor()

# Process each selected cluster table
for table_name in selected_cluster_tables:
    print(f"Processing table: {table_name}")
    cursor_clusters.execute(f"SELECT Keyword FROM {table_name}")
    keywords = cursor_clusters.fetchall()
    print(f"Retrieved {len(keywords)} keywords from {table_name}")

    # Create table with cluster name in keyword URLs database
    cursor_keyword_urls.execute(f"CREATE TABLE IF NOT EXISTS {table_name} (Keyword TEXT, URLs TEXT)")

    # Process each keyword and retrieve top URLs
    for keyword_tuple in keywords:
        keyword = keyword_tuple[0]  # Extract keyword from tuple
        top_urls = get_top_urls(keyword)
        print(f"Top URLs for keyword '{keyword}': {top_urls}")

        # Store retrieved URLs in keyword URLs database
        url_string = "\n".join(top_urls)  # Convert list of URLs to a string with newline separators
        cursor_keyword_urls.execute(f"INSERT INTO {table_name} (Keyword, URLs) VALUES (?, ?)", (keyword, url_string))

# Commit changes and close connections
conn_keyword_urls.commit()
conn_keyword_urls.close()
conn_clusters.close()

print("Top URLs retrieved and stored in the SQLite database.")

Available cluster tables:
1. Cluster_twenty20_match_new
2. Cluster_super_lucknow_iyer
3. Cluster_league_indian_captain
4. Cluster_cricket_national_league
5. Cluster_sharma_rohit_mohit
6. Cluster_knight_riders_kolkata
7. Cluster_india_star_tv
8. Cluster_ipl_player_team
9. Cluster_super_kings_chennai
10. Cluster_punjab_kings_patel
11. Cluster_narine_sunil_kolkata
12. Cluster_t20_global_league
13. Cluster_news_hult_community
14. Cluster_royal_mumbai_plessis
15. Cluster_mq_혍v_il
16. Cluster_sports_ndtv_viacom18
17. Cluster_stadium_gandhi_cricket
18. Cluster_rajasthan_royals_riyan
19. Cluster_eu_iiq_ik
20. Cluster_pdf_download_brochure
21. Cluster_group_teams_tata
22. Cluster_khan_shah_rukh
23. Cluster_singh_arijit_navdeep
24. Cluster_hyderabad_sunrisers_head
25. Cluster_kumar_akshay_samson
26. Cluster_gujarat_gill_shubman
27. Cluster_cm_il_ikuu
28. Cluster_quiz_puzzle_work
29. Cluster_ne_혍v_iiu
30. Cluster_eq_iiu_ikuu
Enter the indices of the cluster tables you want to process, separated b

In [15]:
import requests
from bs4 import BeautifulSoup
import sqlite3

def extract_summary_from_wikipedia(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    paragraphs = soup.find_all('p')
    summary = ' '.join(p.get_text() for p in paragraphs[:3])  # Extract the first 3 paragraphs as summary
    return summary

def get_top_google_urls(query, num_urls=5):
    base_url = "https://www.google.com/search"
    params = {
        "q": query,
        "num": num_urls
    }
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36"
    }

    try:
        response = requests.get(base_url, params=params, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        search_results = soup.find_all('div', class_='tF2Cxc')
        urls = [result.a['href'] for result in search_results if result.a]
        return urls
    except requests.RequestException as e:
        print("Error fetching data from Google Search:", e)
        return []

def add_column_if_not_exists(cursor, table_name, column_name, column_type):
    cursor.execute(f"PRAGMA table_info({table_name})")
    columns = [info[1] for info in cursor.fetchall()]
    if column_name not in columns:
        cursor.execute(f"ALTER TABLE {table_name} ADD COLUMN {column_name} {column_type}")

def main(topic, conn, table_name):
    cursor = conn.cursor()

    # Ensure GoogleURLs column exists
    add_column_if_not_exists(cursor, table_name, "GoogleURLs", "TEXT")

    # Query the database to retrieve Wikipedia URLs associated with the keywords
    cursor.execute(f"SELECT URLs FROM {table_name}")
    wikipedia_urls = cursor.fetchall()

    # Process each Wikipedia URL
    for url_tuple in wikipedia_urls:
        url = url_tuple[0]
        print("Extracting summary from the Wikipedia article:", url)
        wikipedia_summary = extract_summary_from_wikipedia(url)

        print("Generating Google search query...")
        google_query = f"{topic} {wikipedia_summary}"

        print("Fetching additional URLs from Google...")
        google_urls = get_top_google_urls(google_query)

        # Store Google URLs in the database
        cursor.execute(f"UPDATE {table_name} SET GoogleURLs=? WHERE URLs=?",
                       ("\n".join(google_urls), url))

        print(f"Top {len(google_urls)} URLs related to the Wikipedia article:")
        for i, google_url in enumerate(google_urls, start=1):
            print(f"{i}. {google_url}")

    # Commit changes
    conn.commit()

if __name__ == "__main__":
    # Read the selected cluster table name from selected_tables.txt
    with open("selected_tables.txt", "r") as file:
        selected_table = file.readline().strip()

    # Set the topic to the selected cluster table name
    topic = selected_table

    # Connect to SQLite database
    conn = sqlite3.connect('keyword_urls1.db')

    # Process Wikipedia URLs associated with keywords for the selected table
    main(topic, conn, selected_table)

    # Close connection
    conn.close()

    print("Google URLs updated in the SQLite database.")


Extracting summary from the Wikipedia article: https://en.wikipedia.org/wiki/Mzansi_Super_League
Generating Google search query...
Fetching additional URLs from Google...
Top 4 URLs related to the Wikipedia article:
1. https://en.wikipedia.org/wiki/Mzansi_Super_League
2. https://en.wikipedia.org/wiki/2018_Mzansi_Super_League
3. https://www.wikiwand.com/en/Mzansi_Super_League
4. https://www.espncricinfo.com/story/everything-you-need-to-know-about-the-mzansi-super-league-1162593
Extracting summary from the Wikipedia article: https://en.wikipedia.org/wiki/Kolkata_Knight_Riders
Generating Google search query...
Fetching additional URLs from Google...
Top 4 URLs related to the Wikipedia article:
1. https://en.wikipedia.org/wiki/Kolkata_Knight_Riders
2. https://www.thedrum.com/profile/kolkata-knight-riders/case-study
3. https://teamshahrukhkhan.com/business-ventures/kolkata-knight-riders/
4. https://simple.wikipedia.org/wiki/Kolkata_Knight_Riders
Extracting summary from the Wikipedia article

In [17]:
!pip install textstat

Collecting textstat
  Downloading textstat-0.7.3-py3-none-any.whl (105 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.1/105.1 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyphen (from textstat)
  Downloading pyphen-0.15.0-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyphen, textstat
Successfully installed pyphen-0.15.0 textstat-0.7.3


In [18]:
!pip install vaderSentiment

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/126.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━[0m [32m81.9/126.0 kB[0m [31m2.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [19]:
######################################## FINAL #################################

import requests
from bs4 import BeautifulSoup
import sqlite3
import nltk
import re
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
import textstat
import spacy
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from datetime import datetime
import numpy as np

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Load SpaCy model for NER
nlp = spacy.load('en_core_web_sm')
sentiment_analyzer = SentimentIntensityAnalyzer()

# Text preprocessing function
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\b\w{1,2}\b', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word.isalnum()]  # Corrected line
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

# Extract content from URL
def extract_content(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36"
    }
    try:
        if not url:
            raise ValueError("URL is empty")
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        paragraphs = soup.find_all('p')
        content = ' '.join(p.get_text() for p in paragraphs)
        return preprocess_text(content), soup
    except requests.exceptions.RequestException as e:
        print(f"Error extracting content from {url}: {e}")
        return "", None

# Keyword density calculation
def keyword_density(content, keyword):
    tokens = word_tokenize(content)
    keyword_tokens = word_tokenize(keyword.lower())
    count = sum(1 for token in tokens if token in keyword_tokens)
    return count / len(tokens) if tokens else 0

# Named Entity Recognition (NER)
def named_entity_recognition(content, keywords):
    doc = nlp(content)
    entities = [ent.text.lower() for ent in doc.ents]
    keyword_entities = [word.lower() for word in keywords.split()]
    common_entities = set(entities).intersection(set(keyword_entities))
    return len(common_entities) / len(keyword_entities) if keyword_entities else 0

# Sentiment Analysis
def sentiment_analysis(content):
    sentiment = sentiment_analyzer.polarity_scores(content)
    return sentiment['compound']

# Heading Tags Analysis
def heading_tags_analysis(soup, keyword):
    headings = soup.find_all(['h1', 'h2', 'h3'])
    keyword_count = sum(keyword.lower() in heading.get_text().lower() for heading in headings)
    return keyword_count / len(headings) if headings else 0

# Multimedia Content Detection
def multimedia_content_detection(soup):
    multimedia_count = len(soup.find_all(['img', 'video', 'audio']))
    return 1 if multimedia_count > 0 else 0

# Authority on Subject
def authority_on_subject(url):
    authoritative_domains = ["wikipedia.org", "imdb.com", "indianexpress.com", "onmanorama.com"]  # Example authoritative domains
    return 1 if any(domain in url for domain in authoritative_domains) else 0

# Recency of Content
def recency_of_content(soup):
    date_tags = soup.find_all(['time', 'span', 'p'], {'class': ['date', 'time', 'published']})
    for date_tag in date_tags:
        try:
            date_text = date_tag.get_text()
            publication_date = datetime.strptime(date_text, '%B %d, %Y')  # Adjust date format as needed
            days_since_publication = (datetime.now() - publication_date).days
            return 1 / (1 + days_since_publication)
        except ValueError:
            continue
    return 0

# Citations and References
def citations_and_references(soup):
    references = len(soup.find_all('a', href=True))
    return references

# Initialize BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Function to get BERT embeddings
def get_bert_embedding(text):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).detach().numpy()

# Read selected table names from the file
with open("selected_tables.txt", "r") as file:
    selected_cluster_tables = [line.strip() for line in file.readlines()]

# Connect to SQLite database
conn = sqlite3.connect('keyword_urls1.db')
cursor = conn.cursor()

# Drop the table if it exists and create a new one with the correct schema
cursor.execute("DROP TABLE IF EXISTS Ranked_URLs")
cursor.execute("""
    CREATE TABLE Ranked_URLs (
        Keyword TEXT,
        Wiki_URL TEXT,
        Google_URL TEXT,
        Combined_Score REAL
    )
""")

# Fetch all rows from the database for each selected cluster table
for table_name in selected_cluster_tables:
    cursor.execute(f"SELECT Keyword, URLs, GoogleURLs FROM {table_name}")
    rows = cursor.fetchall()

    data = []
    combined_scores = []

    for row in rows:
        keyword = row[0]
        wiki_url = row[1]
        google_urls = row[2].split('\n') if row[2] else []

        # Extract and preprocess Wikipedia content
        wiki_content, _ = extract_content(wiki_url)
        if not wiki_content:
            continue
        wiki_embedding = get_bert_embedding(wiki_content)

        for url in google_urls:
            content, soup = extract_content(url)
            if not content or not soup:
                continue

            embedding = get_bert_embedding(content)

            # Calculate individual scores
            similarity_score = cosine_similarity(wiki_embedding, embedding).flatten()[0]
            density = keyword_density(content, keyword)
            length = len(content.split())
            readability = textstat.flesch_reading_ease(content)
            ner_score = named_entity_recognition(content, keyword)
            sentiment_score = sentiment_analysis(content)
            heading_score = heading_tags_analysis(soup, keyword)
            multimedia_score = multimedia_content_detection(soup)
            authority_score = authority_on_subject(url)
            recency_score = recency_of_content(soup)
            references_count = citations_and_references(soup)

            # Store individual scores for normalization
            individual_scores = {
                "similarity_score": similarity_score,
                "density": density,
                "length": length,
                "readability": readability,
                "ner_score": ner_score,
                "sentiment_score": sentiment_score,
                "heading_score": heading_score,
                "multimedia_score": multimedia_score,
                "authority_score": authority_score,
                "recency_score": recency_score,
                "references_count": references_count
            }

            data.append((keyword, wiki_url, url, individual_scores))

    # Normalize individual scores
    for key in ["similarity_score", "density", "length", "readability", "ner_score", "sentiment_score", "heading_score", "multimedia_score", "authority_score", "recency_score", "references_count"]:
        max_value = max(d[3][key] for d in data)
        min_value = min(d[3][key] for d in data)
        for i in range(len(data)):
            data[i][3][key] = (data[i][3][key] - min_value) / (max_value - min_value) if max_value != min_value else 0

    # Calculate and store combined scores
    updated_data = []
    for entry in data:
        combined_score = (
            (entry[3]["similarity_score"] * 0.3) +
            (entry[3]["density"] * 0.2) +
            (entry[3]["length"] * 0.1) +
            (entry[3]["readability"] * 0.1) +
            (entry[3]["ner_score"] * 0.1) +
            (entry[3]["sentiment_score"] * 0.05) +
            (entry[3]["heading_score"] * 0.05) +
            (entry[3]["multimedia_score"] * 0.05) +
            (entry[3]["authority_score"] * 0.05) +
            (entry[3]["recency_score"] * 0.05) +
            (entry[3]["references_count"] * 0.05)
        )
        combined_scores.append(combined_score)
        updated_data.append((entry[0], entry[1], entry[2], combined_score))

    # Normalize combined scores
    max_combined_score = max(combined_scores)
    min_combined_score = min(combined_scores)
    normalized_data = [(d[0], d[1], d[2], (d[3] - min_combined_score) / (max_combined_score - min_combined_score) if max_combined_score != min_combined_score else 0) for d in updated_data]

    # Insert data into the new table
    cursor.executemany("INSERT INTO Ranked_URLs (Keyword, Wiki_URL, Google_URL, Combined_Score) VALUES (?, ?, ?, ?)", normalized_data)

# Commit changes and close the connection
conn.commit()
conn.close()

print("URLs ranked and stored")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Error extracting content from https://www.reddit.com/r/Cricket/comments/15e4mjv/is_something_wrong_with_knight_riders/: 403 Client Error: Blocked for url: https://www.reddit.com/r/Cricket/comments/15e4mjv/is_something_wrong_with_knight_riders/
URLs ranked and stored


In [20]:
import sqlite3

# Connect to SQLite database
conn = sqlite3.connect('keyword_urls1.db')
cursor = conn.cursor()

# Retrieve top 10 URLs based on combined scores
cursor.execute("""
    SELECT Keyword, Wiki_URL, Google_URL, Combined_Score
    FROM Ranked_URLs
    ORDER BY Combined_Score DESC
    LIMIT 10
""")
top_urls = cursor.fetchall()

# Display the top URLs
print("Top 10 URLs based on ranking:")
for rank, (keyword, wiki_url, google_url, combined_score) in enumerate(top_urls, start=1):
    print(f"Rank {rank}:")
    print(f"Keyword: {keyword}")
    print(f"Wikipedia URL: {wiki_url}")
    print(f"Google URL: {google_url}")
    print(f"Combined Score: {combined_score}")
    print()

# Close the connection
conn.close()



Top 10 URLs based on ranking:
Rank 1:
Keyword: KKR Kolkata Knight Riders
Wikipedia URL: https://en.wikipedia.org/wiki/Kolkata_Knight_Riders
Google URL: https://en.wikipedia.org/wiki/Kolkata_Knight_Riders
Combined Score: 1.0

Rank 2:
Keyword: Kolkata Knight Riders League Indian
Wikipedia URL: https://en.wikipedia.org/wiki/Kolkata_Knight_Riders
Google URL: https://en.wikipedia.org/wiki/Kolkata_Knight_Riders
Combined Score: 0.9547632254898426

Rank 3:
Keyword: KKR Kolkata Knight Riders
Wikipedia URL: https://en.wikipedia.org/wiki/Kolkata_Knight_Riders
Google URL: https://simple.wikipedia.org/wiki/Kolkata_Knight_Riders
Combined Score: 0.9364998858455528

Rank 4:
Keyword: Kolkata Knight Riders League Indian
Wikipedia URL: https://en.wikipedia.org/wiki/Kolkata_Knight_Riders
Google URL: https://simple.wikipedia.org/wiki/Kolkata_Knight_Riders
Combined Score: 0.9333290212422177

Rank 5:
Keyword: Ramandeep Singh Kolkata Knight Riders Team
Wikipedia URL: https://en.wikipedia.org/wiki/Kolkata_Knig

In [26]:
import sqlite3

# Function to fetch top URLs and handle duplicates
def fetch_top_urls(conn, limit=10):
    cursor = conn.cursor()

    # Fetch top URLs based on combined scores
    cursor.execute("""
        SELECT Keyword, Wiki_URL, Google_URL, Combined_Score
        FROM Ranked_URLs
        ORDER BY Combined_Score DESC
        LIMIT ?
    """, (limit,))
    top_urls = cursor.fetchall()

    # Create a dictionary to store the highest score for each unique Google URL
    unique_urls = {}

    # Iterate through fetched URLs and store the highest score for each unique Google URL
    for keyword, wiki_url, google_url, combined_score in top_urls:
        if google_url not in unique_urls:
            unique_urls[google_url] = (keyword, wiki_url, combined_score)
        # If Google URL is already in dictionary, update with higher score
        elif combined_score > unique_urls[google_url][2]:
            unique_urls[google_url] = (keyword, wiki_url, combined_score)

    # Sort by combined score descending and take top `limit` results
    filtered_top_urls = sorted(unique_urls.values(), key=lambda x: x[2], reverse=True)[:limit]

    return filtered_top_urls

# Connect to SQLite database
conn = sqlite3.connect('keyword_urls1.db')

try:
    # Fetch and display the top 10 unique URLs
    top_urls = fetch_top_urls(conn, limit=10)

    # Display the top URLs
    print("Top URLs based on ranking (with unique Google URLs):")
    for rank, (keyword, wiki_url, combined_score) in enumerate(top_urls, start=1):
        print(f"Rank {rank}:")
        print(f"Keyword: {keyword}")
        print(f"Wikipedia URL: {wiki_url}")
        # Since we don't have the actual Google URL, we're omitting it from the output
        print(f"Combined Score: {combined_score}")
        print()

finally:
    # Close the connection
    conn.close()


Top URLs based on ranking (with unique Google URLs):
Rank 1:
Keyword: KKR Kolkata Knight Riders
Wikipedia URL: https://en.wikipedia.org/wiki/Kolkata_Knight_Riders
Combined Score: 1.0

Rank 2:
Keyword: KKR Kolkata Knight Riders
Wikipedia URL: https://en.wikipedia.org/wiki/Kolkata_Knight_Riders
Combined Score: 0.9364998858455528

Rank 3:
Keyword: Knight Rider
Wikipedia URL: https://en.wikipedia.org/wiki/Knight_Rider
Combined Score: 0.8963711112212667

Rank 4:
Keyword: KKR Kolkata Knight Riders
Wikipedia URL: https://en.wikipedia.org/wiki/Kolkata_Knight_Riders
Combined Score: 0.8826980251880616

Rank 5:
Keyword: Abu Dhabi Knight Riders
Wikipedia URL: https://en.wikipedia.org/wiki/Abu_Dhabi_Knight_Riders
Combined Score: 0.8625490986495513

