# In [17]:
import random  # For randomizing delay times
import re
import time  # For adding delays
import tkinter as tk
from tkinter import filedialog

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Acronyms / tool names rewritten to the topic they stand for before keyword
# matching. Replacement is case-sensitive plain-substring replacement applied
# in insertion order, so the order of the entries is significant.
# NOTE(review): "Gen AI" is mapped to "generative adversarial networks" as in
# the original table; confirm that this is the intended topic.
_KEYWORD_ALIASES = {
    "DL": "deep learning",
    "NLP": "natural language processing",
    "CV": "computer vision",
    "CNN": "convolutional neural networks",
    "RNN": "recurrent neural networks",
    "Gen AI": "generative adversarial networks",
    "IOT": "internet of things",
    " AR ": "augmented reality",
    " R ": "Programming Concepts",
    " C++ ": "Programming Concepts",
    " C ": "Programming Concepts",
    " Java ": "Programming Concepts",
    " Python ": "Programming Concepts",
    " SQL ": "Database",
    " MongoDB ": "Database",
}

# Topics looked for (as lowercase substrings) in both titles and abstracts.
_POSSIBLE_KEYWORDS = (
    "machine learning",
    "deep learning",
    "big data",
    "artificial intelligence",
    "natural language processing",
    "computer vision",
    "reinforcement learning",
    "data mining",
    "predictive analytics",
    "supervised learning",
    "unsupervised learning",
    "convolutional neural networks",
    "recurrent neural networks",
    "generative adversarial networks",
    "transfer learning",
    "cloud computing",
    "internet of things",
    "robotics",
    "cybersecurity",
    "algorithm",
    "quantum computing",
    "programming concepts",
    "database",
    "augmented reality",
)

# Byline segments are separated by a dash surrounded by whitespace; requiring
# the whitespace keeps hyphenated names ("Jean-Pierre") in one piece.
_BYLINE_SEPARATOR = re.compile(r"\s+-\s+")
_YEAR_PATTERN = re.compile(r"\b(?:19|20)\d{2}\b")
_TRAILING_YEAR = re.compile(r",?\s*(?:19|20)\d{2}\s*$")


def _extract_keywords(text):
    """Return the set of known topic keywords mentioned in *text*."""
    if not text:
        return set()
    for alias, phrase in _KEYWORD_ALIASES.items():
        text = text.replace(alias, phrase)
    # Lowercase only after the replacements: the aliases are case-sensitive.
    lowered = text.lower()
    return {keyword for keyword in _POSSIBLE_KEYWORDS if keyword in lowered}


def _parse_byline(byline):
    """Split a result byline into ``(venue, publisher, year)``.

    The byline is expected to look like ``authors - venue, year - source``.
    Any element that cannot be determined is returned as ``None``; the year
    is returned as a four-digit string.
    """
    parts = [part.strip() for part in _BYLINE_SEPARATOR.split(byline.strip())]
    if len(parts) < 2:
        return None, None, None

    venue_part = parts[1]
    # Only trust the last segment as the publisher/source when it is distinct
    # from the venue segment.
    publisher = parts[-1] if len(parts) > 2 else None

    years = _YEAR_PATTERN.findall(venue_part)
    if not years:
        years = _YEAR_PATTERN.findall(" ".join(parts[2:]))
    year = years[-1] if years else None

    venue = _TRAILING_YEAR.sub("", venue_part).strip(" ,") or None
    return venue, publisher, year


def _parse_result(result):
    """Convert one ``div.gs_ri`` result element into an article dict."""
    title_tag = result.find("h3", class_="gs_rt")
    title = title_tag.text if title_tag is not None else None

    byline_tag = result.find("div", class_="gs_a")
    byline = byline_tag.text if byline_tag is not None else ""
    venue, publisher, year = _parse_byline(byline)

    citations = None
    footer = result.find("div", class_="gs_fl")
    if footer is not None:
        for link in footer.find_all("a"):
            if "Cited by" in link.text:
                citations = link.text.split("Cited by ")[-1]
                break

    abstract_tag = result.find("div", class_="gs_rs")
    abstract_text = abstract_tag.text if abstract_tag is not None else None

    return {
        "Title": title,
        "Abstract": abstract_text,
        "Publisher": publisher,
        "Year": year,
        "Conference/Journal": venue,
        "Keywords": _extract_keywords(abstract_text) | _extract_keywords(title),
        "Citations Count": citations,
    }


def scrape_scholar_articles(query, num_pages):
    """Scrape Google Scholar result pages for *query*.

    Args:
        query: Search text; it is URL-encoded before being sent.
        num_pages: Number of result pages (10 results each) to request.
            Values below 1 perform no request.

    Returns:
        A list with one dict per result, with the keys ``"Title"``,
        ``"Abstract"``, ``"Publisher"``, ``"Year"``, ``"Conference/Journal"``,
        ``"Keywords"`` (a set of strings) and ``"Citations Count"``. Fields
        that cannot be extracted are ``None``.

    Raises:
        requests.RequestException: If a request fails or times out.
    """
    articles = []
    page = 0
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36"
    }

    while page < num_pages:
        response = requests.get(
            "https://scholar.google.com/scholar",
            params={"start": page * 10, "q": query, "hl": "en", "as_sdt": "0,5"},
            headers=headers,
            timeout=30,
        )
        # A non-OK status has no results to parse; stop paging and return
        # what has been collected instead of sending further requests.
        if not response.ok:
            break

        soup = BeautifulSoup(response.text, "html.parser")
        for result in soup.find_all("div", class_="gs_ri"):
            articles.append(_parse_result(result))

        page += 1

        # Random 1-5 second delay between requests to avoid getting blocked;
        # pointless after the final page.
        if page < num_pages:
            time.sleep(random.uniform(1, 5))

    return articles

def save_to_excel(articles, filename):
    """Write *articles* to an Excel workbook at *filename*.

    Args:
        articles: Row data accepted by ``pandas.DataFrame`` (here, a list of
            dicts).
        filename: Destination path of the workbook.

    Collection-valued cells are flattened to comma-separated text before
    writing so that the spreadsheet holds readable, deterministic values
    rather than the repr of a Python object. Sets are sorted because their
    iteration order is arbitrary. The input is not modified.
    """

    def _as_cell(value):
        if isinstance(value, (set, frozenset)):
            return ", ".join(sorted(map(str, value)))
        if isinstance(value, (list, tuple)):
            return ", ".join(map(str, value))
        return value

    df = pd.DataFrame(articles)
    for column in df.columns:
        # Only object-dtype columns can hold Python collections.
        if df[column].dtype == object:
            df[column] = df[column].map(_as_cell)
    df.to_excel(filename, index=False)

def browse_folder():
    """Let the user pick an output folder and show it in the folder entry.

    If the dialog is cancelled (an empty path is returned) the entry keeps
    its current content instead of being wiped.
    """
    folder_path = filedialog.askdirectory()
    if not folder_path:
        return
    entry_folder.delete(0, tk.END)
    entry_folder.insert(tk.END, folder_path)

def scrape_articles():
    """Handle the "Extract Data" button.

    Reads the query, page count and optional output folder from the form,
    runs the scraper, saves the result and reports the outcome in the status
    label. Invalid input and network/file errors are reported in the status
    label instead of raising inside the Tk callback.
    """
    query = entry_query.get().strip()
    if not query:
        label_status.config(text="Please enter an article title or keyword.")
        return

    try:
        num_pages = int(entry_pages.get())
    except ValueError:
        label_status.config(text="Number of pages must be a whole number.")
        return
    if num_pages < 1:
        label_status.config(text="Number of pages must be at least 1.")
        return

    folder_path = entry_folder.get()
    if folder_path:
        filename = f"{folder_path}/G_phs.xlsx"
    else:
        filename = "scholar_articles.xlsx"

    try:
        articles = scrape_scholar_articles(query, num_pages)
        save_to_excel(articles, filename)
    except (requests.RequestException, OSError) as error:
        # Network failures and unwritable output files (e.g. the workbook is
        # open in another program) are expected at this UI boundary.
        label_status.config(text=f"Extraction failed: {error}")
        return

    # Report the path actually written, which depends on the folder field.
    label_status.config(text=f"Extraction complete. Data saved to {filename}.")

# Main application window
window = tk.Tk()
window.title("Google Scholar Scraper")
window.geometry("400x250")

# Widgets, created in the order they appear from top to bottom.
# entry_query, entry_pages, entry_folder and label_status are read by the
# button callbacks, so they stay module-level names.
label_query = tk.Label(window, text="Article Title or Keyword:")
entry_query = tk.Entry(window, width=40)

label_pages = tk.Label(window, text="Number of Pages:")
entry_pages = tk.Entry(window, width=40)

label_folder = tk.Label(window, text="Output Folder (optional):")
entry_folder = tk.Entry(window, width=40)

button_browse = tk.Button(window, text="Browse", command=browse_folder)
button_extract = tk.Button(window, text="Extract Data", command=scrape_articles)

label_status = tk.Label(window, text="")

# Layout: stack every widget vertically in display order.
for _widget in (
    label_query,
    entry_query,
    label_pages,
    entry_pages,
    label_folder,
    entry_folder,
    button_browse,
    button_extract,
    label_status,
):
    _widget.pack()

# Hand control to Tk's event loop (blocks until the window is closed).
window.mainloop()
