# Data Extraction

## Author Data Extraction

In [10]:
from scholarly import scholarly
import pandas as pd

def get_author_details(author_name, author_affiliation):
    """Find an author via ``scholarly`` and return the filled-in record.

    The name is searched with ``scholarly.search_author``; the first result
    whose affiliation contains ``author_affiliation`` is selected. The
    comparison is a case-insensitive substring test, and whitespace around
    either input is ignored.

    Args:
        author_name: Name handed to the author search.
        author_affiliation: Text that must occur inside the candidate's
            affiliation. A blank value matches the first search result.

    Returns:
        Whatever ``scholarly.fill`` returns for the first matching author,
        or ``None`` when no search result matches.
    """
    # Typed input frequently carries stray leading/trailing spaces, which
    # would otherwise make the substring test fail for a correct keyword.
    wanted = author_affiliation.strip().casefold()

    for author in scholarly.search_author(author_name.strip()):
        # ``or ""`` covers a record whose affiliation key exists but is None,
        # which ``.get(key, "")`` alone would not protect against.
        affiliation = (author.get("affiliation") or "").casefold()
        if wanted in affiliation:
            return scholarly.fill(author)
    return None

def get_publication_details(publication):
    """Return ``publication`` after passing it through ``scholarly.fill``."""
    filled_publication = scholarly.fill(publication)
    return filled_publication

def save_to_excel(publication_data, all_years, filename="author_details_output.xlsx"):
    """Write one spreadsheet row per publication to ``filename``.

    Column order is: the fixed bibliographic fields, then one column per
    entry of ``all_years`` (that year's citation count, 0 when the
    publication has none recorded), and finally the description.

    Args:
        publication_data: Iterable of dicts holding the fixed fields, a
            "description" and optionally a "citations_per_year" mapping.
        all_years: Years to emit as columns, in the order given.
        filename: Path of the Excel file to write.
    """
    leading_columns = (
        "title",
        "authors",
        "journal",
        "publication date",
        "total citation",
        "volume",
        "publisher",
    )

    def build_record(entry):
        # Fixed fields first so they become the left-most columns.
        record = {column: entry[column] for column in leading_columns}

        # Year columns next, in the caller-supplied order.
        yearly_counts = entry.get("citations_per_year", {})
        record.update({str(year): yearly_counts.get(year, 0) for year in all_years})

        # Description goes last.
        record["description"] = entry["description"]
        return record

    table = pd.DataFrame([build_record(entry) for entry in publication_data])
    table.to_excel(filename, index=False)
    print(f"\nData saved to {filename}")

if __name__ == "__main__":
    # Ask which author to look up and which affiliation text should be used
    # to choose among the search results.
    author_name = input("Enter the author's name: ")
    author_affiliation = input("Enter the author's affiliation (or keyword): ")
    details = get_author_details(author_name, author_affiliation)

    if not details:
        print("Author not found. Check the name or affiliation keyword.")
    else:
        print("\nFetching publication details...")
        publications = details.get("publications", [])

        data = []
        all_years = set()

        # Only the first 290 publications are fetched; raise the bound if
        # more are needed.
        for pub in publications[:290]:
            detailed_pub = get_publication_details(pub)
            bib = detailed_pub.get("bib", {})
            cites_per_year = detailed_pub.get("cites_per_year", {})

            # Remember every year seen so each one gets its own column.
            all_years.update(cites_per_year.keys())

            pub_year = bib.get("pub_year", "N/A")
            pub_month = bib.get("pub_month", "N/A")
            pub_day = bib.get("pub_day", "N/A")

            # A full year/month/day string is used only when all three parts
            # are known; otherwise fall back to the year (possibly "N/A").
            date_parts = (pub_year, pub_month, pub_day)
            if all(part != "N/A" for part in date_parts):
                pub_date = "{}/{}/{}".format(*date_parts)
            else:
                pub_date = pub_year

            record = {
                "title": bib.get("title", "Unknown Title"),
                "authors": bib.get("author", "N/A"),
                "journal": bib.get("journal", "N/A"),
                "publication date": pub_date,
                "total citation": detailed_pub.get("num_citations", "N/A"),
                "volume": bib.get("volume", "N/A"),
                "publisher": bib.get("publisher", "N/A"),
                "description": bib.get("abstract", "N/A"),
                "citations_per_year": cites_per_year,
            }
            data.append(record)

        save_to_excel(data, sorted(all_years))

Enter the author's name:  Rajendra Prasad
Enter the author's affiliation (or keyword):  Professor Dean, Amity University Haryana


Author not found. Check the name or affiliation keyword.


## Article Data Extraction

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import tkinter as tk
from tkinter import filedialog

def scrape_scholar_articles(query, num_pages):
    """Scrape Google Scholar search result pages for ``query``.

    Requests ``num_pages`` consecutive result pages (offsets 0, 10, 20, ...)
    and extracts one record per ``div.gs_ri`` result element.

    Args:
        query: Search text; it is URL-encoded before being sent.
        num_pages: Number of result pages to request (an integer).

    Returns:
        A list of dicts with the keys "Title", "Authors", "Link" and
        "Citations". Scraping stops early, returning what was collected so
        far, as soon as a page request returns an error status.

    Raises:
        requests.RequestException: If a request fails or times out.
    """
    articles = []

    for page in range(num_pages):
        # Let requests build the query string: interpolating the raw query
        # into the URL breaks on characters such as "&", "#" or "+".
        response = requests.get(
            "https://scholar.google.com/scholar",
            params={"start": page * 10, "q": query, "hl": "en", "as_sdt": "0,5"},
            timeout=30,  # without a timeout a stalled connection hangs forever
        )
        if not response.ok:
            # An error page contains no results, and later pages would be
            # requested from the same failing endpoint, so stop here.
            break

        soup = BeautifulSoup(response.text, "html.parser")

        for result in soup.find_all("div", class_="gs_ri"):
            title_tag = result.find("h3", class_="gs_rt")
            if title_tag is None:
                # Not a regular result entry; nothing meaningful to record.
                continue

            authors_tag = result.find("div", class_="gs_a")
            link_tag = result.find("a")

            # The footer links hold the "Cited by N" anchor when the article
            # has citations; the footer itself may be missing.
            footer = result.find("div", class_="gs_fl")
            footer_links = footer.find_all("a") if footer is not None else []
            citation_text = next(
                (a.text for a in footer_links if "Cited by" in a.text),
                "Cited by 0",
            )
            try:
                citations = int(citation_text.replace("Cited by", "").strip())
            except ValueError:
                # Unexpected label format; report zero rather than abort.
                citations = 0

            articles.append({
                "Title": title_tag.text,
                "Authors": authors_tag.text if authors_tag is not None else "N/A",
                "Link": link_tag["href"] if link_tag else "N/A",
                "Citations": citations,
            })

    return articles

def save_to_excel(articles, filename):
    """Write ``articles`` to an Excel file at ``filename``, omitting the index."""
    pd.DataFrame(articles).to_excel(filename, index=False)

def browse_folder():
    """Let the user pick an output folder and show it in ``entry_folder``.

    If the dialog is dismissed without a selection, the entry's current
    contents are left untouched.
    """
    folder_path = filedialog.askdirectory()
    if not folder_path:
        # Cancelled dialog yields an empty value; overwriting the entry with
        # it would discard a path the user had already entered.
        return

    entry_folder.delete(0, tk.END)
    entry_folder.insert(tk.END, folder_path)

def scrape_articles():
    """Run a scrape using the form values and save the results to Excel.

    Reads the query, page count and optional output folder from the entry
    widgets, scrapes the articles, writes them to a spreadsheet and reports
    the outcome in ``label_status``.
    """
    query = entry_query.get()

    try:
        num_pages = int(entry_pages.get())
    except ValueError:
        # Blank or non-numeric input: tell the user instead of letting the
        # exception escape the button callback.
        label_status.config(text="Number of Pages must be a whole number.")
        return

    articles = scrape_scholar_articles(query, num_pages)

    folder_path = entry_folder.get()
    if folder_path:
        filename = f"{folder_path}/scholar_articles.xlsx"
    else:
        # No folder chosen: fall back to a file in the working directory.
        filename = "Health_articles.xlsx"

    save_to_excel(articles, filename)
    # Report the file actually written; the fallback name differs from the
    # name used when a folder is selected.
    label_status.config(text=f"Extraction complete. Data saved to {filename}.")

# Build the main application window (fixed 400x250 initial size).
window = tk.Tk()
window.title("Google Scholar Scraper")
window.geometry("400x250")

# Input fields, each preceded by its label. Widgets are packed in creation
# order, so the order of these statements determines the layout.
# entry_query is read by scrape_articles() as the search text.
label_query = tk.Label(window, text="Article Title or Keyword:")
label_query.pack()
entry_query = tk.Entry(window, width=40)
entry_query.pack()

# entry_pages is read by scrape_articles() and converted with int().
label_pages = tk.Label(window, text="Number of Pages:")
label_pages.pack()
entry_pages = tk.Entry(window, width=40)
entry_pages.pack()

# entry_folder is filled in by browse_folder() and read by scrape_articles();
# when left empty, scrape_articles() uses its default file name.
label_folder = tk.Label(window, text="Output Folder (optional):")
label_folder.pack()
entry_folder = tk.Entry(window, width=40)
entry_folder.pack()

# Browse button: opens the folder chooser via browse_folder().
button_browse = tk.Button(window, text="Browse", command=browse_folder)
button_browse.pack()

# Extract button: starts scraping and saving via scrape_articles().
button_extract = tk.Button(window, text="Extract Data", command=scrape_articles)
button_extract.pack()

# Status label: initially empty; scrape_articles() sets its text when done.
label_status = tk.Label(window, text="")
label_status.pack()

# Enter the Tk event loop; this call blocks until the window is closed.
window.mainloop()