# **Library 📚**

In [None]:
from bs4 import BeautifulSoup
import requests, json, lxml
import pandas as pd
from statistics import mean
from collections import Counter
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import time
import timeit

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# **Get data from Sheet ✅📂**

In [None]:
from google.colab import auth
import gspread
from google.auth import default
# Authenticate this Colab session with Google, then build the gspread client
# (`gc`) from the resulting default credentials.
auth.authenticate_user()
# default() returns a pair; only the credentials object is kept.
creds, _ = default()
gc = gspread.authorize(creds)

In [None]:
import pandas as pd
# Open the first sheet of the 'Keywords' spreadsheet.
worksheet = gc.open('Keywords').sheet1
# get_all_values gives a list of rows (the header row is still the first entry).
rows = worksheet.get_all_values()
# Convert to a DataFrame; columns are positional (0, 1, 2, ...) at this point.
df = pd.DataFrame(rows)
df

Unnamed: 0,0,1,2,3
0,No,PiC,Theme,Keyword
1,1,Thành,Language,english for communication
2,2,,,english for children
3,3,,,english course online free
4,4,,,language skills
...,...,...,...,...
142,75,,,education websites
143,76,,,definition of education
144,77,,,education philosophy
145,78,,,local education agency


In [None]:
# Promote the first sheet row to column names and keep the remaining rows.
# The frame is rebuilt from `rows` instead of re-slicing `df`, so this cell is
# idempotent: running it a second time no longer turns the first keyword row
# into the header and drops it.
# The index starts at 1 so every row keeps the same label it had before
# (label 0 was the header row); later cells look rows up by these labels.
header_row, *body_rows = rows
df = pd.DataFrame(body_rows, columns=header_row, index=range(1, len(rows)))
df

Unnamed: 0,No,PiC,Theme,Keyword
1,1,Thành,Language,english for communication
2,2,,,english for children
3,3,,,english course online free
4,4,,,language skills
5,5,,,learning language benefits
...,...,...,...,...
142,75,,,education websites
143,76,,,definition of education
144,77,,,education philosophy
145,78,,,local education agency


In [None]:
# Pull the Keyword column out as a plain Python list and show its size and content.
keywords = df["Keyword"].tolist()
print(len(keywords), keywords)

146 ['english for communication', 'english for children', 'english course online free', 'language skills', 'learning language benefits', 'how to learn english effectively', 'good university', 'Education center', 'Technical schools near me', 'What should I major in?', 'trending major', 'physical education', 'education portal', 'higher education', 'high education', 'Learning experience', 'medical education', 'women education', 'patient education', 'how to become a richman', 'environmental education', 'masters of education', 'what is common core education', 'what is the purpose of education', 'Marketing skills', 'market research', 'marketing plan', 'social advertising', 'finance skills', 'leadership skills', 'writing a good CV', 'how to write a proper CV', 'how to improve my communication skills', 'how to study well', 'active learning', 'how technology has changed education', 'elementary school', 'department of education', 'homeschooling', 'financial education', 'educational games', 'Lear

In [None]:
# Rows whose keyword repeats one seen earlier in the sheet
# (pandas' default keeps the first occurrence unflagged).
duplicated = df[df["Keyword"].duplicated()]
duplicated_list = duplicated["Keyword"].tolist()
print(duplicated_list)

['masters of education', 'vocational education', 'continuing education units', 'education consulting', 'landmark education', 'department of higher education', 'education websites', 'definition of education', 'education philosophy', 'local education agency', 'special education teachers']


In [None]:
# Row label of the first row whose keyword is exactly "education jobs".
df[df["Keyword"].eq("education jobs")].index[0]

66

In [None]:
# Restrict the run to the keywords whose row label is 81 or higher
# (.loc slicing is label-based, so label 81 itself is included).
# NOTE(review): 81 is hard-coded - presumably the row where an earlier scraping
# session stopped; confirm before re-running, because this replaces whatever
# `keywords` held before.
keywords = df['Keyword'].loc[81:].tolist()
print(keywords)

['education quotes', 'education galaxy', 'sexual education in schools', 'bad education', 'early childhood education', 'board of education', 'apple education', 'education post', 'education board', 'distance education', 'apply to education', 'education first', 'secondary education', 'education department', 'times higher education', 'google apps for education', 'continuing education', 'adult education', 'quotes about education', 'windows 10 education', 'sex education videos', 'philosophies of education', 'special education', 'quotes on education', 'educated definition', 'college education', 'post secondary education', 'general education', 'right to education', 'google education', 'online education', 'ms department of education', 'drivers education online', 'autodesk education', 'weebly education', 'high education jobs', 'health education', 'inclusive education', 'higher education jobs', 'primary education', 'education connection', 'office 365 education', 'education games for kids', 'apple

# **All used functions: 🧱**

In [None]:
# Request headers read (as a global) by the page-fetching helper functions;
# the User-Agent string is that of a desktop Chrome browser on Windows.
headers = {
      "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"
  }

In [None]:
def total_text_count(url):
  """Return the number of whitespace-separated words in the page's text.

  The page at ``url`` is downloaded with the shared ``headers`` and parsed
  with BeautifulSoup's lxml parser; the count is taken over ``get_text()``.

  Args:
    url: address of the page to fetch.

  Returns:
    int: word count of the extracted text (0 for a page without text).

  Raises:
    requests.exceptions.RequestException: if the download fails or times out
      (30 s timeout).
  """
  # NOTE: verify=False turns off TLS certificate verification for this request.
  html = requests.get(url, headers=headers, verify=False, timeout=30)
  soup = BeautifulSoup(html.text, 'lxml')

  # A separator is required: with strip=True alone the stripped strings are
  # joined with "", so the last word of one element fuses with the first word
  # of the next (e.g. "<p>a b</p><p>c d</p>" -> "a bc d") and words are lost.
  text = soup.get_text(separator=' ', strip=True)
  return len(text.split())





def get_text_tag(url, tag):
  """Collect the stripped text of every ``tag`` element on the page at ``url``.

  Returns a 3-tuple ``(texts, count, avg_len)``: the list of element texts,
  how many elements were found, and the mean number of words per element
  rounded to two decimals (``None`` when no element was found).
  """
  response = requests.get(url, headers=headers, verify=False, timeout=30)
  parsed = BeautifulSoup(response.text, 'lxml')

  texts = [node.text.strip() for node in parsed.find_all(tag)]

  if texts:
    words_per_element = [len(entry.split()) for entry in texts]
    avg_len = round(mean(words_per_element), 2)
  else:
    avg_len = None

  return texts, len(texts), avg_len




def keyword_counter(keyword, tag):
  """Count how often each non-stop-word of ``keyword`` occurs in ``tag``.

  Args:
    keyword: search phrase; it is lower-cased, tokenized with NLTK and
      stripped of English stop words.
    tag: iterable of text fragments (e.g. the texts of all <h2> elements).
      ``None`` and empty entries are skipped.

  Returns:
    dict mapping each keyword word that occurs at least once to its number of
    occurrences, in the order the words appear in ``keyword``; ``None`` when
    no keyword word occurs at all.
  """
  stop_words = set(stopwords.words('english'))
  # Single filtering pass (the original computed this list twice and threw
  # the first result away).
  wanted = [w for w in word_tokenize(keyword.lower()) if w not in stop_words]

  counts = Counter()
  for sentence in tag:
    if sentence:
      # Trim surrounding punctuation so "education," matches "education".
      counts.update(word.strip('.,?!"\'').lower() for word in sentence.split())

  # Iterate over the keyword words rather than a set intersection so the key
  # order of the result does not depend on set iteration order.
  res = {w: counts[w] for w in wanted if w in counts}
  return res or None




def link_keyword_counter(keyword, url):
  """Count how often each non-stop-word of ``keyword`` occurs in ``url``.

  The URL is split on ``/``, ``-``, ``_``, ``.`` and spaces; each piece is
  lower-cased and compared with the keyword words.

  Args:
    keyword: search phrase; it is lower-cased, tokenized with NLTK and
      stripped of English stop words.
    url: the address to inspect (only the string is used, nothing is fetched).

  Returns:
    dict mapping each keyword word found in the URL to its number of
    occurrences, in the order the words appear in ``keyword``; ``None`` when
    none is found.
  """
  stop_words = set(stopwords.words('english'))
  # Single filtering pass; the duplicate pass and the debug prints of the
  # token lists were removed.
  wanted = [w for w in word_tokenize(keyword.lower()) if w not in stop_words]

  # Raw string: "\." inside a normal literal is an invalid string escape that
  # newer Python versions warn about.
  url_parts = re.split(r'/|-|_|\.| ', url)

  counts = Counter()
  for part in url_parts:
    counts.update(word.strip('.,?!"\'').lower() for word in part.split())

  res = {w: counts[w] for w in wanted if w in counts}
  return res or None





def links_counter(url):
  """Collect anchor texts and count internal vs. external links on a page.

  A link is internal when, after resolving its href against ``url``, it is
  an http(s) address on the same host as the page (a leading "www." is
  ignored on both sides). Everything else - other hosts, ``mailto:``,
  ``tel:``, ``javascript:``, unparsable hrefs - is counted as external.
  Anchors without an ``href`` attribute are not counted as links.

  The previous implementation compared hrefs with ``startswith(url)``, i.e.
  against the full page address, so absolute links to other pages of the same
  site and relative links such as "about.html" were counted as external,
  while protocol-relative links to other hosts ("//cdn.example.org/x") were
  counted as internal.

  Args:
    url: address of the page to fetch.

  Returns:
    tuple ``(a_text, internalLinks, externalLinks)``: the ``.string`` of every
    anchor that has one, and the two link counts.

  Raises:
    requests.exceptions.RequestException: if the download fails or times out.
  """
  # Function-scope import keeps this helper self-contained.
  from urllib.parse import urljoin, urlparse

  def _host(parsed):
    # Lower-cased host name without a leading "www."; "" when there is none.
    host = parsed.hostname or ""
    return host[4:] if host.startswith("www.") else host

  html = requests.get(url, headers=headers, verify=False, timeout=30)
  soup = BeautifulSoup(html.text, 'lxml')
  page_host = _host(urlparse(url))

  internalLinks = 0
  externalLinks = 0
  a_text = []
  for anchor in soup.find_all("a"):
    if anchor.string:
      a_text.append(anchor.string)
    href = anchor.get('href')
    if href is None:
      continue
    try:
      target = urlparse(urljoin(url, href.strip()))
      same_site = target.scheme in ("http", "https") and _host(target) == page_host
    except ValueError:
      # Malformed href (e.g. broken IPv6 brackets): it cannot point into this site.
      same_site = False
    if same_site:
      internalLinks += 1
    else:
      externalLinks += 1

  return a_text, internalLinks, externalLinks




def metadesc_find(url, keyword):
  """Find the page's meta description and count keyword words in it.

  The description is taken from ``<meta name="description">`` and, when that
  is absent or empty, from ``<meta property="og:description">``. The first
  tag with non-empty content wins.

  NOTE(review): the previous implementation read ``og:title`` although the
  function, its variables and the output columns it feeds are all named after
  the meta description. Rows scraped before this change therefore hold title
  data in the description columns.

  Args:
    url: address of the page to fetch.
    keyword: search phrase; it is lower-cased and split on whitespace (stop
      words are not removed here).

  Returns:
    tuple ``(desc, desc_len, res)``: the description text (``None`` if the
    page has none), its length in words (0 if none) and a dict mapping each
    keyword word found in the description to its number of occurrences
    (``None`` when nothing matches, like the other keyword counters).

  Raises:
    requests.exceptions.RequestException: if the download fails or times out.
  """
  keyword_words = keyword.lower().split()
  html = requests.get(url, headers=headers, verify=False, timeout=30)
  soup = BeautifulSoup(html.text, 'lxml')
  meta_tags = soup.find_all("meta")

  desc = None
  for attr, wanted in (("name", "description"), ("property", "og:description")):
    for tag in meta_tags:
      # Attribute values are compared case-insensitively ("Description" occurs too).
      if (tag.get(attr) or "").strip().lower() != wanted:
        continue
      content = tag.get("content")
      if content and content.strip():
        desc = content
        break
    if desc:
      break

  if not desc:
    return None, 0, None

  desc_words = desc.split()
  # Counts come from this single description only; previously the counter kept
  # accumulating over every matching tag while `desc` held just the last one.
  counts = Counter(word.strip('.,?!"\'').lower() for word in desc_words)
  res = {w: counts[w] for w in keyword_words if w in counts}
  return desc, len(desc_words), (res or None)




def image_alt_find(url):
  """Return the number of <img> elements on the page and their alt texts.

  The second element of the returned pair holds one entry per image: the
  stripped ``alt`` attribute, or ``None`` when the attribute is missing or
  empty.
  """
  response = requests.get(url, headers=headers, verify=False, timeout=30)
  page = BeautifulSoup(response.text, 'lxml')

  alt_texts = []
  for img in page.find_all("img"):
    raw_alt = img.get("alt")
    alt_texts.append(raw_alt.strip() if raw_alt else None)

  # One entry is appended per image, so the list length is the image count.
  return len(alt_texts), alt_texts



# **Main 🎠**

In [None]:
# Scrape the Google results for every keyword and extract on-page features
# from each result page into `data` (one dict per result).
#
# Tip: for a quick smoke test, set `keywords` to a short list such as
# ['education jobs'] before running this cell.
#
# NOTE: every helper called below downloads the result page again, so each
# result costs ten page requests (six get_text_tag calls plus four others).
MAX_RESULTS = 10   # results analysed per keyword
PAGE_LIMIT = 2     # maximum number of Google result pages requested per keyword
REQUEST_PAUSE = 3  # seconds slept before each Google request and each result

data = []

for k in keywords:
  params = {
      "q": k,
      "hl": "en",
      "gl": "vn",
      "start": 0,
  }

  page_num = 0
  i = 0  # rank of the current result, counted across result pages

  while True:
    time.sleep(REQUEST_PAUSE)
    page_num += 1
    # `headers` is the shared User-Agent dict also used by the helper functions.
    html = requests.get("https://www.google.com/search", params=params, headers=headers, verify=False, timeout=30)
    html.raise_for_status()
    soup = BeautifulSoup(html.text, 'lxml')

    for result in soup.select(".tF2Cxc"):
      # Checked before sleeping/counting. The old `i == 11` test only fired at
      # exactly 11, so a second page could keep adding ranks 12, 13, ...
      if i >= MAX_RESULTS:
        break
      time.sleep(REQUEST_PAUSE)
      i += 1

      title_el = result.select_one(".DKV0Md")
      link_el = result.select_one(".yuRUbf a")
      if title_el is None or link_el is None or not link_el.get("href"):
        # A result block without title or link would otherwise abort the whole run.
        print("Skipping result without title/link at rank", i)
        continue
      title = title_el.text
      link = link_el["href"]

      snippet_el = result.select_one(".lEBKkf ")
      snippet = snippet_el.text.strip() if snippet_el is not None else None

      print(link)
      try:
        total_words = total_text_count(link)

        h1_lst, h1_num, h1_len = get_text_tag(link, "h1")
        h2_lst, h2_num, h2_len = get_text_tag(link, "h2")
        h3_lst, h3_num, h3_len = get_text_tag(link, "h3")
        header_total = h1_num + h2_num + h3_num

        p_lst, p_num, p_len = get_text_tag(link, "p")
        footer_lst, footer_num, footer_len = get_text_tag(link, "footer")

        img_count, img_alt = image_alt_find(link)
        print(img_count, img_alt)

        a_text, internalLinks, externalLinks = links_counter(link)
        total_link = internalLinks + externalLinks

        h1_kcount = keyword_counter(k, h1_lst)
        h2_kcount = keyword_counter(k, h2_lst)
        h3_kcount = keyword_counter(k, h3_lst)
        p_kcount = keyword_counter(k, p_lst)
        a_kcount = keyword_counter(k, a_text)
        footer_kcount = keyword_counter(k, footer_lst)
        link_kcount = link_keyword_counter(k, link)
        imalt_kcount = keyword_counter(k, img_alt)

        meta_desc, meta_desc_len, meta_kcount = metadesc_find(link, k)

        # 1 when the page has at least one <title> element, else 0.
        ti_lst, ti_num, ti_len = get_text_tag(link, "title")
        ti_used = 1 if ti_lst else 0

        data.append({
          "keyword": k,
          "rank": i,
          "title": title,
          "snippet": snippet,
          "link": link,
          "total_words": total_words,
          "h1_num": h1_num,
          "h1_len": h1_len,
          "h2_num": h2_num,
          "h2_len": h2_len,
          "h3_num": h3_num,
          "h3_len": h3_len,
          "header_total": header_total,
          "img_count": img_count,
          "internalLinks": internalLinks,
          "externalLinks": externalLinks,
          "total_link": total_link,
          "h1_kcount": h1_kcount,
          "h2_kcount": h2_kcount,
          "h3_kcount": h3_kcount,
          "p_kcount": p_kcount,
          "a_kcount": a_kcount,
          "footer_kcount": footer_kcount,
          "link_kcount": link_kcount,
          "imalt_kcount": imalt_kcount,
          "meta_desc_len": meta_desc_len,
          "meta_kcount": meta_kcount,
          "ti_used": ti_used
        })
      # Most specific handlers first: RequestException is the base class of the
      # other three, so listing it first made them unreachable. A failed page
      # is reported and skipped; the scrape continues with the next result.
      except requests.exceptions.HTTPError as errh:
        print ("Http Error:",errh)
      except requests.exceptions.ConnectionError as errc:
        print ("Error Connecting:",errc)
      except requests.exceptions.Timeout as errt:
        print ("Timeout Error:",errt)
      except requests.exceptions.RequestException as err:
        print ("OOps: Something Else",err)

    # Stop once enough results were seen (saves a needless extra Google
    # request) or the page budget is used up.
    if i >= MAX_RESULTS or page_num == PAGE_LIMIT:
      break
    if soup.select_one(".d6cvqb a[id=pnnext]"):
      params["start"] += 10
    else:
      break

dt_tb =  pd.DataFrame.from_dict(data)
dt_tb

In [None]:
# Rebuild the results table from the collected records (one row per dict in `data`).
dt_tb = pd.DataFrame(data)
dt_tb

Unnamed: 0,keyword,rank,title,snippet,link,total_words,h1_num,h1_len,h2_num,h2_len,...,h2_kcount,h3_kcount,p_kcount,a_kcount,footer_kcount,link_kcount,imalt_kcount,meta_desc_len,meta_kcount,ti_used
0,education quotes,1,50 Powerful Education Quotes for Kids - Splash...,,https://www.splashlearn.com/blog/powerful-educ...,1532,1,6.0,2,11.00,...,"{'quotes': 1, 'education': 2}",{'quotes': 1},"{'quotes': 6, 'education': 6}",{'quotes': 11},,"{'quotes': 1, 'education': 1}","{'quotes': 3, 'education': 3}",6,"{'quotes': 1, 'education': 1}",1
1,education quotes,2,300+ Education Quotes On Learning & Students,,https://everydaypower.com/quotes-about-education/,7951,1,7.0,40,6.53,...,"{'quotes': 39, 'education': 40}",,"{'quotes': 78, 'education': 106}","{'quotes': 125, 'education': 4}",{'quotes': 2},"{'quotes': 1, 'education': 1}","{'quotes': 36, 'education': 30}",7,"{'quotes': 2, 'education': 2}",1
2,education quotes,3,Education Quotes - BrainyQuote,,https://www.brainyquote.com/topics/education-q...,8,0,,0,,...,,,,,,"{'quotes': 1, 'education': 1}",,0,,1
3,education quotes,4,125 Inspiring Education Quotes That'll Keep Yo...,,https://parade.com/1034814/marynliles/educatio...,8,0,,0,,...,,,,,,"{'quotes': 1, 'education': 1}",,0,,1
4,education quotes,5,Educational Quotes for Students - University o...,13 Inspiring Educational Quotes for Students ·...,https://www.uopeople.edu/blog/13-inspiring-edu...,1590,1,6.0,2,2.00,...,{'quotes': 1},,"{'quotes': 6, 'education': 16}",{'education': 3},,{'quotes': 1},{'quotes': 2},9,{'quotes': 1},1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
637,special education teachers,6,Can You See Yourself Teaching Special Education?,Required training: A special education teachin...,https://www.alleducationschools.com/teaching-c...,2573,1,12.0,8,4.12,...,"{'special': 4, 'teachers': 1, 'education': 4}","{'special': 3, 'education': 2}","{'special': 71, 'teachers': 24, 'education': 62}","{'special': 4, 'teachers': 4, 'education': 13}",,"{'special': 1, 'education': 1}","{'special': 4, 'education': 3}",11,"{'special': 1, 'education': 2}",1
638,special education teachers,7,Special Education Teacher Career Guide,Special education teachers work to promote and...,https://www.teachercertificationdegrees.com/ca...,1516,1,5.0,8,5.00,...,"{'special': 5, 'education': 5}",,"{'special': 61, 'teachers': 34, 'education': 56}","{'special': 5, 'teachers': 1, 'education': 11}",,"{'special': 1, 'education': 1}",,5,"{'special': 1, 'education': 1}",1
639,special education teachers,8,Special Education Teacher job description sample,Special Education Teacher job duties include: ...,https://resources.workable.com/special-educati...,686,1,5.0,8,5.12,...,"{'special': 3, 'education': 3}","{'special': 2, 'education': 2}","{'special': 6, 'teachers': 2, 'education': 6}","{'special': 2, 'education': 3}",,"{'special': 1, 'education': 1}",,5,"{'special': 1, 'education': 1}",1
640,special education teachers,9,What Does a Special Education Teacher Do?,"May 3, 2020 — Special education teachers are c...",https://www.alliant.edu/blog/what-does-special...,3296,1,7.0,13,2.62,...,"{'special': 3, 'education': 3}",,"{'special': 21, 'teachers': 6, 'education': 24}","{'special': 2, 'teachers': 3, 'education': 13}",{'education': 3},"{'special': 1, 'education': 1}",,0,,1


In [None]:
from pathlib import Path

from google.colab import drive
drive.mount('drive', force_remount=True)

csv_path = Path('/content/drive/My Drive/search_scraped_data.csv')
# Append so rows from earlier scraping sessions are kept. The header row is
# written only when the file is new or empty; with header=False in that case
# the file would start with a data row, which a later read_csv would consume
# as the column names.
# NOTE: re-running this cell appends the same rows again.
needs_header = (not csv_path.exists()) or csv_path.stat().st_size == 0
dt_tb.to_csv(csv_path, encoding='utf-8', mode='a', index= False, header=needs_header)

Mounted at drive


In [None]:
# Show the repeated keywords again (same list as computed from the sheet).
print(duplicated_list)

['masters of education', 'vocational education', 'continuing education units', 'education consulting', 'landmark education', 'department of higher education', 'education websites', 'definition of education', 'education philosophy', 'local education agency', 'special education teachers']


In [None]:
# Look up the rows already saved in the CSV for one of the repeated keywords.
from google.colab import drive
import pandas as pd

drive.mount('drive', force_remount=True)

result_scrape = pd.read_csv('/content/drive/My Drive/search_scraped_data.csv')
# Only the entry at position 10 of duplicated_list is inspected here.
is_target_keyword = result_scrape['keyword'].eq(duplicated_list[10])
find_re = result_scrape[is_target_keyword]
find_re



Mounted at drive


Unnamed: 0,keyword,rank,title,snippet,link,total_words,h1_num,h1_len,h2_num,h2_len,...,h2_kcount,h3_kcount,p_kcount,a_kcount,footer_kcount,link_kcount,imalt_kcount,meta_desc_len,meta_kcount,ti_used
1299,special education teachers,1,Special Education Teachers : Occupational Outl...,,https://www.bls.gov/ooh/education-training-and...,3661.0,1.0,3.0,31.0,3.84,...,"{'special': 2, 'teachers': 1, 'education': 4}","{'special': 4, 'teachers': 3, 'education': 5}","{'special': 63, 'teachers': 89, 'education': 84}","{'special': 15, 'teachers': 20, 'education': 21}",,"{'special': 1, 'teachers': 1, 'education': 2}","{'special': 3, 'teachers': 8, 'education': 5}",0,,1.0
1300,special education teachers,2,What does a special education teacher do?,Special education teachers work with students ...,https://www.careerexplorer.com/careers/special...,1120.0,1.0,7.0,9.0,4.67,...,"{'special': 5, 'education': 5}",,"{'special': 21, 'teachers': 11, 'education': 22}","{'special': 3, 'education': 4}",,"{'special': 1, 'education': 1}","{'special': 2, 'education': 2}",0,,1.0
1301,special education teachers,3,What Is a Special Education Teacher? (Skills a...,"Jan 26, 2023 — A special education teacher hel...",https://in.indeed.com/career-advice/career-dev...,8.0,0.0,,0.0,,...,,,,,,"{'special': 1, 'education': 1}",,0,,1.0
1302,special education teachers,4,National Association of Special Education Teac...,The National Association of Special Education ...,http://www.naset.org/,2479.0,2.0,3.5,10.0,3.7,...,"{'special': 3, 'education': 2}","{'special': 2, 'education': 2}","{'special': 37, 'teachers': 14, 'education': 35}","{'special': 32, 'teachers': 5, 'education': 26}",,,,0,,1.0
1303,special education teachers,5,Teaching Special Education,It is a unique ability of special education te...,https://teach.com/careers/become-a-teacher/wha...,4481.0,1.0,3.0,5.0,4.6,...,"{'special': 3, 'teachers': 1, 'education': 4}","{'special': 1, 'education': 2}","{'special': 36, 'teachers': 7, 'education': 50}","{'special': 9, 'teachers': 7, 'education': 26}","{'teachers': 1, 'education': 1}","{'special': 1, 'education': 1}",,3,"{'special': 1, 'education': 1}",1.0
1304,special education teachers,6,Can You See Yourself Teaching Special Education?,Required training: A special education teachin...,https://www.alleducationschools.com/teaching-c...,2573.0,1.0,12.0,8.0,4.12,...,"{'special': 4, 'teachers': 1, 'education': 4}","{'special': 3, 'education': 2}","{'special': 71, 'teachers': 24, 'education': 62}","{'special': 4, 'teachers': 4, 'education': 13}",,"{'special': 1, 'education': 1}","{'special': 4, 'education': 3}",11,"{'special': 1, 'education': 2}",1.0
1305,special education teachers,7,Special Education Teacher Career Guide,Special education teachers work to promote and...,https://www.teachercertificationdegrees.com/ca...,1516.0,1.0,5.0,8.0,5.0,...,"{'special': 5, 'education': 5}",,"{'special': 61, 'teachers': 34, 'education': 56}","{'special': 5, 'teachers': 1, 'education': 11}",,"{'special': 1, 'education': 1}",,5,"{'special': 1, 'education': 1}",1.0
1306,special education teachers,8,Special Education Teacher job description sample,Special Education Teacher job duties include: ...,https://resources.workable.com/special-educati...,686.0,1.0,5.0,8.0,5.12,...,"{'special': 3, 'education': 3}","{'special': 2, 'education': 2}","{'special': 6, 'teachers': 2, 'education': 6}","{'special': 2, 'education': 3}",,"{'special': 1, 'education': 1}",,5,"{'special': 1, 'education': 1}",1.0
1307,special education teachers,9,What Does a Special Education Teacher Do?,"May 3, 2020 — Special education teachers are c...",https://www.alliant.edu/blog/what-does-special...,3296.0,1.0,7.0,13.0,2.62,...,"{'special': 3, 'education': 3}",,"{'special': 21, 'teachers': 6, 'education': 24}","{'special': 2, 'teachers': 3, 'education': 13}",{'education': 3},"{'special': 1, 'education': 1}",,0,,1.0
1308,special education teachers,10,Special Education Teachers,Special education teachers provide additional ...,https://ncse.ie/special-education-teachers,908.0,1.0,3.0,0.0,,...,,,"{'special': 4, 'teachers': 1, 'education': 3}","{'special': 23, 'teachers': 6, 'education': 11}","{'special': 1, 'education': 1}","{'special': 1, 'teachers': 1, 'education': 1}","{'special': 2, 'education': 2}",0,,1.0


In [None]:
#Delete row
# Disabled maintenance snippet: the code is wrapped in a string literal, so
# running this cell executes none of it. If enabled, it would re-read the
# saved CSV, drop the rows at positions 1319-1328 and overwrite the file.
'''
from google.colab import drive
drive.mount('drive', force_remount=True)

import pandas as pd
result_scrape = pd.read_csv('/content/drive/My Drive/search_scraped_data.csv')
#result_scrape.drop(result_scrape.tail(30).index,inplace=True)
result_scrape_drop = result_scrape.drop(result_scrape.index[1319:1329])
result_scrape_drop.loc[result_scrape['keyword'] == duplicated_list[10]]
result_scrape_drop.to_csv('/content/drive/My Drive/search_scraped_data.csv', encoding='utf-8', index= False)
'''


Mounted at drive


In [None]:
#Delete column
# Disabled maintenance snippet: the code is wrapped in a string literal, so
# running this cell executes none of it. If enabled, it would re-read the
# saved CSV, discard its first three columns and overwrite the file.
'''
from google.colab import drive
drive.mount('drive', force_remount=True)

import pandas as pd
result_scrape = pd.read_csv('/content/drive/My Drive/search_scraped_data.csv')
result_scrape = result_scrape.iloc[: , 3:]
result_scrape.to_csv('/content/drive/My Drive/search_scraped_data.csv', encoding='utf-8', index= False)
'''

Mounted at drive


0
1
2
3
4
...
1421
1422
1423
1424
1425
