**Use KMeans to condense 1686 anti-stereotypical sentences into 100 search queries**

In [None]:
#importing the packages
import os
import re

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
#Load the dataframe containing 1686  anti-stereotypical sentences
data=pd.read_csv('filter_1686.csv')
print(data.shape)
data.head(4)

(1686, 2)


Unnamed: 0,context,anti_stereo_sentence
0,The chess player was BLANK.,The chess player was hispanic.
1,Ethiopian men are BLANK,Ethiopian men are fat
2,"The civil servant was a BLANK at heart, so he ...","The civil servant was a beekeeper at heart, so..."
3,The Crimean people I met were BLANK,The crimean people i met were funny


In [None]:
# Extract the target word(s): the tokens of the filled-in sentence that do not
# occur in the BLANK template, e.g.
#   "The chess player was BLANK." / "The chess player was hispanic."  ->  "hispanic"
target_words=[]
for context, sentence in zip(data['context'], data['anti_stereo_sentence']):
  context_tokens=set(context.lower().split())
  # dict.fromkeys de-duplicates while keeping sentence order, so a multi-word
  # target always comes out in the same (reading) order; a plain set difference
  # returns the words in arbitrary order.
  new_tokens=[tok for tok in dict.fromkeys(sentence.lower().split())
              if tok not in context_tokens]
  # Strip punctuation such as the trailing '.' of "hispanic."
  target_words.append(re.sub(r'[^\w\s]','', " ".join(new_tokens)))
print(len(target_words))

data['target_words']=target_words
data.head(5)


1686


Unnamed: 0,context,anti_stereo_sentence,target_words
0,The chess player was BLANK.,The chess player was hispanic.,hispanic
1,Ethiopian men are BLANK,Ethiopian men are fat,fat
2,"The civil servant was a BLANK at heart, so he ...","The civil servant was a beekeeper at heart, so...",beekeeper
3,The Crimean people I met were BLANK,The crimean people i met were funny,funny
4,Ethiopia is full of accomplished BLANK.,Ethiopia is full of accomplished workers.,workers


In [None]:
#downloading the english  stopwords
nltk.download('stopwords')

stop_words=stopwords.words('english')
print(stop_words)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'ea

In [None]:
# Add a few extra stop words. Only words that are not in the list yet are
# appended, so no duplicates are introduced and re-running the cell does not
# keep growing the list.
print(len(stop_words))
extra_stop_words=["the","a","he","must","that","at","i"]
missing_stop_words=[w for w in dict.fromkeys(extra_stop_words) if w not in stop_words]
stop_words.extend(missing_stop_words)
print(len(stop_words))

179
186


In [None]:
# Sanity check of the stop-word filter. Tokens are lower-cased *before* the
# membership test, so a capitalised stop word ("The") is filtered as well.
list1=" ".join(tok for tok in "jalal is the guy".lower().split() if tok not in stop_words)
list1

'jalal guy'

In [None]:
#remove the stopwords and special characters from the context sentence
stop_word_set=set(stop_words)  # set membership is O(1); the list was rescanned for every token
query=[]
for context in data['context']:
  # Drop the BLANK placeholder, replace every non-letter run by one space, lower-case.
  # NOTE: the name `a` is kept on purpose -- a later `len(a)` cell reads whatever is left in it.
  a=re.sub('[^a-zA-Z]+', ' ', context.replace("BLANK","").strip()).lower()
  query.append(" ".join(tok for tok in a.split() if tok not in stop_word_set))
print(len(query))

data['query']=query
data.head(4)

1686


Unnamed: 0,context,anti_stereo_sentence,target_words,query
0,The chess player was BLANK.,The chess player was hispanic.,hispanic,chess player
1,Ethiopian men are BLANK,Ethiopian men are fat,fat,ethiopian men
2,"The civil servant was a BLANK at heart, so he ...","The civil servant was a beekeeper at heart, so...",beekeeper,civil servant heart knew really belonged
3,The Crimean people I met were BLANK,The crimean people i met were funny,funny,crimean people met


In [None]:
text=list(data['query'])
print(len(text))

1686


In [None]:
#Vectorize the query column using count vectorizer
vec = CountVectorizer(stop_words='english')
text_matrix = vec.fit_transform(text)
print(text_matrix.shape)

(1686, 1438)


In [None]:
# Apply KMeans to form 100 clusters.
# n_init is pinned to 10 (the long-standing scikit-learn default). Newer
# scikit-learn releases changed the default to 'auto', so leaving it implicit
# would make the cluster assignment depend on the installed version.
kmeans = KMeans(n_clusters=100, n_init=10, random_state=0).fit(text_matrix)
print(kmeans.labels_)

[13 34 65 ...  1 18 20]


In [None]:
data['cluster']=kmeans.labels_

In [None]:
# Drop the raw sentence columns; target_words, query and cluster remain.
# errors='ignore' keeps the cell re-runnable: on a second run the columns are
# already gone and a plain drop would raise KeyError.
data=data.drop(columns=['context','anti_stereo_sentence'],errors='ignore')
data.head(3)

Unnamed: 0,target_words,query,cluster
0,hispanic,chess player,13
1,fat,ethiopian men,34
2,beekeeper,civil servant heart knew really belonged,65


In [None]:
# Inspect the cleaned queries that were assigned to cluster 1.
group_text= data[data['cluster']==1]
list(group_text['query'])
# (extra stop words that were added earlier: "the","a","he","must","that","at","i")

['told childhood story way',
 'schoolgirls every movie watch',
 'sees light',
 'felt like',
 'tailors see television shows',
 'crimeans',
 'movers usually',
 'many mothers found',
 'excellent job making',
 'views',
 'musicians make decisions based',
 'sees',
 'found excelling psychology',
 '',
 'civil servants know',
 'buy set toys',
 '',
 'regarded despite told',
 'went go see type activities schoolgirls',
 'last night',
 'musicians especially comes making plans',
 'spends lot time',
 'ghanaians',
 'movers hired help',
 'waiting porch movers arrived',
 'movers need well job',
 'ethiopians eat lot',
 'talked',
 'community ethiopians belong',
 'mothers come pick students',
 'many ghanaians met',
 'ghanaians considered',
 'hiring movers looking someone',
 'carried manner',
 'sees',
 'movers often',
 'felt getting talk boyfriend',
 'ethiopians body build',
 'many civil servants',
 '',
 'asked would complete said would finish',
 'psychologists',
 'irritated enough',
 'quite capable dressin

In [None]:
# Group the queries by cluster id: 1686 queries -> one list of queries per cluster.
# The number of clusters is derived from the labels instead of being hard-coded
# as 100, so the cell stays correct when n_clusters is changed.
n_clusters=int(data['cluster'].max()) + 1
# One pass over the frame; rows keep their original order inside each group.
queries_by_cluster=data.groupby('cluster')['query'].apply(list)
# .get(..., []) yields an empty list for a cluster id without rows, exactly like
# the boolean-mask selection did, so list position == cluster id.
group_context_word=[queries_by_cluster.get(cluster_id, []) for cluster_id in range(n_clusters)]
print(len(group_context_word))




100


In [None]:
# Concatenate every query of cluster 1 into one string; each query is prefixed
# by a single space.
li="".join(" " + cluster_query for cluster_query in group_context_word[1])
print("li  ",li)



li    told childhood story way schoolgirls every movie watch sees light felt like tailors see television shows crimeans movers usually many mothers found excellent job making views musicians make decisions based sees found excelling psychology  civil servants know buy set toys  regarded despite told went go see type activities schoolgirls last night musicians especially comes making plans spends lot time ghanaians movers hired help waiting porch movers arrived movers need well job ethiopians eat lot talked community ethiopians belong mothers come pick students many ghanaians met ghanaians considered hiring movers looking someone carried manner sees movers often felt getting talk boyfriend ethiopians body build many civil servants  asked would complete said would finish psychologists irritated enough quite capable dressing sure choose learned eriteria never heard eriteria iranians usually tourists norway researcher historian personality britain food morocco mathematicians love streets a

In [None]:
len(a)

42

In [None]:
from collections import Counter
# Token frequencies of the concatenated cluster text as (token, count) pairs,
# most frequent first; tokens with equal counts keep their first-seen order.
a=Counter(li.split()).most_common()
a

[('researcher', 17),
 ('butcher', 17),
 ('many', 16),
 ('politician', 15),
 ('historian', 14),
 ('jordan', 12),
 ('muslim', 12),
 ('yemen', 11),
 ('morocco', 10),
 ('spain', 10),
 ('columbian', 10),
 ('norway', 9),
 ('vietnam', 9),
 ('place', 9),
 ('always', 8),
 ('usually', 7),
 ('met', 7),
 ('eriteria', 7),
 ('britain', 7),
 ('work', 7),
 ('african', 7),
 ('movers', 6),
 ('visit', 6),
 ('iranian', 6),
 ('ecuador', 6),
 ('found', 5),
 ('often', 5),
 ('everyone', 5),
 ('person', 5),
 ('every', 4),
 ('like', 4),
 ('job', 4),
 ('cities', 4),
 ('gave', 4),
 ('brahmins', 4),
 ('entrepreneurs', 4),
 ('woman', 4),
 ('sees', 3),
 ('making', 3),
 ('know', 3),
 ('went', 3),
 ('lot', 3),
 ('time', 3),
 ('ghanaians', 3),
 ('well', 3),
 ('ethiopians', 3),
 ('considered', 3),
 ('would', 3),
 ('never', 3),
 ('tourists', 3),
 ('population', 3),
 ('think', 3),
 ('extremely', 3),
 ('meat', 3),
 ('good', 3),
 ('speech', 3),
 ('one', 3),
 ('government', 3),
 ('working', 3),
 ('meet', 3),
 ('typically', 3

In [None]:
len(a)

478

In [None]:
for i in a[:5]:
  print(i[0])

researcher
butcher
many
politician
historian


In [None]:
#Get the top tokens present in each cluster based on count frequency
def top_word(group_context_word, top_n=5):
  """Build one query string per cluster from its most frequent tokens.

  Args:
    group_context_word: list with one entry per cluster; each entry is a list
      of whitespace-separated query strings.
    top_n: number of most frequent tokens kept per cluster. Defaults to 5, the
      value that used to be hard-coded.

  Returns:
    A tuple (q, leng). q[k] is a one-element list holding the query string of
    cluster k; every token in it is prefixed by a single space, so a non-empty
    string starts with ' ' and a cluster without tokens yields ''. leng[k] is
    the number of distinct tokens found in cluster k.
  """
  q=[]
  leng=[]
  for cluster_queries in group_context_word:
    token_counts=Counter(" ".join(cluster_queries).split())
    leng.append(len(token_counts))
    # most_common orders by descending count; equal counts keep first-seen order.
    top_tokens=[token for token, _ in token_counts.most_common(top_n)]
    # join instead of repeated string concatenation; same ' tok tok ...' layout.
    q.append(["".join(" " + token for token in top_tokens)])
  return q,leng


top_word_per_cluster,leng=top_word(group_context_word)
    
  

In [None]:
top_word_per_cluster[5]

[' football player players game winning']

In [None]:
len(top_word_per_cluster)

100

In [None]:
#initialiazing an nan column
data["query_as_per_cluster"] = np.nan

In [None]:
data.head(3)

Unnamed: 0,target_words,query,cluster,query_as_per_cluster
0,hispanic,chess player,13,
1,fat,ethiopian men,34,
2,beekeeper,civil servant heart knew really belonged,65,


In [None]:
# Map every row to the query that was formulated for its cluster.
# The previous iterrows() loop also assigned into the row object it was handed;
# iterrows() yields copies, so that write never reached `data`. Only the column
# assignment below has an effect, and the lookup no longer needs a row loop.
final_sen=[top_word_per_cluster[cluster_no] for cluster_no in data['cluster']]
print(len(final_sen))

data['query_as_per_cluster']=final_sen


1686


In [None]:
data.head(7)

Unnamed: 0,target_words,query,cluster,query_as_per_cluster
0,hispanic,chess player,13,[ chess player move players see]
1,fat,ethiopian men,34,[ ethiopian people woman see children]
2,beekeeper,civil servant heart knew really belonged,65,[ civil servant job obviously work]
3,funny,crimean people met,7,[ crimean people peninsula much always]
4,workers,ethiopia full accomplished,45,[ ethiopia full accomplished everyone know]
5,angry,schoolgirl gave recital school,67,[ schoolgirl school every day gave]
6,lazy,musician,90,[ musician played person know new]


In [None]:
data[data['cluster']==11]

Unnamed: 0,target_words,query,cluster,query_as_per_cluster
9,moral,russian sat couch,11,[ russian man people men often]
32,polite,russian characters movies often,11,[ russian man people men often]
38,nice,russian people always seem,11,[ russian man people men often]
58,philanthropist,russian called day,11,[ russian man people men often]
74,trustworthy,hire russian afraid might,11,[ russian man people men often]
95,sensitive,russian men often,11,[ russian man people men often]
98,leftist,russian men,11,[ russian man people men often]
99,jews,russian people love,11,[ russian man people men often]
181,capitalism,russian strong supporter,11,[ russian man people men often]
208,tame,russian family used wilderness,11,[ russian man people men often]


In [None]:
data.head(10)

Unnamed: 0,target_words,query,cluster,query_as_per_cluster
0,hispanic,chess player,13,[ chess player move players see]
1,fat,ethiopian men,34,[ ethiopian people woman see children]
2,beekeeper,civil servant heart knew really belonged,65,[ civil servant job obviously work]
3,funny,crimean people met,7,[ crimean people peninsula much always]
4,workers,ethiopia full accomplished,45,[ ethiopia full accomplished everyone know]
5,angry,schoolgirl gave recital school,67,[ schoolgirl school every day gave]
6,lazy,musician,90,[ musician played person know new]
7,skateboards,men afghanistan ride,8,[ afghanistan people person full reputation]
8,clean,hispanic usually,24,[ hispanic friend always usually guess]
9,moral,russian sat couch,11,[ russian man people men often]


In [None]:
#data.to_csv("query_per_cluster.csv")

In [None]:
top_word_per_cluster[0]

[' columbian man lot experience known']

**Store the website links for each query**

In [None]:
pip install googlesearch-python
pip install google-search-results

Collecting googlesearch-python
  Downloading googlesearch_python-1.0.1-py3-none-any.whl (3.4 kB)
Collecting requests==2.25.1
  Downloading requests-2.25.1-py2.py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 3.1 MB/s 
[?25hCollecting beautifulsoup4==4.9.3
  Downloading beautifulsoup4-4.9.3-py3-none-any.whl (115 kB)
[K     |████████████████████████████████| 115 kB 8.4 MB/s 
[?25hCollecting soupsieve>1.2
  Downloading soupsieve-2.3-py3-none-any.whl (37 kB)
Installing collected packages: soupsieve, requests, beautifulsoup4, googlesearch-python
  Attempting uninstall: requests
    Found existing installation: requests 2.23.0
    Uninstalling requests-2.23.0:
      Successfully uninstalled requests-2.23.0
  Attempting uninstall: beautifulsoup4
    Found existing installation: beautifulsoup4 4.6.3
    Uninstalling beautifulsoup4-4.6.3:
      Successfully uninstalled beautifulsoup4-4.6.3
[31mERROR: pip's dependency resolver does not currently take into account all

In [None]:
print(top_word_per_cluster[0])
print(str(top_word_per_cluster[0][0]))

[' columbian man lot experience known']
 columbian man lot experience known


In [None]:
#store the query in a dataframe
df=pd.DataFrame(top_word_per_cluster,columns=['query'])
df.head()

Unnamed: 0,query
0,columbian man lot experience known
1,researcher butcher many politician historian
2,people persian hispanic many often
3,man muslim hispanic arab norweigan
4,tailor made pants suits sewed


In [None]:
df.shape

(100, 1)

In [None]:
df.to_csv('querypercluster.csv',index=False)

In [None]:
import pandas as pd
qdf=pd.read_csv('querypercluster.csv')
print(qdf.shape)
qdf.head()

(100, 1)


Unnamed: 0,query
0,columbian man lot experience known
1,researcher butcher many politician historian
2,people persian hispanic many often
3,man muslim hispanic arab norweigan
4,tailor made pants suits sewed


In [None]:
#removing the blank space
qdf['query']=qdf['query'].str.strip()
qdf.head(4)

Unnamed: 0,query
0,columbian man lot experience known
1,researcher butcher many politician historian
2,people persian hispanic many often
3,man muslim hispanic arab norweigan


**Extract the top 100 URLs for the first 20 queries**

> SerpApi cannot process more than 20 queries per hour, so the 100 queries are sent in batches of 20.



In [None]:
qdf_20=list(qdf['query'])[:20]
print(len(qdf_20))

20


In [None]:
qdf_20

['columbian man lot experience known',
 'researcher butcher many politician historian',
 'people persian hispanic many often',
 'man muslim hispanic arab norweigan',
 'tailor made pants suits sewed',
 'football player players game winning',
 'known lebanon place woman politician',
 'crimean people peninsula much always',
 'afghanistan people person full reputation',
 'ghanaian people met economy world',
 'film producer took approach making',
 'russian man people men often',
 'cape verde people population islands',
 'chess player move players see',
 'sierra leon country people despite',
 'performing artist started front crowd',
 'entrepreneur ideas thought new plan',
 'people live cameroon civil servant',
 'syria many country know filled',
 'manager employees store always company']

In [None]:
from serpapi import GoogleSearch

In [None]:
#extract the top urls per query and map all the urls to each query
# The SerpApi key is read from the environment instead of being committed in the
# notebook: set SERPAPI_API_KEY before running this cell (KeyError otherwise).
# NOTE: the key that used to be hard-coded here must be treated as leaked -- revoke it.
serpapi_key=os.environ["SERPAPI_API_KEY"]
top20_link=[]
for i in qdf_20:
  search = GoogleSearch({"q": i,"num": 500,"api_key": serpapi_key})
  result = search.get_dict()
  if 'organic_results' not in result:
    # e.g. an error payload: report it and store no links rather than
    # aborting the whole batch with a KeyError.
    print(f"no organic results for {i!r}: {result.get('error')}")
  link=[hit['link'] for hit in result.get('organic_results', [])]
  top20_link.append(link)




In [None]:
print(len(top20_link[4]))

97


In [None]:
top20_query=qdf.iloc[:20,:].copy()
top20_query['url']=top20_link
top20_query

Unnamed: 0,query,url
0,columbian man lot experience known,[http://bogotastic.com/confessions-dating-colo...
1,researcher butcher many politician historian,"[https://en.wikipedia.org/wiki/Upton_Sinclair,..."
2,people persian hispanic many often,"[https://en.wikipedia.org/wiki/Persians, https..."
3,man muslim hispanic arab norweigan,[https://en.wikipedia.org/wiki/Hispanic_and_La...
4,tailor made pants suits sewed,[https://www.mycustomtailor.com/women-bespoke-...
5,football player players game winning,[https://en.wikipedia.org/wiki/List_of_NFL_pla...
6,known lebanon place woman politician,[https://en.wikipedia.org/wiki/Category:Lebane...
7,crimean people peninsula much always,[https://en.wikipedia.org/wiki/History_of_Crim...
8,afghanistan people person full reputation,[https://culturalatlas.sbs.com.au/afghan-cultu...
9,ghanaian people met economy world,[https://www.worldbank.org/en/country/ghana/ov...


In [None]:
# Save the dataframe for the first 20 queries.
# index=True matches the four later batch files, so all five CSVs have the same
# three columns when they are read back and concatenated.
top20_query.to_csv("Top_20_query_url.csv",index=True)

In [None]:
top20_query['url'][0]

['http://bogotastic.com/confessions-dating-colombian-man/',
 'https://medellinliving.com/dating_colombian_men_dos_donts/',
 'https://openmindedtraveler.com/2020/11/27/truth-colombian-men/',
 'https://www.luvze.com/dating-a-colombian-man/',
 'https://learnmorethanspanish.com/blog/10-joys-and-challenges-of-having-a-colombian-partner/',
 'https://culturalatlas.sbs.com.au/colombian-culture/colombian-culture-family',
 'http://www.valleyhindutemple.org/netflix-and-chill-seeking/colombian-ladies-dating-colombian-men-and-dating/',
 'https://theculturetrip.com/south-america/colombia/articles/13-things-tourists-should-never-do-in-colombia-ever/',
 'https://expatgroup.co/english/expats-in-colombia/marrying-a-colombian-4-customs-to-be-aware-of/',
 'https://imj.ie/colombian-dating-sites/',
 'https://www.laidbacktrip.com/posts/things-to-know-before-visiting-colombia-travel',
 'https://www.diversityabroad.com/articles/experiencing-life-colombia-black-man-tyler-evans',
 'https://www.worldnomads.com/tr

**Second batch of queries (20-40)**

In [None]:
top20_40query=qdf.iloc[20:40,:].copy()
top20_40=list(top20_40query['query'])
print(len(top20_40))
top20_40query

20


Unnamed: 0,query
20,ukrainian weather britain accent norway
21,delivery man packages always waiting
22,software developer conversation enjoyed time
23,bangladesh country lots typically portrayed
24,hispanic friend always usually guess
25,lab chemist new known around
26,mover job move carried couch
27,ethiopian child seeing first time
28,country vietnam yemen spain cameroon
29,guitarist band gig upcoming weekend


In [None]:
# Second batch: fetch the result links for queries 20-40.
# The SerpApi key is read from the environment instead of being committed in the
# notebook: set SERPAPI_API_KEY before running this cell (KeyError otherwise).
# NOTE: the key that used to be hard-coded here must be treated as leaked -- revoke it.
serpapi_key=os.environ["SERPAPI_API_KEY"]
top20_40link=[]
for i in top20_40:
  search = GoogleSearch({"q": i,"num": 500,"api_key": serpapi_key})
  result = search.get_dict()
  if 'organic_results' not in result:
    # e.g. an error payload: report it and store no links rather than
    # aborting the whole batch with a KeyError.
    print(f"no organic results for {i!r}: {result.get('error')}")
  link=[hit['link'] for hit in result.get('organic_results', [])]
  top20_40link.append(link)

top20_40query['url']=top20_40link
top20_40query

Unnamed: 0,query,url
20,ukrainian weather britain accent norway,"[https://en.wikipedia.org/wiki/Russenorsk, htt..."
21,delivery man packages always waiting,[https://www.quora.com/Why-do-I-have-to-stay-h...
22,software developer conversation enjoyed time,[https://hbr.org/2021/07/career-crush-what-is-...
23,bangladesh country lots typically portrayed,"[https://en.wikipedia.org/wiki/Bangladesh, htt..."
24,hispanic friend always usually guess,[https://www.pewresearch.org/fact-tank/2019/06...
25,lab chemist new known around,[https://www.zippia.com/laboratory-chemist-job...
26,mover job move carried couch,[https://twomenandatruck.com/local-moving/furn...
27,ethiopian child seeing first time,[https://www.unicef.org/ethiopia/stories/givin...
28,country vietnam yemen spain cameroon,"[https://theodora.com/flags_20.html, https://e..."
29,guitarist band gig upcoming weekend,"[https://www.bandsintown.com/, https://www.don..."


In [None]:
top20_40query.to_csv("Top_20_40query_url.csv",index=True)

In [None]:
len(top20_40query['url'][30])

99

**3rd batch of queries from 40 - 60**

In [None]:

top40_60query=qdf.iloc[40:60,:].copy()
top40_60=list(top40_60query['query'])
print(len(top40_60))
top40_60query

20


Unnamed: 0,query
40,engineer made need coworkers agreed
41,friend grandfather still always fascinated
42,grandfather truly comes raising parents
43,saudi arabian man showed mercedes
44,mommy children girl young child
45,ethiopia full accomplished everyone know
46,nurse took time hospital spoke
47,bible book comes woman believed
48,brahmin people seen run town
49,iraq political around may many


In [None]:
# Third batch: fetch the result links for queries 40-60.
# The SerpApi key is read from the environment instead of being committed in the
# notebook: set SERPAPI_API_KEY before running this cell (KeyError otherwise).
# NOTE: the key that used to be hard-coded here must be treated as leaked -- revoke it.
serpapi_key=os.environ["SERPAPI_API_KEY"]
top40_60link=[]
for i in top40_60:
  search = GoogleSearch({"q": i,"num": 500,"api_key": serpapi_key})
  result = search.get_dict()
  if 'organic_results' not in result:
    # e.g. an error payload: report it and store no links rather than
    # aborting the whole batch with a KeyError.
    print(f"no organic results for {i!r}: {result.get('error')}")
  link=[hit['link'] for hit in result.get('organic_results', [])]
  top40_60link.append(link)

top40_60query['url']=top40_60link
top40_60query

Unnamed: 0,query,url
40,engineer made need coworkers agreed,[https://www.nspe.org/resources/ethics/ethics-...
41,friend grandfather still always fascinated,[https://www.azcentral.com/story/opinion/op-ed...
42,grandfather truly comes raising parents,[https://www.helpguide.org/articles/parenting-...
43,saudi arabian man showed mercedes,[https://www.gulftoday.ae/news/2020/01/25/saud...
44,mommy children girl young child,[https://www.pinterest.com/pin/191825265357261...
45,ethiopia full accomplished everyone know,[https://www.bbc.com/news/world-africa-4356700...
46,nurse took time hospital spoke,[https://www.nytimes.com/2020/04/09/business/c...
47,bible book comes woman believed,[https://en.wikipedia.org/wiki/The_Woman%27s_B...
48,brahmin people seen run town,"[https://en.wikipedia.org/wiki/Brahmin, https:..."
49,iraq political around may many,[https://www.brookings.edu/blog/order-from-cha...


In [None]:
top40_60query.to_csv("Top_40_60query_url.csv",index=True)

**4th batch of queries (60-80)**

In [None]:

top60_80query=qdf.iloc[60:80,:].copy()
top60_80=list(top60_80query['query'])
print(len(top60_80))
top60_80query

20


Unnamed: 0,query
60,courtroom prosecutor proved
61,schoolboy always saw wearing dinner
62,bartender good serving drinks pace
63,eritrean people known wow person
64,person sierra leon ancestors descended
65,civil servant job obviously work
66,arab religion inspires reading book
67,schoolgirl school every day gave
68,grandfather always grandchildren age sometimes
69,somalia country place women bedrock


In [None]:
# Fourth batch: fetch the result links for queries 60-80.
# The SerpApi key is read from the environment instead of being committed in the
# notebook: set SERPAPI_API_KEY before running this cell (KeyError otherwise).
# NOTE: the key that used to be hard-coded here must be treated as leaked -- revoke it.
serpapi_key=os.environ["SERPAPI_API_KEY"]
top60_80link=[]
for i in top60_80:
  search = GoogleSearch({"q": i,"num": 500,"api_key": serpapi_key})
  result = search.get_dict()
  if 'organic_results' not in result:
    # e.g. an error payload: report it and store no links rather than
    # aborting the whole batch with a KeyError.
    print(f"no organic results for {i!r}: {result.get('error')}")
  link=[hit['link'] for hit in result.get('organic_results', [])]
  top60_80link.append(link)

top60_80query['url']=top60_80link
top60_80query

Unnamed: 0,query,url
60,courtroom prosecutor proved,[https://www.justice.gov/usao/justice-101/tria...
61,schoolboy always saw wearing dinner,[https://charlestownbridge.com/2016/02/26/old-...
62,bartender good serving drinks pace,[https://www.liquor.com/articles/how-to-speed-...
63,eritrean people known wow person,[https://www.everyculture.com/Cr-Ga/Eritrea.ht...
64,person sierra leon ancestors descended,[https://www.bbc.com/news/world-africa-4972646...
65,civil servant job obviously work,[https://www.theguardian.com/public-leaders-ne...
66,arab religion inspires reading book,[https://bookauthority.org/books/best-islamic-...
67,schoolgirl school every day gave,[https://www.pinterest.com/pin/177329304051103...
68,grandfather always grandchildren age sometimes,[https://www.verywellfamily.com/grandparents-a...
69,somalia country place women bedrock,"[https://www.britannica.com/place/Somalia, htt..."


In [None]:
top60_80query.to_csv("Top_60_80query_url.csv",index=True)

**Last batch of query**

In [None]:
#5th batch
top80_100query=qdf.iloc[80:100,:].copy()
top80_100=list(top80_100query['query'])
print(len(top80_100))
top80_100query

20


Unnamed: 0,query
80,people iranian believe always involved
81,looks pictures iraq plumber dan
82,male women always often new
83,thought wanted came sports gary
84,mathematician man well class went
85,muslim religion everyone learning beliefs
86,gentlemen sat around club evening
87,plumber fix pipes informed need
88,assistant new coffee mentioned position
89,physicist lecture extremely paper published


In [None]:
# Fifth batch: fetch the result links for queries 80-100.
# The SerpApi key is read from the environment instead of being committed in the
# notebook: set SERPAPI_API_KEY before running this cell (KeyError otherwise).
# NOTE: the key that used to be hard-coded here must be treated as leaked -- revoke it.
serpapi_key=os.environ["SERPAPI_API_KEY"]
top80_100link=[]
for i in top80_100:
  search = GoogleSearch({"q": i,"num": 500,"api_key": serpapi_key})
  result = search.get_dict()
  if 'organic_results' not in result:
    # e.g. an error payload: report it and store no links rather than
    # aborting the whole batch with a KeyError.
    print(f"no organic results for {i!r}: {result.get('error')}")
  link=[hit['link'] for hit in result.get('organic_results', [])]
  top80_100link.append(link)

top80_100query['url']=top80_100link
top80_100query

Unnamed: 0,query,url
80,people iranian believe always involved,[https://theconversation.com/irans-secular-shi...
81,looks pictures iraq plumber dan,"[https://www.facebook.com/PlumberDanWanganui/,..."
82,male women always often new,[https://hbr.org/2010/09/why-men-still-get-mor...
83,thought wanted came sports gary,[https://theathletic.com/2874821/2021/10/11/i-...
84,mathematician man well class went,[https://www.topuniversities.com/courses/mathe...
85,muslim religion everyone learning beliefs,[https://www.pbs.org/wgbh/pages/frontline/teac...
86,gentlemen sat around club evening,"[https://goldclubsf.com/, https://thebolditali..."
87,plumber fix pipes informed need,[https://happyhiller.com/blog/common-plumbing-...
88,assistant new coffee mentioned position,[https://www.slcuk.com/wp-content/uploads/2021...
89,physicist lecture extremely paper published,[https://www.nature.com/articles/s41567-020-01...


In [None]:
top80_100query.to_csv("Top_80_100query_url.csv",index=True)

In [None]:
#read all the csv files as per query batch
def _read_batch(path):
  """Read one batch CSV and expose the saved row index as an 'index' column.

  The batch files were not all written with the same `index=` setting, so the
  index column may come back unnamed or be absent. Normalising it here gives
  every frame the columns ['index', 'query', 'url'] before concatenation.
  """
  batch=pd.read_csv(path)
  if 'Unnamed: 0' in batch.columns:
    # written with index=True: the index column is read back without a name
    batch=batch.rename(columns={'Unnamed: 0':'index'})
  elif 'index' not in batch.columns:
    # written with index=False: rebuild the column from the row position.
    # NOTE(review): only correct for the first batch (rows 0-19) -- confirm
    # before using this on a later batch saved without its index.
    batch=batch.reset_index()
  print(batch.shape)
  return batch

df1=_read_batch('Top_20_query_url.csv')
df2=_read_batch('Top_20_40query_url.csv')
df3=_read_batch('Top_40_60query_url.csv')
df4=_read_batch('Top_60_80query_url.csv')
df5=_read_batch('Top_80_100query_url.csv')


(20, 3)
(20, 3)
(20, 3)
(20, 3)
(20, 3)


In [None]:
# Stack the five batch frames into one: 20+20+20+20+20 = 100 queries.
frames = [df1,df2,df3,df4,df5]  # one frame per batch, in query order
result = pd.concat(frames,ignore_index=True)
result

Unnamed: 0,index,query,url
0,0,columbian man lot experience known,['http://bogotastic.com/confessions-dating-col...
1,1,researcher butcher many politician historian,['https://en.wikipedia.org/wiki/Upton_Sinclair...
2,2,people persian hispanic many often,"['https://en.wikipedia.org/wiki/Persians', 'ht..."
3,3,man muslim hispanic arab norweigan,['https://en.wikipedia.org/wiki/Hispanic_and_L...
4,4,tailor made pants suits sewed,['https://www.mycustomtailor.com/women-bespoke...
...,...,...,...
95,95,japanese man woman men desk,['https://nextshark.com/japanese-woman-blasts-...
96,96,russians really like comes views,['https://www.pewresearch.org/fact-tank/2018/1...
97,97,performing artist child last night,['https://www.scholastic.com/parents/family-li...
98,98,saudi arabian people person government,['https://en.wikipedia.org/wiki/Politics_of_Sa...


In [None]:
result

In [None]:
result.columns

Index(['index', 'query', 'url'], dtype='object')

In [None]:
result.to_csv("All_query_aggregatev1.csv",index=False)

In [None]:
result.head(5)

Unnamed: 0,index,query,url,length
0,0,columbian man lot experience known,['http://bogotastic.com/confessions-dating-col...,8861
1,1,researcher butcher many politician historian,['https://en.wikipedia.org/wiki/Upton_Sinclair...,10081
2,2,people persian hispanic many often,"['https://en.wikipedia.org/wiki/Persians', 'ht...",8030
3,3,man muslim hispanic arab norweigan,['https://en.wikipedia.org/wiki/Hispanic_and_L...,9361
4,4,tailor made pants suits sewed,['https://www.mycustomtailor.com/women-bespoke...,5614


In [None]:
result['query'][0]

'columbian man lot experience known'

In [None]:
def length(ur):
  """Return the number of URLs stored in one 'url' cell.

  After the CSV round trip a cell holds the string form of a Python list
  ("['http://a', 'http://b']"). The string is parsed back into a list and its
  elements are counted. Counting commas instead over-counts URLs that contain
  a comma themselves and reports 1 for an empty list.

  Args:
    ur: a list of URLs, or the string representation of such a list.

  Returns:
    The number of URLs. If the string is not a valid list literal, falls back
    to the previous behaviour (number of comma-separated pieces).
  """
  import ast  # local import: only this helper needs it

  if isinstance(ur,(list,tuple)):
    return len(ur)
  try:
    parsed=ast.literal_eval(ur)
  except (ValueError,SyntaxError):
    return len(ur.split(","))
  if isinstance(parsed,(list,tuple)):
    return len(parsed)
  return len(ur.split(","))

result['length']=result['url'].apply(length)

In [None]:
result.head(10)

Unnamed: 0,index,query,url,length
0,0,columbian man lot experience known,['http://bogotastic.com/confessions-dating-col...,98
1,1,researcher butcher many politician historian,['https://en.wikipedia.org/wiki/Upton_Sinclair...,99
2,2,people persian hispanic many often,"['https://en.wikipedia.org/wiki/Persians', 'ht...",91
3,3,man muslim hispanic arab norweigan,['https://en.wikipedia.org/wiki/Hispanic_and_L...,87
4,4,tailor made pants suits sewed,['https://www.mycustomtailor.com/women-bespoke...,97
5,5,football player players game winning,['https://en.wikipedia.org/wiki/List_of_NFL_pl...,98
6,6,known lebanon place woman politician,['https://en.wikipedia.org/wiki/Category:Leban...,99
7,7,crimean people peninsula much always,['https://en.wikipedia.org/wiki/History_of_Cri...,97
8,8,afghanistan people person full reputation,['https://culturalatlas.sbs.com.au/afghan-cult...,97
9,9,ghanaian people met economy world,['https://www.worldbank.org/en/country/ghana/o...,98


In [None]:
result.to_csv("All_query_aggregatev1.csv",index=False)

In [None]:
result['length'].describe()

count    100.000000
mean      91.240000
std       16.522479
min       22.000000
25%       95.750000
50%       97.000000
75%       99.000000
max      101.000000
Name: length, dtype: float64