# Part 1 - Web Scraping

### Importing libraries

In [5]:
import urllib.request
import html5lib
import requests
import regex as re
import bs4
import random
from bs4 import BeautifulSoup
import pandas as pd
import time

### Reading from CSV file

In [10]:
# Load the list of article URLs to scrape. The preview rendered below shows an
# 'Unnamed: 0' column next to 'url' - presumably a saved index column (TODO confirm).
data = pd.read_csv('url_technology(1).csv')

### The CSV file containing the URLs to scrape looks like this

In [11]:
# Display the loaded frame (the rendered table elides the middle rows with '...').
data

Unnamed: 0,url
0,https://medium.com/javascript-scene/top-javasc...
1,https://medium.com/job-advice-for-software-eng...
2,https://itnext.io/load-testing-using-apache-jm...
3,https://medium.com/s/story/black-mirror-bander...
4,https://medium.com/fast-company/the-worst-desi...
...,...
58093,https://medium.com/@christopherthomson/i-would...
58094,https://medium.com/@sportulaproducts1/sportula...
58095,https://medium.com/@thesupergamercorpus/as-som...
58096,https://medium.com/swlh/do-writing-aids-such-a...


### Defining the DataFrame and implementing the scraping

In [9]:
# Scrape a random sample of the article URLs listed in `data`, collect the
# extracted fields in `df`, and write the result to OUTPUT_CSV.

# --- Configuration -----------------------------------------------------------
N_ARTICLES = 550                 # number of distinct URLs to sample from the CSV
REQUEST_TIMEOUT = 20             # seconds before a request is abandoned
SUCCESS_DELAY = 1                # pause after every request that completed
FAILURE_DELAY = 5                # longer back-off after a failed request/parse
OUTPUT_CSV = "data_scraped.csv"
MISSING = "None"                 # placeholder stored when a field is not found
AUTHOR_PREFIX = "Written by "    # 11-character label in front of the author name

# Creating dataframe and defining column names
COLUMNS = ['Index', 'URL', 'Title', 'Sub Title', 'Article', 'Author', 'Author URL','Image URL', 'Reading Time', 'Clap Count']
df = pd.DataFrame(columns=COLUMNS)

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.42'}

urls_scraped = []

# Patterns are compiled once instead of on every loop iteration. Raw strings
# keep the backslashes literal without relying on invalid escape sequences.
IMAGE_URL_RE = re.compile(r'(https:\/\/miro\.medium\.com\/v2\/resize:fit:)[\d]+(\/)(format:webp\/|)(\w|\d|-|\*)+(\.jpeg|\.png|\.gif)')
READING_TIME_RE = re.compile(r'(\d+) min read')
CLAP_COUNT_RE = re.compile(r'clapCount":\s*(\d+)')


def scrape_article(url):
    """Download one article page and extract the fields stored in `df`.

    Parameters
    ----------
    url : str
        Address of the article to download.

    Returns
    -------
    dict or None
        Mapping with the keys 'Title', 'Sub Title', 'Article', 'Author',
        'Author URL', 'Image URL', 'Reading Time' and 'Clap Count'.
        Optional fields that cannot be found hold the MISSING placeholder.
        None is returned when the page has no title or no article text.

    Raises
    ------
    Exception
        Anything raised by `requests.get` (timeouts, connection errors, ...)
        or by the HTML parser is propagated to the caller.
    """
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    soup = BeautifulSoup(response.text, "html5lib")

    # Title: first child of the first <h1>; fall back to the <title> text in
    # front of the first "|" when the <h1> is missing, empty or unusable.
    try:
        title = soup.find('h1').contents[0].text
    except (AttributeError, IndexError):
        title_tag = soup.find('title')
        # split() keeps the whole text when there is no "|"; slicing with
        # find() would silently drop the last character in that case.
        title = title_tag.text.split("|", 1)[0] if title_tag is not None else ""
    if not title:
        return None

    # Subtitle: first child of the first <h2>, unless that <h2> is the
    # "Written by ..." author line.
    try:
        first_h2_text = soup.find('h2').contents[0].text
        sub_title = MISSING if "Written by" in first_h2_text else first_h2_text
    except (AttributeError, IndexError):
        sub_title = MISSING

    # Author name and profile URL come from the same <h2>, so look it up once.
    author_name = MISSING
    author_url = MISSING
    author_tag = soup.find("h2", {"class": "pw-author-name"})
    if author_tag is not None:
        author_name = author_tag.text
        # Only strip the label when it is really there, so a name without the
        # prefix is not truncated by a blind [11:] slice.
        if author_name.startswith(AUTHOR_PREFIX):
            author_name = author_name[len(AUTHOR_PREFIX):]
        try:
            href = author_tag.parent['href']
        except (KeyError, TypeError, AttributeError):
            # Parent missing or without an href: keep the placeholder rather
            # than discarding the whole article.
            href = None
        if href:
            # Drop the query string, if any.
            author_url = "https://medium.com" + href.split("?", 1)[0]

    # Reading time and clap count are read from the raw page source.
    # errors="replace" keeps a stray invalid byte from aborting the article.
    page_source = response.content.decode("utf-8", errors="replace")
    reading_match = READING_TIME_RE.search(page_source)
    reading_time = reading_match.group(1) if reading_match else MISSING
    clap_match = CLAP_COUNT_RE.search(page_source)
    clap_count = clap_match.group(1) if clap_match else MISSING

    # Images: "<number of <picture> tags>, <url>, <url>, ..." with at most one
    # matching URL per <picture>.
    picture_tags = soup.find_all('picture')
    image_parts = [str(len(picture_tags))]
    for picture in picture_tags:
        image_match = IMAGE_URL_RE.search(str(picture))
        if image_match:
            image_parts.append(image_match.group(0))
    img_srcs = ", ".join(image_parts)

    # Article body: text of every <p> inside the content containers.
    # NOTE(review): the class list below looks auto-generated and may change
    # whenever the site is redeployed - confirm it still matches.
    text = "".join(
        paragraph.text
        for container in soup.find_all("div", {"class": "ch bg dx dy dz ea"})
        for paragraph in container.find_all("p")
    )
    if not text:
        return None

    return {
        'Title': title,
        'Sub Title': sub_title,
        'Article': text,
        'Author': author_name,
        'Author URL': author_url,
        'Image URL': img_srcs,
        'Reading Time': reading_time,
        'Clap Count': clap_count,
    }


# Draw distinct row positions that are guaranteed to exist in `data`; a
# hard-coded upper bound can run past the end of the frame and repeat URLs.
sample_size = min(N_ARTICLES, len(data))
sampled_indices = random.sample(range(len(data)), sample_size)

for i, url_index in enumerate(sampled_indices):
    url_to_scrap = data.iloc[url_index]['url']

    try:
        article = scrape_article(url_to_scrap)
    except Exception as exc:
        # Best effort: report the failure, checkpoint what we have and back
        # off before the next URL. Catching Exception (not a bare except)
        # still lets Ctrl-C / kernel interrupt stop the loop.
        print(f"Skipping {url_to_scrap}: {type(exc).__name__}: {exc}")
        df.to_csv(OUTPUT_CSV, encoding='utf-8')
        time.sleep(FAILURE_DELAY)
        continue

    if article is not None:
        # Appending and saving the scraped data in dataframe
        row = {'Index': str(url_index), 'URL': url_to_scrap, **article}
        df.loc[i] = [row[column] for column in COLUMNS]
        urls_scraped.append(url_to_scrap)
        print(i)

    # Wait after every completed request, including pages that were skipped
    # for having no title/text, so requests are never fired back to back.
    time.sleep(SUCCESS_DELAY)

# Creating a csv file of our scraped data, from our dataframe.
df.to_csv(OUTPUT_CSV, encoding='utf-8')
print("Scraped Successfully")

0
1
2
4
5
8
10
11
12
13
14
15
16
17
19
20
21
22
24
25
26
27
28
30
31
32
33
35
36
37
38
39
40
42
43
44
45
47
48
49
50
51
52
53
56
57
58
59
60
61
62
65
66
67
68
69
71
72
74
75
76
77
78
79
80
82
83
84
85
86
87
88
89
90
91
92
94
95
96
97
98
99
100
101
104
106
107
109
110
111
112
113
115
118
119
120
121
122
124
125
126
127
129
131
132
133
134
135
136
137
138
141
142
144
145
146
147
148
149
151
153
154
155
156
157
158
160
162
163
164
165
166
167
168
169
171
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
193
195
196
197
198
200
202
203
206
207
208
209
210
211
212
213
214
216
217
218
219
220
221
222
223
224
225
226
227
228
229
231
233
236
237
238
239
240
241
242
243
244
246
248
249
252
253
254
256
258
260
261
263
264
265
266
267
268
269
270
272
273
274
275
276
277
278
279
281
282
284
285
288
289
290
291
292
293
297
298
299
300
302
303
304
305
306
308
309
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
329
330
331
332
333
334
335
336
337
338
339


### Overview of our scraped data

In [12]:
# Reload the results from 'data_scraped.csv', the file the scraping cell writes with df.to_csv.
# The preview below shows 'Unnamed: 0.1' and 'Unnamed: 0' columns - presumably saved
# index columns; consider index_col=0 if they are not needed (TODO confirm).
scraped_data= pd.read_csv('data_scraped.csv')

In [13]:
# Display the reloaded frame (the rendered table elides the middle rows with '...').
scraped_data

Unnamed: 0.1,Unnamed: 0,Index,URL,Title,Sub Title,Article,Author,Author URL,Image URL,Reading Time,Clap Count
0,0,12925,https://medium.com/@dalwrobinson/where-the-str...,Where the streets have new names,,The Hall PressFollowHall Associated Publicatio...,The Hall Press,https://medium.com/@dalwrobinson,2,8,68
1,1,42280,https://medium.com/microsoft-cybersecurity/wei...,Weighing Past and Future Success,Nothing worthwhile is easy. What are you willi...,Lucas DowdFollowMicrosoft Cybersecurity--Liste...,Lucas Dowd,https://medium.com/@lucas.dowd,"1, https://miro.medium.com/v2/resize:fit:640/f...",2,1
2,2,1753,https://medium.com/@rohela99/microservice-canv...,Microservice Canvas,,RohelaFollow--ListenShareEvery business analys...,Rohela,https://medium.com/,"3, https://miro.medium.com/v2/resize:fit:640/f...",6,0
3,4,6769,https://medium.com/@chevallier/the-road-to-cto...,The Road to CTO (#3),This ghost won’t escape me much longer 👻,Jérémy ChevallierFollow--ListenShareIt’s encou...,Jérémy Chevallier,https://medium.com/@chevallier,"2, https://miro.medium.com/v2/resize:fit:640/0...",2,10
4,5,28240,https://medium.com/@roblevintennis/this-was-fu...,This was fun! I challenged myself to do them i...,,Ady NgomRob LevinFollow--2ListenShareThis was ...,Rob Levin,https://medium.com/,0,1,11
...,...,...,...,...,...,...,...,...,...,...,...
429,543,31523,https://medium.com/active-theory/netflix-our-p...,Netflix: Our Planet,,Member-only storyActive TheoryFollowActive The...,Active Theory,https://medium.com/@activetheory,0,7,948
430,544,48943,https://medium.com/@livewellmumbai/android-10-...,Android 10 update arrives on the Nokia 8.1,,Sanket Ramesh PrasadeFollow--ListenShareAndroi...,Sanket Ramesh Prasade,https://medium.com/@livewellmumbai,1,2,0
431,545,12711,https://medium.com/@avonleafisher/go-live-in-a...,“Go Live in a Cave if You’re Really Anti-Capit...,,Avonlea FisherFollow--ListenShareWhether or no...,Avonlea Fisher,https://medium.com/,"2, https://miro.medium.com/v2/resize:fit:640/f...",4,231
432,547,43619,https://medium.com/messaricrypto/derivatives-i...,Derivatives in Crypto,"Futures and options and swaps, oh my!",Jack PurdyFollowMessari Crypto--ListenShareThi...,Jack Purdy,https://medium.com/@jackpurdy,"1, https://miro.medium.com/v2/resize:fit:640/f...",3,149


### Some of the data we scraped has empty values, some URLs were broken, and some articles were paid and could only be accessed with a membership account on medium.com.

# Part 2 - Creating an API

### API in Flask

In [1]:
from flask import Flask,request, jsonify
import pandas as pd

# Flask application object that the route decorators below register handlers on.
app= Flask(__name__)

# Articles served by the API, loaded once at start-up. The /search handler filters on
# the 'Title' column.
# NOTE(review): Part 1 writes 'data_scraped.csv'; confirm 'sample.csv' is the intended input.
df = pd.read_csv('sample.csv')

@app.route('/',methods=['GET'])
def index():
    """Handle GET / by returning the fixed text 'Hello World'."""
    return 'Hello World'

@app.route('/search', methods=['GET'])
def search_titles():
    """Return the articles whose title contains the ``query`` URL parameter.

    The match is a case-insensitive, literal substring search on the 'Title'
    column of ``df``.

    Returns
    -------
    JSON ``{'results': [...]}`` with one record per matching row (missing
    cells are returned as ``null``), or ``{'error': 'No query provided'}``
    with HTTP status 400 when ``query`` is absent or empty.
    """
    query = request.args.get('query')

    if not query:
        # Report a client error instead of answering 200 with an error body.
        return jsonify({'error': 'No query provided'}), 400

    # regex=False: the query is untrusted user input, so match it literally
    # instead of compiling it as a pattern (an input such as "(" would raise,
    # and crafted patterns can be very slow).
    # na=False: rows without a title simply do not match; leaving NaN in the
    # mask makes the boolean indexing below fail.
    title_matches = df['Title'].str.contains(query, case=False, regex=False, na=False)
    results = df[title_matches]

    # NaN is not valid JSON, so turn missing cells into None (-> null).
    results_list = [
        {column: (None if pd.isna(value) else value) for column, value in record.items()}
        for record in results.to_dict(orient='records')
    ]

    return jsonify({'results': results_list})

if __name__ == "__main__":
    # debug=True also switches on the auto-reloader, which restarts the
    # process and therefore cannot work inside a notebook kernel; keep the
    # debugger but disable the reloader.
    # Debug mode exposes an interactive debugger - use it for local
    # development only.
    app.run(debug=True, use_reloader=False)

ModuleNotFoundError: No module named 'flask'

### Thank You!!!