# **Automated URL validator - Checking whether a website URL exists or not**



**Goal:** Given a list of URLs, we want to validate whether the website links exist or not

## Import Libraries

In [3]:
import pandas as pd
import requests

## Load CrowdTangle dataset 

In [4]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Make the Drive folder the working directory so relative file names resolve against it.
# PATH_TO_THE_DIRECTORY is a placeholder: replace it with the real folder path.
import os
os.chdir("/content/drive/MyDrive/PATH_TO_THE_DIRECTORY")

In [26]:
# Read the semicolon-separated export into a DataFrame (FILE_NAME.csv is a placeholder).
# on_bad_lines='warn' replaces error_bad_lines=False, which was deprecated in pandas 1.3
# and removed in pandas 2.0. The behaviour is the same: malformed rows are skipped and
# reported instead of aborting the load. Requires pandas >= 1.3.
csv_data = pd.read_csv(
    "FILE_NAME.csv",
    sep=';',
    lineterminator='\n',
    low_memory=False,
    on_bad_lines='warn',
)



  exec(code_obj, self.user_global_ns, self.user_ns)


In [27]:
# Dimensions of the loaded table as (rows, columns).
csv_data.shape

(315169, 40)

In [8]:
# List the column names; 'expandedLinks.original' is the column used for the links below.
csv_data.columns

Index(['account.name', 'account.handle', 'platformId', 'Page Category',
       'Page Admin Top Country', 'Page Description', 'Page Created',
       'subscriberCount', 'Followers at Posting', 'date', 'Post Created Date',
       'Post Created Time', 'type', 'totalInteraction',
       'statistics.actual.likeCount', 'statistics.actual.commentCount',
       'statistics.actual.shareCount', 'statistics.actual.loveCount',
       'statistics.actual.wowCount', 'statistics.actual.hahaCount',
       'statistics.actual.sadCount', 'statistics.actual.angryCount',
       'statistics.actual.careCount', 'Video Share Status', 'Is Video Owner?',
       'statistics.actual.videoPostViewCount',
       'statistics.actual.videoTotalViewCount',
       'statistics.actual.videoAllCrosspostsViewCount', 'Video Length',
       'postUrl', 'message', 'expandedLinks.original',
       'expandedLinks.expanded', 'imageText', 'title', 'description',
       'brandedContentSponsor.platformId', 'brandedContentSponsor.name',
 

## Pre-processing of the dataset

Let us identify the most frequently shared links/URLs in the dataset.

In [28]:
# How often each distinct link occurs, most frequent first
shared_links = csv_data['expandedLinks.original']
shared_links.value_counts()

https://www.facebook.com/hanumansinghsirana/videos/1951535514949751/                              28
http://www.akhandbharatimes.com/                                                                  20
https://janganapp.page.link/X42f                                                                  19
https://www.facebook.com/pushpendrakuldelhi001/videos/332266691549294/                            15
https://sachkhabar.co.in/now-biden-wants-modis-help-immediately-only-india-can-save-the-world/    13
                                                                                                  ..
https://www.facebook.com/pradeepBhajpa/videos/434140024317184/                                     1
https://www.facebook.com/photo.php?fbid=1844342252408908&set=p.1844342252408908&type=3             1
https://www.facebook.com/photo.php?fbid=2086761884799509&set=gm.2009189855886754&type=3            1
https://www.facebook.com/203867673485517/photos/a.203869050152046/952309761974634/?type=3  

Select a sub-set of the top URLs

In [34]:
# Keep the N most frequently occurring links and print them, one per line.
import numpy as np  # imported here because a later cell also uses np

N = 100  # choose any number for N

link_counts = csv_data['expandedLinks.original'].dropna().value_counts()
URL_list = link_counts[:N].index.tolist()

for url in URL_list:
    print(url)

https://www.facebook.com/hanumansinghsirana/videos/1951535514949751/
http://www.akhandbharatimes.com/
https://janganapp.page.link/X42f
https://www.facebook.com/pushpendrakuldelhi001/videos/332266691549294/
https://sachkhabar.co.in/now-biden-wants-modis-help-immediately-only-india-can-save-the-world/
https://www.facebook.com/251541358337843
https://sachkhabar.co.in/modi-governments-big-blow-to-zakir-naik/
https://www.facebook.com/462116500605383
https://khabarbharattak.com/rohingya-go-back-to-myanmar/
https://khabarbharattak.com/due-to-this-big-belief-mukesh-ambani-got-200-years-old-olive-trees-for-his-house-know-why/
https://khabarbharattak.com/these-bollywood-stars-whose-marriage-was-broken-in-a-few-years-some-for-9-months-and-some-for-2-years-see-the-full-list/
https://khabarbharattak.com/karan-johar-asked-kajol-what-would-you-do-if-your-daughter-ran-away-with-shahrukh-khans-son-kajol-gave-this-funny-answer/
https://appearnews.com/ambesy/
https://khabarbharattak.com/bobby-deols-son-g

## Use the response from requests.get to validate the website URLs

Let's now define a simple function that sends a GET request to a URL and checks the status code of the response

In [17]:
def link_validator(URL):
    """Send a GET request to URL and print whether it answered with HTTP 200.

    The URL is converted with ``str`` before the request. A status code of
    200 prints ``<URL> ---> VALID``; any other status code prints
    ``<URL> ---> INVALID``. Nothing is returned.

    This is the deliberately simple version: there is no error handling, so
    any exception raised by ``requests.get`` propagates to the caller.
    """
    status = requests.get(str(URL)).status_code
    verdict = '---> VALID' if status == 200 else '---> INVALID'
    print(URL, verdict)

Let's try the function on a few example links (manually) and see whether it works or not!

In [25]:
# Manual spot check of the simple validator on three of the top links.
for sample_url in (
    "https://www.facebook.com/hanumansinghsirana/videos/1951535514949751/",
    "https://khabarbharattak.com/rohingya-go-back-to-myanmar/",
    "https://sachkhabar.co.in/modi-governments-big-blow-to-zakir-naik/",
):
    link_validator(sample_url)

https://www.facebook.com/hanumansinghsirana/videos/1951535514949751/ ---> VALID
https://khabarbharattak.com/rohingya-go-back-to-myanmar/ ---> INVALID
https://sachkhabar.co.in/modi-governments-big-blow-to-zakir-naik/ ---> INVALID


Let's modify the *simple* GET-request function above so that connection errors are caught and reported as INVALID instead of stopping the run

In [35]:
def url_exists_stream(URL: str, timeout: float = 10.0) -> bool:
    """Report whether a GET request to URL succeeds.

    The request is made with ``stream=True`` so the response body is not
    downloaded; only the status line and headers are needed.

    Args:
        URL: Address to check.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests waits indefinitely, so one unresponsive host
            would stall a whole batch of checks.

    Returns:
        True (after printing ``<URL> ---> VALID``) when the request completes
        with a non-error status. False (after printing ``<URL> ---> INVALID``)
        when the status is 4xx/5xx or the request fails in any other way
        requests reports: connection error, timeout, too many redirects, or a
        malformed URL.
    """
    try:
        with requests.get(URL, stream=True, timeout=timeout) as response:
            # Raises HTTPError for 4xx/5xx responses.
            response.raise_for_status()
    except requests.exceptions.RequestException:
        # RequestException is the base class of HTTPError, ConnectionError,
        # Timeout, TooManyRedirects and the invalid-URL errors, so a single
        # bad link can no longer abort a loop over many URLs.
        print (URL, '---> INVALID')
        return False
    print (URL, '---> VALID')
    return True

Let's now apply the modified function to the top 100 shared links we have scraped from CrowdTangle

In [36]:
# Check every collected URL in turn; url_exists_stream prints the verdict for each.
for candidate_url in URL_list:
    url_exists_stream(candidate_url)

https://www.facebook.com/hanumansinghsirana/videos/1951535514949751/ ---> VALID
http://www.akhandbharatimes.com/ ---> INVALID
https://janganapp.page.link/X42f ---> VALID
https://www.facebook.com/pushpendrakuldelhi001/videos/332266691549294/ ---> VALID
https://sachkhabar.co.in/now-biden-wants-modis-help-immediately-only-india-can-save-the-world/ ---> INVALID
https://www.facebook.com/251541358337843 ---> VALID
https://sachkhabar.co.in/modi-governments-big-blow-to-zakir-naik/ ---> INVALID
https://www.facebook.com/462116500605383 ---> VALID
https://khabarbharattak.com/rohingya-go-back-to-myanmar/ ---> INVALID
https://khabarbharattak.com/due-to-this-big-belief-mukesh-ambani-got-200-years-old-olive-trees-for-his-house-know-why/ ---> VALID
https://khabarbharattak.com/these-bollywood-stars-whose-marriage-was-broken-in-a-few-years-some-for-9-months-and-some-for-2-years-see-the-full-list/ ---> VALID
https://khabarbharattak.com/karan-johar-asked-kajol-what-would-you-do-if-your-daughter-ran-away-w