In [1]:
import re
import time
import warnings

import requests
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [2]:
# All FAQ pages share one endpoint and differ only in the last component of
# the `id` query parameter, so the URL list is built from the page ids.
_FAQ_URL_TEMPLATE = 'https://www.indiantradeportal.in/vs.jsp?lang=0&id=0,55,{}'
_FAQ_PAGE_IDS = (
    276, 287, 280, 15502, 284, 286,
    283, 15627, 15535, 288, 15549, 23222,
)

# Pages to scrape, in processing order.
FAQ_URLS = [_FAQ_URL_TEMPLATE.format(page_id) for page_id in _FAQ_PAGE_IDS]

In [3]:
# A question mark immediately followed by one uppercase ASCII letter;
# the letter is captured as group 1.
pattern = r'\?([A-Z])'


def replace_match(match):
    """Replacement callback for ``re.sub``.

    If capture group 1 is uppercase, return ``"?"``, a newline and that
    group, which puts a line break right after the question mark.
    Otherwise return the matched text unchanged.
    """
    captured = match.group(1)
    return f"?\n{captured}" if captured.isupper() else match.group(0)

In [6]:
# Ordered (old, new) substitutions applied to the raw accordion text.
# Order matters: e.g. 'Answer: ' must be handled before 'Answer:', and the
# '??' collapse must run after both of them.
# NOTE(review): the last three entries look like page-specific fixes for
# answers that are not introduced by an 'Answer:' label - confirm.
_FAQ_TEXT_REPLACEMENTS = (
    ('\xa0', ''),
    ('Question:', '\nQuestion:'),
    ('“', ''),
    ('”', ''),
    ("'", ''),
    ('Answer: ', '?'),
    ('Answer:', '?'),
    ('??', '?'),
    ('RatesThe', 'Rates?The'),
    ('cost.The', 'cost?The'),
    ('exporters.The', 'exporters?The'),
)


def _clean_faq_text(raw_text: str) -> str:
    """Apply the ordered substitutions to ``raw_text`` and strip the result."""
    cleaned = raw_text
    for old, new in _FAQ_TEXT_REPLACEMENTS:
        cleaned = cleaned.replace(old, new)
    return cleaned.strip()


def get_data(pattern: str, url: str, timeout: float = 30):
    """Download one FAQ page and split its accordion text into segments.

    Args:
        pattern: Regular expression handed to ``re.sub`` together with
            ``replace_match``; the substituted text is then split on
            newlines.
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` can block indefinitely.

    Returns:
        A ``(subcategory, results)`` tuple: the text of the page's heading
        element and the list of non-empty text segments, in page order.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the heading or the ``#accordion`` element is
            missing from the page.
    """
    response = requests.get(url, timeout=timeout)
    # Fail here with the HTTP status instead of parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    heading = soup.select_one('#inner-left-side > div > div:nth-child(2) > p > font > strong > span')
    if heading is None:
        raise ValueError(f'Subcategory heading not found on {url}')
    subcategory = heading.text

    accordion = soup.select_one('#accordion')
    if accordion is None:
        raise ValueError(f'#accordion element not found on {url}')

    cleaned = _clean_faq_text(accordion.text)
    segments = re.sub(pattern, replace_match, cleaned).split('\n')
    results = [segment for segment in segments if segment != '']
    return subcategory, results

In [7]:
def make_individual_dataset(metadata, url, subcategory, results: list):
    """Turn an alternating question/answer list into six parallel columns.

    Even positions of ``results`` are taken as questions and odd positions
    as answers. Only complete pairs are emitted, so all six returned lists
    always have the same length. A trailing segment without a partner is
    dropped and reported with a warning, since it means the page text was
    not split into clean pairs.

    Args:
        metadata: Mapping read for its ``'source'`` and ``'category'`` keys.
        url: Value repeated once per pair in the returned ``urls`` list.
        subcategory: Value repeated once per pair in the returned
            ``subcategories`` list.
        results: Alternating question and answer segments.

    Returns:
        ``(questions, answers, sources, categories, subcategories, urls)``,
        six lists with one entry per complete question/answer pair.
    """
    pair_count = len(results) // 2
    if len(results) % 2:
        warnings.warn(
            f'{url}: odd number of segments ({len(results)}); '
            'dropping the trailing unpaired segment',
            stacklevel=2,
        )

    # Stop both slices at 2 * pair_count so an unpaired trailing segment
    # cannot make `questions` longer than the other columns.
    questions = list(results[0:2 * pair_count:2])
    answers = list(results[1:2 * pair_count:2])
    sources = [metadata['source'] for _ in range(pair_count)]
    categories = [metadata['category'] for _ in range(pair_count)]
    subcategories = [subcategory] * pair_count
    urls = [url] * pair_count

    return questions, answers, sources, categories, subcategories, urls

In [9]:
# Column accumulators for the combined dataset. Each list gains one entry
# per question/answer pair, across all FAQ pages.
questions = []
answers = []
sources = []
categories = []
subcategories = []
urls = []

for faq_url in FAQ_URLS:
    page_subcategory, results = get_data(pattern, faq_url)

    metadata = {
        'source': 'INDIAN TRADE PORTAL',
        'category': '3. 정책 및 무역',
        'subcategory': page_subcategory,
        'url': faq_url,
    }

    # The per-page columns get their own names so the loop variable and the
    # page's subcategory string are not rebound to lists.
    (
        page_questions,
        page_answers,
        page_sources,
        page_categories,
        page_subcategories,
        page_urls,
    ) = make_individual_dataset(metadata, faq_url, page_subcategory, results)

    questions += page_questions
    answers += page_answers
    sources += page_sources
    categories += page_categories
    subcategories += page_subcategories
    urls += page_urls

# Build the frame once from the finished columns, then write it to disk
# without the index column.
df = pd.DataFrame({
    'question': questions,
    'answer': answers,
    'source': sources,
    'category': categories,
    'subcategory': subcategories,
    'url': urls,
})
df.to_csv('indian_trade_portal_faq.csv', index=False)

Frequently Asked QuestionsImport-Export Code Number https://www.indiantradeportal.in/vs.jsp?lang=0&id=0,55,276
Frequently Asked QuestionsGeneral Provisions Regarding Imports And Exports https://www.indiantradeportal.in/vs.jsp?lang=0&id=0,55,287
Frequently Asked QuestionsBanking/RBI Regulations https://www.indiantradeportal.in/vs.jsp?lang=0&id=0,55,280
Frequently Asked QuestionsMerchandise Exports from India Scheme (MEIS) https://www.indiantradeportal.in/vs.jsp?lang=0&id=0,55,15502
Frequently Asked QuestionsDuty Exemption / Remission Schemes https://www.indiantradeportal.in/vs.jsp?lang=0&id=0,55,284
Frequently Asked QuestionsEPCG https://www.indiantradeportal.in/vs.jsp?lang=0&id=0,55,286
Frequently Asked QuestionsDeemed Exports https://www.indiantradeportal.in/vs.jsp?lang=0&id=0,55,283
Frequently Asked QuestionsGoods and Services Tax https://www.indiantradeportal.in/vs.jsp?lang=0&id=0,55,15627
Frequently Asked QuestionsE-SEAL https://www.indiantradeportal.in/vs.jsp?lang=0&id=0,55,15535


In [10]:
# Read the exported CSV back from disk and display its last five rows as a
# quick check of what was written.
df = pd.read_csv('indian_trade_portal_faq.csv')
df.tail(n=5)

Unnamed: 0,question,answer,source,category,subcategory,url
254,Question: Whether RoDTEP benefit can be claime...,The benefit under the scheme can be claimed on...,INDIAN TRADE PORTAL,3. 정책 및 무역,Frequently Asked QuestionsRoDTEP SCHEME,https://www.indiantradeportal.in/vs.jsp?lang=0...
255,Question: If a product of an exporter has not ...,The RoDTEP scheme operates under the budgetary...,INDIAN TRADE PORTAL,3. 정책 및 무역,Frequently Asked QuestionsRoDTEP SCHEME,https://www.indiantradeportal.in/vs.jsp?lang=0...
256,Question: Whether there is concept of special ...,"No, as per the Notification No. 19/2021 Dt. 17...",INDIAN TRADE PORTAL,3. 정책 및 무역,Frequently Asked QuestionsRoDTEP SCHEME,https://www.indiantradeportal.in/vs.jsp?lang=0...
257,Question: There are some exporters who have be...,The Notification No. 19/ 2021 clearly speaks t...,INDIAN TRADE PORTAL,3. 정책 및 무역,Frequently Asked QuestionsRoDTEP SCHEME,https://www.indiantradeportal.in/vs.jsp?lang=0...
258,Question: Whether e-commerce exports will be e...,The benefit of the scheme may be eligible prov...,INDIAN TRADE PORTAL,3. 정책 및 무역,Frequently Asked QuestionsRoDTEP SCHEME,https://www.indiantradeportal.in/vs.jsp?lang=0...
