In [15]:
from kafka import KafkaProducer
from fastapi import FastAPI, HTTPException
import mysql.connector
from pydantic import BaseModel
import json
import configparser
import pandas as pd
import boto3
from bs4 import BeautifulSoup
import re

In [16]:
def read_config(path=r'D:\OSG-AA-Products\Analytics_POD_Page_Category\config\configurations.ini'):
    """Load the INI configuration used by the page classifiers.

    Parameters
    ----------
    path : str or os.PathLike, optional
        Location of the INI file. Defaults to the original hard-coded
        Windows path, so existing zero-argument calls behave as before.

    Returns
    -------
    configparser.ConfigParser
        Parser populated from ``path`` (decoded as UTF-8).

    Notes
    -----
    ``ConfigParser.read`` silently skips files it cannot open, so a wrong
    ``path`` yields an *empty* parser; the failure then shows up later as a
    ``KeyError`` on the first section lookup.
    """
    config = configparser.ConfigParser()
    config.read(path, encoding='utf-8')
    return config

In [17]:
# URL fragment rules, one per entry: [retailer token, path token].
# NOTE(review): presumably the product-detail-page rules; the matcher
# functions in this notebook read their rules from the config file rather
# than from this list - confirm whether it is still needed.
pdp_list = [
    ['amazon', '/dp'],
    ['amazon', '/gp/aw/d/'],
    ['target', '/p/'],
    ['walmart', '/ip'],
    ['walmart', 'track'],
    ['costco', '.product'],
    ['cvs', '-prodid'],
    ['aldi', '/p/'],
    ['dollargeneral', '/product-page'],
    ['ebay', '/itm/'],
    ['kroger', '/p/'],
    ['samsclub', '/p/'],
    ['walgreens', 'product'],
    ['dm', '-p'],
    ['mediamarkt', 'product'],
    ['otto', '/p/'],
    ['rossmann', '/p/'],
    ['saturn', 'product'],
    ['biccamera', '/item/'],
    ['cosme', '/product/'],
    ['edion', '/detail.html?'],
    ['bestbuy', 'skuId'],
    ['noon', '/p/'],
    ['noon', '/p?o'],
    ['carrefouruae', '/p/'],
]

In [18]:
# Search-results URL rules. Most entries pair a retailer token with one or
# more URL fragments; the last three entries are single-fragment rules with
# no retailer token.
search_key = [
    ['amazon', 's?k='],
    ['target', 'searchTerm'],
    ['walmart', 'search'],
    ['walmart', 'query'],
    ['bestbuy', 'searchpage'],
    ['costco', 'CatalogSearch'],
    ['cvs', '/search'],
    ['dollargeneral', '/search-results'],
    ['ebay', '/sch/'],
    ['kroger', '/search'],
    ['samsclub', '/s/'],
    ['walgreens', '/search'],
    ['dm', '/search?query'],
    ['mediamarkt', '/search.html?'],
    ['otto', '/suche/'],
    ['rossmann', '/search?text'],
    ['saturn', '/search.html?query'],
    ['biccamera', '/search/'],
    ['edion', '/item_list.html?', 'keyword'],
    ['rakuten', '/search/'],
    ['bestbuy', '?st'],
    ['noon', '/search/'],
    ['noon', '/search?q'],
    ['instashop', '/search/'],
    ['carrefouruae', '/search?'],
    ['/search'],
    ['/s?'],
    ['/suche'],
]

In [19]:
# Deals / offers URL rules: a retailer token followed by one or more URL
# fragments.
deal_key = [
    ['amazon', '/goldbox'],
    ['amazon', '/offers'],
    ['amazon', '/angebot'],
    ['amazon', '/coupon'],
    ['target', '/top-deals'],
    ['target', '/circle'],
    ['walmart', '/m'],
    ['walmart', '/deals'],
    ['bestbuy', 'sale-page'],
    ['bestbuy', 'top-deals'],
    ['amazon', 'bestsellers', 'Best-Sellers'],
    ['aldi', '/weekly-specials'],
    ['kroger', '/savings/'],
    ['kroger', 'weekly-ad'],
    ['kroger', '/page/'],
    ['walgreens', '/offers'],
    ['amazon', 'Best-Sellers'],
    ['carrefouruae', '/c/clp_online-deals-promotion'],
]

In [20]:
# Review / Q&A URL rules (Amazon only in this list).
review = [
    ['amazon', '/ask', '/questions'],
    ['amazon', '/product-reviews/'],
    ['amazon', '/gp/aw/cr/'],
    ['amazon', '/gp/aw/reviews/'],
]

In [21]:
# Cart / checkout URL rules. The last two entries are single-fragment rules
# with no retailer token.
cart = [
    ['amazon', '/c/ref=mw_dp_buy_crt'],
    ['amazon', '/cart/'],
    ['amazon', '/huc/view.html?'],
    ['otto', '/basket/'],
    ['/cart'],
    ['/checkout'],
]

In [22]:
# Category / browse URL rules: [retailer token, URL fragment].
category = [
    ['target', '/c/'],
    ['walmart', 'browse'],
    ['walmart', '/cp'],
    ['costco', '/baby-kids'],
    ['costco', '/diapers-wipes'],
    ['cvs', '/baby-child'],
    ['dollargeneral', '/category-page'],
    ['samsclub', '/c/'],
    ['samsclub', '/b/'],
    ['mediamarkt', '/category/'],
    ['rossmann', '/c/'],
    ['saturn', '/category/'],
    ['biccamera', '/category/'],
    ['edion', '/item_list.html?c'],
    ['rakuten', '/category/'],
    ['bestbuy', 'abcat'],
]

In [23]:
def search_engine(URL):
    """Tell whether a search-engine marker occurs near the start of ``URL``.

    The markers are taken from ``config['PageCategory']['search_engine_page']``
    and are looked for only within the first 35 characters of ``URL``.

    Returns ``True`` on a hit; otherwise the function falls off the end and
    returns ``None`` (falsy).
    """
    # NOTE(review): eval() executes whatever expression the config file
    # holds; ast.literal_eval would be safer if the value is a plain literal.
    markers = eval(read_config()['PageCategory']['search_engine_page'])
    if any(URL.find(marker, 0, 35) != -1 for marker in markers):
        return True


def social_media(URL):
    """Tell whether a social-media marker occurs near the start of ``URL``.

    The markers are taken from ``config['PageCategory']['social_media_page']``
    and are looked for only within the first 45 characters of ``URL``.

    Returns ``True`` on a hit; otherwise the function falls off the end and
    returns ``None`` (falsy).
    """
    # NOTE(review): eval() executes whatever expression the config file
    # holds; ast.literal_eval would be safer if the value is a plain literal.
    markers = eval(read_config()['PageCategory']['social_media_page'])
    if any(URL.find(marker, 0, 45) != -1 for marker in markers):
        return True

def pdp_page(URL):
    """Check ``URL`` against the rules in ``config['PageCategory']['pdp']``.

    Each rule is a sequence of substrings; a rule matches when every one of
    its substrings occurs somewhere in ``URL``.

    Returns ``True`` as soon as one rule matches, ``None`` (implicitly) when
    none do.
    """
    # NOTE(review): eval() on config content - see if ast.literal_eval fits.
    rules = eval(read_config()['PageCategory']['pdp'])
    if any(all(fragment in URL for fragment in rule) for rule in rules):
        return True

def search_page(URL):
    """Check ``URL`` against the rules in ``config['PageCategory']['search']``.

    Each rule is a sequence of substrings; a rule matches when every one of
    its substrings occurs somewhere in ``URL``.

    Returns ``True`` as soon as one rule matches, ``None`` (implicitly) when
    none do.
    """
    # NOTE(review): eval() on config content - see if ast.literal_eval fits.
    rules = eval(read_config()['PageCategory']['search'])
    for rule in rules:
        if not all(fragment in URL for fragment in rule):
            continue
        return True

def deal_page(URL):
    """Check ``URL`` against the rules in ``config['PageCategory']['deal']``.

    Each rule is a sequence of substrings; a rule matches when every one of
    its substrings occurs somewhere in ``URL``.

    Returns ``True`` as soon as one rule matches, ``None`` (implicitly) when
    none do.
    """
    # NOTE(review): eval() on config content - see if ast.literal_eval fits.
    rules = eval(read_config()['PageCategory']['deal'])
    if any(all(fragment in URL for fragment in rule) for rule in rules):
        return True

def reviews_page(URL):
    """Check ``URL`` against the rules in ``config['PageCategory']['reviews']``.

    Each rule is a sequence of substrings; a rule matches when every one of
    its substrings occurs somewhere in ``URL``.

    Returns ``True`` as soon as one rule matches, ``None`` (implicitly) when
    none do.
    """
    # NOTE(review): eval() on config content - see if ast.literal_eval fits.
    rules = eval(read_config()['PageCategory']['reviews'])
    for rule in rules:
        if not all(fragment in URL for fragment in rule):
            continue
        return True

def cart_page(URL):
    """Check ``URL`` against the rules in ``config['PageCategory']['cart']``.

    Each rule is a sequence of substrings; a rule matches when every one of
    its substrings occurs somewhere in ``URL``.

    Returns ``True`` as soon as one rule matches, ``None`` (implicitly) when
    none do.
    """
    # NOTE(review): eval() on config content - see if ast.literal_eval fits.
    rules = eval(read_config()['PageCategory']['cart'])
    if any(all(fragment in URL for fragment in rule) for rule in rules):
        return True

def category_page(URL):
    """Decide whether ``URL`` looks like a category / browse page.

    Two independent checks are applied:

    1. A built-in Amazon rule: the URL contains both ``'amazon'`` and
       ``'i='`` but not ``'k='``.
    2. The configured rules in ``config['PageCategory']['category']``; each
       rule is a sequence of substrings that must all occur in ``URL``.

    The Amazon rule does not depend on the configured rules, so it is
    evaluated once, outside the loop. Previously it lived inside the loop
    body and was therefore never evaluated when the configured rule list was
    empty. For a non-empty list the result is unchanged.

    Returns ``True`` when either check succeeds, ``None`` (implicitly)
    otherwise.
    """
    # NOTE(review): eval() executes whatever expression the config file
    # holds; ast.literal_eval would be safer if the value is a plain literal.
    rules = eval(read_config()['PageCategory']['category'])

    if 'amazon' in URL and 'i=' in URL and 'k=' not in URL:
        return True

    if any(all(fragment in URL for fragment in rule) for rule in rules):
        return True

In [28]:
def get_pdp(URL, html_path, bucket="prod-osgsearchmedia"):
    """Classify a page, first from its URL and then from its archived HTML.

    Parameters
    ----------
    URL : str
        Page URL, tested against the URL matchers in a fixed priority order.
    html_path : str
        S3 object key of the stored HTML; only fetched when no URL matcher
        fires.
    bucket : str, optional
        S3 bucket holding the HTML. Defaults to the bucket that was
        previously hard-coded.

    Returns
    -------
    str
        One of ``"search_engine_page"``, ``"social_media_page"``,
        ``"pdp_page"``, ``"search_page"``, ``"deal_page"``,
        ``"reviews_page"``, ``"cart_page"``, ``"category_page"`` (URL match);
        or, from the HTML fallback, ``"No Data"`` (document has no text
        nodes), ``"pdp_page"`` or ``"other_page"``.
    """
    # Order matters: the first matcher that fires decides the label.
    url_rules = (
        (search_engine, "search_engine_page"),
        (social_media, "social_media_page"),
        (pdp_page, "pdp_page"),
        (search_page, "search_page"),
        (deal_page, "deal_page"),
        (reviews_page, "reviews_page"),
        (cart_page, "cart_page"),
        (category_page, "category_page"),
    )
    for matcher, label in url_rules:
        if matcher(URL):
            return label

    # Fallback: download the stored HTML and look for PDP keywords.
    config = read_config()
    client = boto3.client(
        service_name='s3',
        region_name='us-east-1',
        aws_access_key_id=config['S3Settings']['aws_access_key_id'],
        aws_secret_access_key=config['S3Settings']['aws_secret_access_key'])
    body = client.get_object(Bucket=bucket, Key=html_path)['Body']
    try:
        html_content = body.read()
    finally:
        # Release the underlying HTTP connection even if the read fails.
        body.close()

    soup = BeautifulSoup(html_content, 'html.parser')
    # `text=` is the older spelling of `string=` in Beautiful Soup; kept so
    # the call still works with older bs4 releases.
    text_nodes = soup.find_all(text=True)
    # find_all returns a (possibly empty) list, never None, so the former
    # `== None` test could not succeed; test for emptiness instead.
    if not text_nodes:
        return "No Data"

    # NOTE(review): eval() executes whatever expression the config file
    # holds; ast.literal_eval would be safer if the value is a plain literal.
    keys = eval(config['HTML_Keywords']['pdp'])
    # A text node counts only when its whole lower-cased content is in
    # `keys` (no stripping or substring search is done).
    hit_count = sum(1 for node in text_nodes if node.lower() in keys)
    # NOTE(review): the upper bound of 10 hits is unexplained in the
    # notebook - confirm the intent before tuning it.
    if 0 < hit_count < 10:
        return "pdp_page"
    return "other_page"


In [29]:
# Load the sample export and derive `html_path`: the part of each `htmlLink`
# that follows "/downloads/".
# NOTE(review): when the marker is absent, find() returns -1 and the slice
# silently starts at index 10 instead - confirm every link has the marker.
df = pd.read_csv('data6899.csv')
df['html_path'] = df['htmlLink'].map(
    lambda link: link[link.find("/downloads/") + len("/downloads/"):]
)

In [30]:
# Display the derived `html_path` column as a quick visual check.
df['html_path']

0     ae661eb6-9e70-4c6c-b42a-d54db50ba74b_41888_Zjh...
1     ae661eb6-9e70-4c6c-b42a-d54db50ba74b_41888_Zjh...
2     ae661eb6-9e70-4c6c-b42a-d54db50ba74b_41888_Zjh...
3     ae661eb6-9e70-4c6c-b42a-d54db50ba74b_41888_Zjh...
4     ae661eb6-9e70-4c6c-b42a-d54db50ba74b_41888_Zjh...
                            ...                        
87    86a6d7d2-3dd6-4854-ac9c-f1a7e1a81cb7_41888_ZjM...
88    86a6d7d2-3dd6-4854-ac9c-f1a7e1a81cb7_41888_ZjM...
89    86a6d7d2-3dd6-4854-ac9c-f1a7e1a81cb7_41888_ZjM...
90    86a6d7d2-3dd6-4854-ac9c-f1a7e1a81cb7_41888_ZjM...
91    86a6d7d2-3dd6-4854-ac9c-f1a7e1a81cb7_41888_ZjM...
Name: html_path, Length: 92, dtype: object

In [34]:
# Run get_pdp over every stored HTML key with one fixed placeholder URL and
# print the label. Because the URL is the same on every call, the differing
# labels in the output come from get_pdp's HTML-keyword fallback.
# Each call downloads one object from S3, so this loop is slow (the recorded
# run was interrupted).
for html in df['html_path']:
    print(get_pdp("https://www.amazon.com/asdsfdsfsdfsdf",html))

other_page
other_page
other_page
other_page
pdp_page
other_page
other_page
pdp_page
other_page
other_page
other_page
other_page
other_page
other_page
other_page
other_page
other_page


KeyboardInterrupt: 

In [19]:
# Bind the parsed configuration to the notebook-global `config` (later cells
# read it) and display the evaluated `reviews` rule list.
# NOTE(review): eval() executes whatever expression the config file holds.
config = read_config()
eval(config['PageCategory']['reviews'])

[['amazon', '/ask', '/questions'],
 ['amazon', '/product-reviews/'],
 ['amazon', '/gp/aw/cr/'],
 ['amazon', '/gp/aw/reviews/']]

In [10]:
# Rebinds `df` to a different export (hard-coded absolute Windows path); the
# frame loaded from 'data6899.csv' earlier is no longer reachable by name.
df = pd.read_csv(r"D:\OSG-AA-Products\Analytics_POD_PDP\data\data1859_ped.csv")

In [4]:
# Preview the first rows of the loaded export.
df.head()

Unnamed: 0,UUID,Resp Id,Keyword Used,URL,Time Spent(sec),Search Domain,Latitude,Longitude,Total Time Spent(secs),city,...,startTime,clickedUrl,deviceName,osVersion,videoLink,audioLink,cameraLink,imageLink,htmlLink,PDP_tag
0,81c42915-f72c-49f4-906f-7a733ed448df,00d86d41-9c5c-44a5-8b23-5a5109360a39,,https://www.google.com/,18.0,www.google.com,,,1003.0,,...,1970-01-20 03:20:45.951,,Vetrivel R?s iPad,14.6,[https://app.o360.online/downloads/NTg0NmJjMzQ...,[],[],[],[],search_page
1,81c42915-f72c-49f4-906f-7a733ed448df,00d86d41-9c5c-44a5-8b23-5a5109360a39,,https://in.search.yahoo.com/?fr2=inr,108.0,in.search.yahoo.com,,,0.0,,...,1970-01-20 03:20:45.969,,Vetrivel R?s iPad,14.6,[https://app.o360.online/downloads/NTg0NmJjMzQ...,[],[],[],[],search_page
2,81c42915-f72c-49f4-906f-7a733ed448df,00d86d41-9c5c-44a5-8b23-5a5109360a39,google,https://www.google.co.in/search?q=google,7.0,www.google.co.in,,,0.0,,...,1970-01-20 03:20:46.078,,Vetrivel R?s iPad,14.6,[https://app.o360.online/downloads/NTg0NmJjMzQ...,[],[],[],[],search_page
3,81c42915-f72c-49f4-906f-7a733ed448df,00d86d41-9c5c-44a5-8b23-5a5109360a39,,https://www.google.com/,6.0,www.google.com,,,0.0,,...,1970-01-20 03:20:46.084,,Vetrivel R?s iPad,14.6,[https://app.o360.online/downloads/NTg0NmJjMzQ...,[],[],[],[],search_page
4,81c42915-f72c-49f4-906f-7a733ed448df,00d86d41-9c5c-44a5-8b23-5a5109360a39,instashop,https://www.google.com/search?q=instashop&sour...,11.0,www.google.com,,,0.0,,...,1970-01-20 03:20:46.091,,Vetrivel R?s iPad,14.6,[https://app.o360.online/downloads/NTg0NmJjMzQ...,[],[],[],[],search_page


In [7]:
# Label every row by passing it to `check_pages`.
# NOTE(review): `check_pages` is not defined in any visible cell of this
# notebook - it must come from kernel state; define or import it so the
# notebook survives Restart & Run All.
df['page_category'] = df.apply(check_pages, axis=1)

In [8]:
# Show how many rows received each page-category label.
df['page_category'].value_counts()

Not Found             1972
search_engine_page     859
PDP                    824
Search Page            794
Cart Page               84
social_media_page       36
Deals Page              10
Reviews Page             9
Category Page            5
Name: page_category, dtype: int64

In [9]:
# Persist the labelled frame, without the index column, to a hard-coded
# absolute Windows path.
df.to_csv(r"D:\OSG-AA-Products\Analytics_POD_PDP\data\data1859_ped_page_category.csv", index=False)

In [19]:
# Build one output record per input row of `data_dict`.
# NOTE(review): `data_dict` is created in a cell that appears further down
# in this notebook; move that cell above this one so a fresh top-to-bottom
# run works.
output_dict_full = []
# Read the configuration once; later cells use this notebook-global.
config = read_config()
for record in data_dict:
    url = record['URL']
    uuid = record['UUID']
    htmlLink = record['htmlLink']
    resp_id = record['Resp Id']
    # Storage key = the part of the link after "/downloads/" (11 characters).
    htmlPath = htmlLink[htmlLink.find("/downloads/")+11:]

    # Call the PDP model exactly once per row. It was previously called
    # twice with identical arguments, and the first result was never used.
    pdp_tag = get_pdp(url, htmlPath)

    # Record that a later cell sends through Kafka.
    output_dict = {'UUID':uuid, 'Resp Id':resp_id, 'URL':url, 'HTML_Link':htmlLink, 'PDP_Tag':pdp_tag}

    # Collect the record; nothing is written to a database in this cell.
    output_dict_full.append(output_dict)
    print(output_dict['PDP_Tag'])

skipped
search_page
NO
skipped
NO


KeyboardInterrupt: 

In [124]:
# Publish every collected record, JSON-encoded as UTF-8, to the topic named
# in the configuration.
# NOTE(review): the broker address is hard-coded, and `config` must already
# exist from an earlier cell.
producer = KafkaProducer(
    bootstrap_servers=['54.175.181.170:9092'],
    value_serializer=lambda x: json.dumps(x).encode('utf-8'),
)
# Resolve the topic once instead of re-evaluating it for every record.
# NOTE(review): eval() executes whatever expression the config file holds.
topic = eval(config['KafkaSettings']['Produce_topic'])
for row in output_dict_full:
    producer.send(topic, value=row)
# send() only queues the message; block until the queue has been delivered.
producer.flush()

In [109]:
# Kafka producer whose values are serialised as UTF-8 encoded JSON.
# NOTE(review): the broker address is hard-coded.
producer = KafkaProducer(
    bootstrap_servers=['54.175.181.170:9092'],
    value_serializer=lambda payload: json.dumps(payload).encode('utf-8'),
)

In [110]:
# Publish every collected record to the hard-coded topic 'test_python_9'.
for row in output_dict_full:
    producer.send('test_python_9', value=row)
# send() only queues the message; block until the queue has been delivered.
producer.flush()

In [78]:
# Load the sample template (hard-coded absolute Windows path) and convert it
# to a list with one dict per row.
df = pd.read_csv(r"D:\OSG-AA-Products\Analytics_POD_PDP\SampleTemplate.csv")
data_dict = df.to_dict(orient='records')

In [125]:
# Resolve the Kafka topic name from the configuration.
# NOTE(review): eval() executes whatever expression the config file holds;
# ast.literal_eval would be safer if the value is a plain quoted string.
topic = eval(config['KafkaSettings']['Produce_topic'])

In [126]:
# Display the resolved topic name.
topic

'test_python_9'