
## Calculate keyword similarity score (training) <br>

In the following, we use the overlap coefficient to compute the keyword similarity score of the keywords extracted from the training data:

In [1]:
import os
from bs4 import BeautifulSoup
import json
import pandas as pd
import numpy as np
import math
from googletrans import Translator # pip install googletrans==4.0.0-rc1
translator = Translator()

In [None]:
# Load the per-pair training table; the cells below read its pair_id, url1_lang,
# url2_lang, keywords1 and keywords2 columns.
train_df = pd.read_csv('train/_TRAIN_details_in_df.csv')

In [109]:
# Empty result table. The 'score' column is not declared here; it appears once the
# scoring loop writes its first row.
keyword_columns = [
    "pair_id",
    "url1_lang",
    "url2_lang",
    "key1",
    "key2",
    "key1_translated",
    "key2_translated",
]
keywords_score_df = pd.DataFrame(columns=keyword_columns)



### Overlap coefficient for keywords <br>

We treat keywords in the same way as named entities (see `03_get_training_title_named_entity_similarity_score.ipynb`) and use the overlap coefficient, $\mathrm{overlap}(A, B) = |A \cap B| / \min(|A|, |B|)$, to measure similarity. Whenever keywords are not available for at least one of the two articles, the score is set to -1.0.<br><br>

Analogously to the case of named entities, we translate and normalize keywords to English before comparing them. This requires extra care because of the limitations of the Google Translate API. <br><br>

We start by comparing the pairs that do not require translation and, afterwards, we deal with the ones that require translation.

In [7]:
def normalize(input_list):
    """Return cleaned, lower-cased copies of the given keyword strings.

    Each string is lower-cased; then every '.' is removed, every '-' becomes a
    space and every occurrence of "'s" is dropped, in that order. The input
    list is not modified.
    """
    cleaned = []
    for keyword in input_list:
        text = keyword.lower()
        text = text.replace('.', '')
        text = text.replace('-', ' ')
        text = text.replace("'s", "")
        cleaned.append(text)
    return cleaned


def overlapScore(key1, key2):
    """Overlap coefficient of two keyword collections.

    Both arguments are converted to sets (duplicates are ignored). The result
    is |A ∩ B| / min(|A|, |B|), or the integer 0 when the sets share no
    element, which includes the case where either set is empty.
    """
    first, second = set(key1), set(key2)
    shared = first & second
    # An empty intersection also covers "one side is empty", so the division
    # below can never see a zero denominator.
    if not shared:
        return 0
    return len(shared) / min(len(first), len(second))


# First pass: score every pair directly on its raw keyword strings. Pairs whose two
# articles share a language can be compared as they are; pairs written in different
# languages are remembered in `trans` because their keywords still have to be
# translated to English before a meaningful comparison is possible.
trans = []           # row indices of the cross-language pairs
score_records = []   # one dict per training pair, turned into a DataFrame after the loop

for i, row in train_df.iterrows():
    pair = row['pair_id']
    # -1 marks pairs where at least one side has no keywords (value is not a string).
    score = -1
    key1 = ''
    key2 = ''
    key1_translated = ''
    key2_translated = ''
    print('--------------',i,'---------------')
    print(pair)
    if isinstance(row['keywords1'], str) and isinstance(row['keywords2'], str):
        key1 = row['keywords1'].split(",")
        key2 = row['keywords2'].split(",")

        # Until a translation is written, the "translated" columns mirror the originals.
        key1_translated = row['keywords1']
        key2_translated = row['keywords2']
        score = overlapScore(key1, key2)

        if row['url1_lang'] != row['url2_lang']:
            trans.append(i)
    entry = {'pair_id':pair,'url1_lang':row['url1_lang'],'url2_lang':row['url2_lang'],'key1':row['keywords1'],'key2':row['keywords2'],'key1_translated':key1_translated,'key2_translated':key2_translated,'score':score}
    print('entry:\n',entry)
    score_records.append(entry)

# Build the rows in one go: DataFrame.append was removed in pandas 2.0 and copying the
# whole frame for every row is quadratic in the number of pairs.
scored_rows = pd.DataFrame(
    score_records,
    columns=['pair_id', 'url1_lang', 'url2_lang', 'key1', 'key2',
             'key1_translated', 'key2_translated', 'score'],
)
if keywords_score_df.empty:
    keywords_score_df = scored_rows
else:
    # Keep any rows that are already present, exactly as the per-row append did.
    keywords_score_df = pd.concat([keywords_score_df, scored_rows], ignore_index=True)



In [18]:
# Translating keywords through googletrans sometimes times out. `midpoint` is the row
# index to resume from, so an interrupted run can be restarted where it stopped
# instead of translating everything again.

midpoint = 0

for i, row in train_df.iterrows():
    pair = row['pair_id']
    score = -1
    key1 = ''
    key2 = ''
    key1_translated = ''
    key2_translated = ''

    # Rows with missing keywords are left untouched in keywords_score_df.
    has_both_keywords = isinstance(row['keywords1'], str) and isinstance(row['keywords2'], str)
    if not has_both_keywords:
        continue

    score = 0
    key1 = row['keywords1'].split(",")
    key2 = row['keywords2'].split(",")

    lang1 = row['url1_lang']
    lang2 = row['url2_lang']

    # Only cross-language pairs at or after the resume point are translated.
    needs_translation = lang1 != lang2 and i >= midpoint
    if not needs_translation:
        continue

    print('--------------',i,'---------------')
    print(pair)
    print(lang1,' ',lang2)
    print('og 1:',row['keywords1'])
    print('og 2:',row['keywords2'])

    # Each keyword is translated on its own, with the article language as the source.
    key1_translated = [translator.translate(k,src=lang1).text for k in key1]

    # Exotic cases are handled individually: this site-name keyword is skipped for
    # the second article (special case kept from the original run).
    key2_translated = [
        translator.translate(k,src=lang2).text
        for k in key2
        if k != 'www.ynet.co.il'
    ]

    print('trans 1:',key1_translated)
    print('trans 2:',key2_translated)

    # When one side is already English the untranslated keywords are compared too;
    # the final score is the better of the two comparisons.
    if 'en' in (lang1, lang2):
        score = overlapScore(normalize(key1),normalize(key2))
    score_trans = overlapScore(normalize(key1_translated),normalize(key2_translated))
    print('score:',score)
    print('score_trans:',score_trans)
    score = max(score, score_trans)
    print('final score:',score)

    key1_translated = ','.join(key1_translated)
    key2_translated = ','.join(key2_translated)
    print('key1trans:',key1_translated)

    # Overwrite the first-pass values for this row with the translated result.
    keywords_score_df.at[i,'key1_translated'] = key1_translated
    keywords_score_df.at[i,'key2_translated'] = key2_translated
    keywords_score_df.at[i,'score'] = score
    print('iloc:',keywords_score_df.iloc[i]['key1_translated'])



In [11]:
# Inspect the score table (the notebook shows a truncated view of the rows).
keywords_score_df

Unnamed: 0,pair_id,url1_lang,url2_lang,key1,key2,key1_translated,key2_translated,score
0,1484084337_1484110209,en,en,"Law and order,UnitedStates,Martinsburg,Crime,W...","LatinAmericaandCaribbean,Latin America and Car...","Law and order,UnitedStates,Martinsburg,Crime,W...","LatinAmericaandCaribbean,Latin America and Car...",0.166667
1,1484396422_1483924666,en,en,,"smg2_world,smg_europe,smg2_news",,,-1.000000
2,1484698254_1483758694,en,en,,"Full Coverage Times of Israel podcasts,US emba...",,,-1.000000
3,1576314516_1576455088,en,en,"Zomato,zomatoubereatsbusinessacquisitionindiaa...","swiggy,Swiggy,indian online food delivery mark...","Zomato,zomatoubereatsbusinessacquisitionindiaa...","swiggy,Swiggy,indian online food delivery mark...",0.500000
4,1484036253_1483894099,en,en,"India,ISRO,lunarorbiter,landonthemoon","India,space","India,ISRO,lunarorbiter,landonthemoon","India,space",0.500000
...,...,...,...,...,...,...,...,...
4959,1586195445_1598778991,tr,tr,"BirleşmişMilletler,Yemen,Birleşmiş Milletler,G...","BirleşmişMilletler,Yemen,Birleşmiş Milletler,G...","BirleşmişMilletler,Yemen,Birleşmiş Milletler,G...","BirleşmişMilletler,Yemen,Birleşmiş Milletler,G...",0.800000
4960,1590915424_1590940388,tr,tr,"İspanya,La Liga,RealMadrid,Real Madrid,LaLiga,...","laliga,la liga,koronavirüs,İspanya 1. Futbol L...","İspanya,La Liga,RealMadrid,Real Madrid,LaLiga,...","laliga,la liga,koronavirüs,İspanya 1. Futbol L...",0.000000
4961,1526157103_1492737005,tr,tr,"MASASINDA',OLMADI:,'Borca,DEV,SATIŞ,BOĞULAN,ÇA...","Ahmet Nur Çebi,Beşiktaş,transfer,Türkiye Futbo...","MASASINDA',OLMADI:,'Borca,DEV,SATIŞ,BOĞULAN,ÇA...","Ahmet Nur Çebi,Beşiktaş,transfer,Türkiye Futbo...",0.000000
4962,1603274500_1618292937,tr,tr,"Rasim Yüksel,Güncel,Haber,Ergene,RasimYüksel,K...","Koronavirüs,Yaşam,Yeşiltepe,Haber","Rasim Yüksel,Güncel,Haber,Ergene,RasimYüksel,K...","Koronavirüs,Yaşam,Yeşiltepe,Haber",0.500000


In [17]:
# Persist the keyword scores for the training pairs; index=False keeps the
# DataFrame's row index out of the CSV file.
path = 'train/_TRAIN_keywords_score.csv'
keywords_score_df.to_csv(path,index=False)
