# Stage 4: Matching
## Import required packages


In [1]:
# import py_entitymatching package
import py_entitymatching as em
import pandas as pd
import re
import nltk



## Read the input tables

In [2]:
# specify filepaths for tables A and B. 
path_A = 'spoj/json/spoj_blocking.csv' # patoh_A is the file path where table_A.csv is stored
path_B = 'codechef/data/codechef_blocking.csv' # path_B is the file path where table_B.csv is stored

In [3]:
# read table A; table A has 'ID' as the key attribute
A = em.read_csv_metadata(path_A)

# read table B; table B has 'ID' as the key attribute
B = em.read_csv_metadata(path_B)
A.rename(columns={"Unnamed: 0":"ID"}, inplace=True)
B.rename(columns={"Unnamed: 0":"ID"}, inplace=True)

No handlers could be found for logger "py_entitymatching.io.parsers"


In [4]:
A["words"] = A["description"].fillna("") + A["input"].fillna("") + A["output"].fillna("")
A = A.drop(["description", "input", "output"], axis=1)
B["words"] = B["description"].fillna("") + B["input"].fillna("") + B["output"].fillna("")
B = B.drop(["description", "input", "output"], axis=1)

In [5]:
# Step 1
def std_words(df):
    letters_only = re.sub("[^a-zA-Z]", " ", df["words"]) # letters only, drop numbers & symbols
    words = letters_only.lower().split() # lower case, split by word                  
    stops = set((nltk.corpus.stopwords.words("english") + 
                ['a','b','c','d','e','f','g','h','i','j','k','l','m','n',
                'o','p','q','r','s','t','u','v','w','x','y','z'])) # general stopwords & letters  
    meaningful_words = [w for w in words if not w in stops] # remove stopwords
    return( " ".join( meaningful_words)) # return re-joined string

# Step 2
A["words"] = A.apply(std_words, axis=1)
B["words"] = B.apply(std_words, axis=1)
em.set_key(A, "ID")
em.set_key(B, "ID")

True

## Read in candidate set

In [40]:
# read C
C = em.read_csv_metadata('candidate_set.csv', ltable=A, rtable=B)

In [41]:
table_A = pd.read_csv("spoj/json/problems.csv")
table_B = pd.read_csv("codechef/data/codechef_problems.csv")

In [42]:
C["ltable_url"] = table_A.loc[C.ltable_ID.values,"url"].values
C["rtable_url"] = table_B.loc[C.rtable_ID.values,"link"].values

In [43]:
C["ltable_problem_code"] = table_A.loc[C.ltable_ID.values,"title"].values
C["ltable_problem_code"] = table_A.loc[C.ltable_ID.values,"title"].apply(lambda x: x.split(" - ")[0]).values
C["rtable_problem_code"] = table_B.loc[C.rtable_ID.values,"problem_code"].values

In [44]:
C["ltable_problem_title"] = table_A.loc[C.ltable_ID.values,"title"].apply(lambda x:x.split(" - ")[1]).values
C["rtable_problem_title"] = table_B.loc[C.rtable_ID.values,"title"].values

In [45]:
def my_function(x, y):
    # x, y will be of type pandas series
    
    # get name attribute
    x_words = x['words'].split(" ")
    y_words = y['words'].split(" ")
    
    if em.cosine(x_words, y_words) > 0.5:
        return False
    else:
        return True
bb = em.BlackBoxBlocker()
bb.set_black_box_function(my_function)

In [46]:
C1 = bb.block_candset(C, n_jobs=2)
C1

0%                          100%
[##############################] | ETA: 00:00:00
Total time elapsed: 00:00:11


Unnamed: 0,_id,ltable_ID,rtable_ID,ltable_url,rtable_url,ltable_problem_code,rtable_problem_code,ltable_problem_title,rtable_problem_title
2,2,0,5166,http://www.spoj.com/problems/TEST,https://www.codechef.com/problems/TEST,TEST,TEST,"Life, the Universe, and Everything","Life, the Universe, and Everything"
1257,1257,31,2205,http://www.spoj.com/problems/CAPPIZZA,https://www.codechef.com/problems/CDS008,CAPPIZZA,CDS008,Caper Pizza,Divide it Equally
1470,1470,64,5005,http://www.spoj.com/problems/AMR11F,https://www.codechef.com/problems/FLOORSMV,AMR11F,FLOORSMV,Magical Bridges,Moving between Floors
1735,1735,74,282,http://www.spoj.com/problems/FASHION,https://www.codechef.com/problems/CDSE03,FASHION,CDSE03,Fashion Shows,IQ
1736,1736,74,2885,http://www.spoj.com/problems/FASHION,https://www.codechef.com/problems/VITC04,FASHION,VITC04,Fashion Shows,Ramp Walk
1856,1856,102,3241,http://www.spoj.com/problems/COINTOSS,https://www.codechef.com/problems/SOPC04,COINTOSS,SOPC04,Coin Tosses,Toss the Coin
1903,1903,119,4537,http://www.spoj.com/problems/HIGH,https://www.codechef.com/problems/HIGH,HIGH,HIGH,Highways,Highways
1909,1909,125,2440,http://www.spoj.com/problems/IOPC1200,https://www.codechef.com/problems/IOPC1200,IOPC1200,IOPC1200,Hardware upgrade,Hardware upgrade
1922,1922,127,340,http://www.spoj.com/problems/IOPC1201,https://www.codechef.com/problems/IOPC1201,IOPC1201,IOPC1201,Rubiks cube,Rubiks cube
2065,2065,131,2962,http://www.spoj.com/problems/IOPC1202,https://www.codechef.com/problems/IOPC1202,IOPC1202,IOPC1202,Quadrilaterals,Quadrilaterals


In [48]:
D = C1.sample(400, random_state=44)

In [47]:
# display C1
dbg = em.debug_blocker(C1, A, B, output_size=5, verbose=True)
dbg

Unnamed: 0,_id,similarity,ltable_ID,rtable_ID,ltable_title,ltable_words,rtable_title,rtable_words
0,0,0.444444,231,2107,LGIC,given sequance natural numbers find th term sequence one natural number one natural number th te...,CDVA1501,given sequence natural numbers find th term ansingle line containing natural number nprint th te...
1,1,0.431373,4775,3874,MAIN72,given array integers want find sum integers expressed sum least one subset given array first lin...,COMB4SUM,special sum numbers defined denotes absolute value first line contains number test cases follow ...
2,2,0.428571,2307,3513,CEQU,let us see following equation ax given three positive integers determine whether exists least on...,GOC203,prof vats nit hamirpur given problem whole class imposed condition give attendance solve within ...
3,3,0.428571,4651,4748,NOVICE21,given three integers find many integers inclusive divisible first line contains number test case...,LCM,given compute sum lcm pairs positive integers integer divides give answer modulo first line cont...
4,4,0.428571,4417,2598,CNTTREE,given tree need count many subtrees diameter first line contains number test cases test cases fo...,TRSUM,let lx denote level node rooted tree lx root otherwise lx ly parent rooted tree need calculate s...


In [39]:
C

Unnamed: 0,_id,ltable_ID,rtable_ID,ltable_url,rtable_url,ltable_problem_code,rtable_problem_code,ltable_problem_title,rtable_problem_title
2,2,0,5166,http://www.spoj.com/problems/TEST,https://www.codechef.com/problems/TEST,TEST,TEST,"Life, the Universe, and Everything","Life, the Universe, and Everything"
1257,1257,31,2205,http://www.spoj.com/problems/CAPPIZZA,https://www.codechef.com/problems/CDS008,CAPPIZZA,CDS008,Caper Pizza,Divide it Equally
1735,1735,74,282,http://www.spoj.com/problems/FASHION,https://www.codechef.com/problems/CDSE03,FASHION,CDSE03,Fashion Shows,IQ
1736,1736,74,2885,http://www.spoj.com/problems/FASHION,https://www.codechef.com/problems/VITC04,FASHION,VITC04,Fashion Shows,Ramp Walk
1856,1856,102,3241,http://www.spoj.com/problems/COINTOSS,https://www.codechef.com/problems/SOPC04,COINTOSS,SOPC04,Coin Tosses,Toss the Coin
1903,1903,119,4537,http://www.spoj.com/problems/HIGH,https://www.codechef.com/problems/HIGH,HIGH,HIGH,Highways,Highways
1909,1909,125,2440,http://www.spoj.com/problems/IOPC1200,https://www.codechef.com/problems/IOPC1200,IOPC1200,IOPC1200,Hardware upgrade,Hardware upgrade
1922,1922,127,340,http://www.spoj.com/problems/IOPC1201,https://www.codechef.com/problems/IOPC1201,IOPC1201,IOPC1201,Rubiks cube,Rubiks cube
2065,2065,131,2962,http://www.spoj.com/problems/IOPC1202,https://www.codechef.com/problems/IOPC1202,IOPC1202,IOPC1202,Quadrilaterals,Quadrilaterals
2066,2066,132,3375,http://www.spoj.com/problems/IOPC1203,https://www.codechef.com/problems/IOPC1203,IOPC1203,IOPC1203,Crazy texting,Crazy texting


In [49]:
D.head()

Unnamed: 0,_id,ltable_ID,rtable_ID,ltable_url,rtable_url,ltable_problem_code,rtable_problem_code,ltable_problem_title,rtable_problem_title
28171,28171,3067,3283,http://www.spoj.com/problems/ESYRCRTN,https://www.codechef.com/problems/GCDTREE,ESYRCRTN,GCDTREE,Easy Recursion,GCD on Tree
48572,48572,5087,624,http://www.spoj.com/problems/CODESPTA,https://www.codechef.com/problems/IBT,CODESPTA,IBT,2s Complement,Iterated Bitcount Function
4782,4782,526,4764,http://www.spoj.com/problems/OPC1708C,https://www.codechef.com/problems/EXPCOMM,OPC1708C,EXPCOMM,Factorial large,Exponentiation Commutativity
42150,42150,4326,3544,http://www.spoj.com/problems/ARRANGE2,https://www.codechef.com/problems/CC2,ARRANGE2,CC2,Rearranging Digits,Rihanna and Fibonacci
38898,38898,4157,3158,http://www.spoj.com/problems/QCJ3,https://www.codechef.com/problems/QCJ6,QCJ3,QCJ6,The Game,THE GAME


In [50]:
D.loc[:,["ltable_url","ltable_problem_code","rtable_url","rtable_problem_code"]].to_csv('labelled_set.csv', index=False)