In [1]:
# Fuzzy matching

# 1. Fuzzy-match team names extracted by GPT against the actual team names
# 2. Fuzzy matching on manually curated chunks - to evaluate GPT vs. manual vs. fuzzy

import os
from fuzzywuzzy import fuzz
import pandas as pd
import nltk.data
from multiprocessing import Pool
import time
from pandas.core.common import flatten
from itertools import compress
#import justext

# English sentence tokenizer (punkt model) loaded through NLTK.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle') #English Language Tokenizer for identifying sentences

# Team metadata table; the matching loop below reads its 'Year', 'Status' and 'Team' columns.
meta = pd.read_csv("../data/raw/team_meta_db.tsv", sep = "\t")

# Batch ids are the CSV file names without their extension.
# Only "*.csv" entries are kept so that stray directory entries (checkpoint
# folders, OS metadata files, ...) do not turn into batch ids that fail to load.
# The ids are sorted as strings, i.e. "10" comes before "2".
# (alternative input folder used previously: all_batches_inter-team_fuzzy_select_freq/)
batches = sorted(
    os.path.splitext(entry)[0]
    for entry in os.listdir("../data/processed/text_batches_inter-team_filtered/")
    if entry.endswith(".csv")
)

#batches = ["1", "2"]

In [2]:
#2. Fuzzy matching for manually curated chunks
#Functions used previously

def advanced_search(team_name, wiki_text):
    """Return True when `team_name` appears to be mentioned in `wiki_text`.

    Exact (case-sensitive) substring matching is tried first, on the name as
    given and on variants where '_' or '-' is replaced by a space; the name
    'EPFL' additionally matches the alias 'EPF Lausanne'. If no variant is
    found, the decision is delegated to `fuzzy_search` on the original name.

    Non-string arguments (for example a missing value read from a table)
    cannot hold or be a mention, so they yield False instead of raising.
    """
    if not isinstance(team_name, str) or not isinstance(wiki_text, str):
        return False

    # Spelling variants: original name, '_' -> ' ', '-' -> ' '.
    candidates = [team_name] + [team_name.replace(sep, ' ') for sep in ('_', '-')]
    if team_name == 'EPFL':
        candidates.append('EPF Lausanne')

    if any(candidate in wiki_text for candidate in candidates):
        return True

    # No literal hit: fall back to approximate matching.
    return bool(fuzzy_search(team_name, wiki_text))

def fuzzy_search(team_name, wiki_text, r_cutoff = 90):
    """Approximate test for whether `team_name` occurs in `wiki_text`.

    When the text is shorter than the name, the two strings are compared as
    a whole with `fuzz.ratio` against a fixed cutoff of 80 (the `r_cutoff`
    argument is not used on this path). Otherwise `fuzz.partial_ratio` is
    used and compared against `r_cutoff`.

    Returns True only when the score is strictly greater than the cutoff.
    """
    text_is_shorter = len(wiki_text) < len(team_name)

    if text_is_shorter:
        score = fuzz.ratio(team_name, wiki_text)
        threshold = 80  # fixed cutoff for the whole-string comparison
    else:
        score = fuzz.partial_ratio(team_name, wiki_text)
        threshold = r_cutoff

    return score > threshold

def compute_collab(text, teams_search, source_team = None):
    """List the candidate teams that are mentioned in `text`.

    Parameters
    ----------
    text : text chunk to scan.
    teams_search : iterable of candidate team names.
    source_team : team the text belongs to; it is never reported as a
        target and is written to the 'Team' column of every row.

    Returns
    -------
    pandas.DataFrame with columns ['Team', 'Target'] holding one row per
    candidate for which `advanced_search` reports a match, in the order of
    `teams_search`. The frame is empty (same columns) when nothing matches.
    """
    # Gather the rows first and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    records = [
        {'Team': source_team, 'Target': candidate}
        for candidate in teams_search
        if candidate != source_team and advanced_search(candidate, text)
    ]

    return pd.DataFrame(records, columns=['Team', 'Target'])

In [4]:
# Run the fuzzy team-name search over every text chunk of every batch and
# gather the detected (source team, mentioned team) pairs into `df`.
df = pd.DataFrame()

chunk_matches = []   # one result frame per chunk with at least one match
teams_by_year = {}   # year -> accepted team names; identical for every chunk of that year

for batch_no in batches:

    print("Started batch " + batch_no)
    start = time.time()

    batch = pd.read_csv("../data/processed/text_batches_inter-team_filtered/" + batch_no + ".csv")

    for row_no in range(len(batch)):

        year = batch['year'][row_no]
        text = batch['text'][row_no]
        source_team = batch['team'][row_no]
        page = batch['page'][row_no]

        # Candidates are the accepted teams of the chunk's year; the lookup
        # is cached because filtering `meta` per chunk repeats the same work.
        if year not in teams_by_year:
            teams_by_year[year] = meta[(meta['Year'] == year) & (meta['Status'] == 'Accepted')]['Team'].unique()
        teams_search = teams_by_year[year]

        temp = compute_collab(text, teams_search, source_team)

        if len(temp) > 0:
            # Tag each match with where it was found.
            temp['chunk_id'] = row_no
            temp['year'] = year
            temp['batch_id'] = batch_no
            temp['page'] = page
            chunk_matches.append(temp)

    end = time.time()
    time_elapsed = (end-start)

    print("Completed batch " + batch_no + " in " + str(time_elapsed) + " seconds")

# Concatenate once at the end: DataFrame.append was removed in pandas 2.0 and
# re-copied the accumulated frame on every call. pd.concat rejects an empty
# list, hence the fallback to the empty frame.
if chunk_matches:
    df = pd.concat(chunk_matches)
        

Started batch 1
Completed batch 1 in 2.0833778381347656 seconds
Started batch 10
Completed batch 10 in 2.0765459537506104 seconds
Started batch 100
Completed batch 100 in 2.03383207321167 seconds
Started batch 1000
Completed batch 1000 in 2.0263092517852783 seconds
Started batch 1001
Completed batch 1001 in 1.9899468421936035 seconds
Started batch 1002
Completed batch 1002 in 2.137650489807129 seconds
Started batch 1003
Completed batch 1003 in 2.1582460403442383 seconds
Started batch 1004
Completed batch 1004 in 2.3337209224700928 seconds
Started batch 1005
Completed batch 1005 in 2.244807481765747 seconds
Started batch 1006
Completed batch 1006 in 2.2795281410217285 seconds
Started batch 1007
Completed batch 1007 in 2.3046071529388428 seconds
Started batch 1008
Completed batch 1008 in 2.343060255050659 seconds
Started batch 1009
Completed batch 1009 in 2.241925001144409 seconds
Started batch 101
Completed batch 101 in 2.326124668121338 seconds
Started batch 1010
Completed batch 1010 i

Completed batch 173 in 2.899829387664795 seconds
Started batch 174
Completed batch 174 in 2.8607099056243896 seconds
Started batch 175
Completed batch 175 in 2.7057855129241943 seconds
Started batch 176
Completed batch 176 in 2.275974750518799 seconds
Started batch 177
Completed batch 177 in 2.865668535232544 seconds
Started batch 178
Completed batch 178 in 2.828559398651123 seconds
Started batch 179
Completed batch 179 in 2.8941361904144287 seconds
Started batch 18
Completed batch 18 in 2.6230669021606445 seconds
Started batch 180
Completed batch 180 in 2.9009909629821777 seconds
Started batch 181
Completed batch 181 in 2.7310791015625 seconds
Started batch 182
Completed batch 182 in 2.453429698944092 seconds
Started batch 183
Completed batch 183 in 2.9562807083129883 seconds
Started batch 184
Completed batch 184 in 3.016552686691284 seconds
Started batch 185
Completed batch 185 in 2.6670525074005127 seconds
Started batch 186
Completed batch 186 in 2.7180938720703125 seconds
Started b

Completed batch 283 in 3.5710034370422363 seconds
Started batch 284
Completed batch 284 in 3.49432110786438 seconds
Started batch 285
Completed batch 285 in 2.947030782699585 seconds
Started batch 286
Completed batch 286 in 3.017251968383789 seconds
Started batch 287
Completed batch 287 in 3.321530818939209 seconds
Started batch 288
Completed batch 288 in 3.0406274795532227 seconds
Started batch 289
Completed batch 289 in 3.5711421966552734 seconds
Started batch 29
Completed batch 29 in 3.0422072410583496 seconds
Started batch 290
Completed batch 290 in 3.273061513900757 seconds
Started batch 291
Completed batch 291 in 2.6753854751586914 seconds
Started batch 292
Completed batch 292 in 2.2203967571258545 seconds
Started batch 293
Completed batch 293 in 2.311248302459717 seconds
Started batch 294
Completed batch 294 in 2.7923991680145264 seconds
Started batch 295
Completed batch 295 in 2.717250347137451 seconds
Started batch 296
Completed batch 296 in 2.2973692417144775 seconds
Started 

Completed batch 393 in 2.823662281036377 seconds
Started batch 394
Completed batch 394 in 2.612353563308716 seconds
Started batch 395
Completed batch 395 in 2.7484512329101562 seconds
Started batch 396
Completed batch 396 in 2.7482359409332275 seconds
Started batch 397
Completed batch 397 in 3.1540653705596924 seconds
Started batch 398
Completed batch 398 in 2.6560542583465576 seconds
Started batch 399
Completed batch 399 in 2.7968740463256836 seconds
Started batch 4
Completed batch 4 in 2.6532115936279297 seconds
Started batch 40
Completed batch 40 in 2.6422955989837646 seconds
Started batch 400
Completed batch 400 in 2.666914224624634 seconds
Started batch 401
Completed batch 401 in 2.7030577659606934 seconds
Started batch 402
Completed batch 402 in 2.736203670501709 seconds
Started batch 403
Completed batch 403 in 2.8187074661254883 seconds
Started batch 404
Completed batch 404 in 2.8442234992980957 seconds
Started batch 405
Completed batch 405 in 2.9996795654296875 seconds
Started 

Completed batch 502 in 2.608893871307373 seconds
Started batch 503
Completed batch 503 in 2.8305766582489014 seconds
Started batch 504
Completed batch 504 in 2.9067769050598145 seconds
Started batch 505
Completed batch 505 in 2.5477728843688965 seconds
Started batch 506
Completed batch 506 in 2.702648639678955 seconds
Started batch 507
Completed batch 507 in 2.8033671379089355 seconds
Started batch 508
Completed batch 508 in 2.654755115509033 seconds
Started batch 509
Completed batch 509 in 2.529837131500244 seconds
Started batch 51
Completed batch 51 in 2.6736390590667725 seconds
Started batch 510
Completed batch 510 in 2.5484378337860107 seconds
Started batch 511
Completed batch 511 in 2.7032573223114014 seconds
Started batch 512
Completed batch 512 in 2.9740383625030518 seconds
Started batch 513
Completed batch 513 in 2.697572708129883 seconds
Started batch 514
Completed batch 514 in 3.4461166858673096 seconds
Started batch 515
Completed batch 515 in 3.281355381011963 seconds
Starte

Completed batch 612 in 3.1264052391052246 seconds
Started batch 613
Completed batch 613 in 3.211729049682617 seconds
Started batch 614
Completed batch 614 in 2.8870606422424316 seconds
Started batch 615
Completed batch 615 in 2.590522527694702 seconds
Started batch 616
Completed batch 616 in 2.320068120956421 seconds
Started batch 617
Completed batch 617 in 2.2792937755584717 seconds
Started batch 618
Completed batch 618 in 2.264387845993042 seconds
Started batch 619
Completed batch 619 in 2.3017213344573975 seconds
Started batch 62
Completed batch 62 in 2.689849376678467 seconds
Started batch 620
Completed batch 620 in 2.735974073410034 seconds
Started batch 621
Completed batch 621 in 2.839179277420044 seconds
Started batch 622
Completed batch 622 in 2.751852512359619 seconds
Started batch 623
Completed batch 623 in 2.7091050148010254 seconds
Started batch 624
Completed batch 624 in 2.6341588497161865 seconds
Started batch 625
Completed batch 625 in 2.6766855716705322 seconds
Started 

Completed batch 722 in 2.6888976097106934 seconds
Started batch 723
Completed batch 723 in 4.1498048305511475 seconds
Started batch 724
Completed batch 724 in 2.972597599029541 seconds
Started batch 725
Completed batch 725 in 2.595592498779297 seconds
Started batch 726
Completed batch 726 in 2.638838529586792 seconds
Started batch 727
Completed batch 727 in 3.5384879112243652 seconds
Started batch 728
Completed batch 728 in 2.735353708267212 seconds
Started batch 729
Completed batch 729 in 2.8397481441497803 seconds
Started batch 73
Completed batch 73 in 3.565775156021118 seconds
Started batch 730
Completed batch 730 in 2.78139066696167 seconds
Started batch 731
Completed batch 731 in 2.5993008613586426 seconds
Started batch 732
Completed batch 732 in 2.928481340408325 seconds
Started batch 733
Completed batch 733 in 3.4677586555480957 seconds
Started batch 734
Completed batch 734 in 3.6487491130828857 seconds
Started batch 735
Completed batch 735 in 3.089668035507202 seconds
Started b

Completed batch 832 in 2.639953851699829 seconds
Started batch 833
Completed batch 833 in 2.633859157562256 seconds
Started batch 834
Completed batch 834 in 2.3845767974853516 seconds
Started batch 835
Completed batch 835 in 2.7295944690704346 seconds
Started batch 836
Completed batch 836 in 2.6172518730163574 seconds
Started batch 837
Completed batch 837 in 2.5019750595092773 seconds
Started batch 838
Completed batch 838 in 2.372670888900757 seconds
Started batch 839
Completed batch 839 in 3.0837838649749756 seconds
Started batch 84
Completed batch 84 in 3.2822513580322266 seconds
Started batch 840
Completed batch 840 in 3.182230234146118 seconds
Started batch 841
Completed batch 841 in 2.9753637313842773 seconds
Started batch 842
Completed batch 842 in 2.9061501026153564 seconds
Started batch 843
Completed batch 843 in 3.7112233638763428 seconds
Started batch 844
Completed batch 844 in 3.296844005584717 seconds
Started batch 845
Completed batch 845 in 3.3222594261169434 seconds
Start

Completed batch 942 in 3.4843575954437256 seconds
Started batch 943
Completed batch 943 in 3.2536416053771973 seconds
Started batch 944
Completed batch 944 in 3.0169765949249268 seconds
Started batch 945
Completed batch 945 in 3.128927707672119 seconds
Started batch 946
Completed batch 946 in 3.050800085067749 seconds
Started batch 947
Completed batch 947 in 2.9608421325683594 seconds
Started batch 948
Completed batch 948 in 2.862067222595215 seconds
Started batch 949
Completed batch 949 in 2.426088333129883 seconds
Started batch 95
Completed batch 95 in 2.7096688747406006 seconds
Started batch 950
Completed batch 950 in 2.6503801345825195 seconds
Started batch 951
Completed batch 951 in 2.846470594406128 seconds
Started batch 952
Completed batch 952 in 2.6599533557891846 seconds
Started batch 953
Completed batch 953 in 2.779742956161499 seconds
Started batch 954
Completed batch 954 in 2.6152915954589844 seconds
Started batch 955
Completed batch 955 in 2.616319417953491 seconds
Started

In [5]:
# Write all detected pairs to disk. The output folder is created first so
# that a missing directory does not make the write fail.
output_path = "../data/processed/fuzzy_curated/inter-team_all_batches_fuzzy_curation.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)