In [1]:
# Fuzzy matching

# 1. Fuzzy-match team names extracted by GPT to the actual team names.
# 2. Fuzzy-match manually curated chunks, to evaluate GPT vs. manual vs. fuzzy matching.

import os
from fuzzywuzzy import fuzz
import pandas as pd
import nltk.data
from multiprocessing import Pool
import time
from pandas.core.common import flatten
from itertools import compress
#import justext

# English sentence tokenizer (NLTK punkt model).
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Team metadata table (tab-separated).
meta = pd.read_csv("../data/raw/team_meta_db.tsv", sep = "\t")

# Batch identifiers = names of the CSV files in the batch directory, without
# the ".csv" extension. Only real "*.csv" entries are kept so that stray
# directory entries (e.g. ".ipynb_checkpoints") do not turn into bogus batch
# names that fail later when the batch file is read. The extension is stripped
# from the end of the name only, not from anywhere inside it.
# Alternative input directory left commented out in the original:
#   all_batches_inter-team_fuzzy_select_freq/
# NOTE: the sort is lexicographic on strings ("1", "10", "100", ...), not numeric.
batches = sorted(
    os.path.splitext(file_name)[0]
    for file_name in os.listdir("../data/processed/text_batches_inter-team_filtered/")
    if file_name.endswith(".csv")
)

#batches = ["1", "2"]

In [2]:
#2. Fuzzy matching for manually curated chunks
#Functions used previously

def advanced_search(team_name, wiki_text):
    """Report whether `team_name` is mentioned in `wiki_text`.

    Literal substring matching is tried first, on the name itself and on two
    variants in which '_' or '-' is replaced by a space. The team 'EPFL' is
    additionally searched for under the alias 'EPF Lausanne'. If none of these
    occurs in the text, the decision is delegated to `fuzzy_search`.

    Returns True on a match, False otherwise.
    """
    candidates = [team_name] + [team_name.replace(sep, ' ') for sep in ('_', '-')]
    if team_name == 'EPFL':
        candidates.append('EPF Lausanne')

    if any(candidate in wiki_text for candidate in candidates):
        return True

    # No literal hit: fall back to approximate matching on the original name.
    return bool(fuzzy_search(team_name, wiki_text))

def fuzzy_search(team_name, wiki_text, r_cutoff = 90):
    """Approximate test for `team_name` appearing in `wiki_text`.

    When the text is shorter than the name, the two strings are scored with
    `fuzz.ratio` and the threshold is fixed at 80; the `r_cutoff` argument is
    ignored on this path. Otherwise the score comes from `fuzz.partial_ratio`
    and is compared against `r_cutoff`.

    Returns True only if the score is strictly greater than the threshold.
    """
    text_is_shorter = len(wiki_text) < len(team_name)
    if text_is_shorter:
        # Short-text path: the caller-supplied cutoff is replaced by 80.
        score, threshold = fuzz.ratio(team_name, wiki_text), 80
    else:
        score, threshold = fuzz.partial_ratio(team_name, wiki_text), r_cutoff

    return score > threshold

def compute_collab(text, teams_search, source_team = None):
    """Find which candidate teams are mentioned in a piece of text.

    Parameters
    ----------
    text : str
        Text to search.
    teams_search : iterable of str
        Candidate team names to look for.
    source_team : str, optional
        Team the text belongs to. It is skipped as a candidate and recorded in
        the 'Team' column of every returned row.

    Returns
    -------
    pandas.DataFrame
        Columns 'Team' (always `source_team`) and 'Target' (a matched
        candidate), one row per match in the order of `teams_search`. The
        frame is empty, with the same two columns, when nothing matches.
    """
    # Collect plain records and build the frame once: DataFrame.append was
    # removed in pandas 2.0, and growing a frame row by row is quadratic.
    records = [
        {'Team': source_team, 'Target': candidate}
        for candidate in teams_search
        if not source_team == candidate and advanced_search(candidate, text)
    ]
    return pd.DataFrame(records, columns=['Team', 'Target'])

In [3]:
# Run the fuzzy team search over every text chunk of every batch and gather the
# matches into `df` (one row per source-team / target-team hit, tagged with
# chunk_id, year, batch_id and page).
#
# Per-chunk result frames are collected in a list and concatenated once:
# DataFrame.append was removed in pandas 2.0, and repeatedly appending to a
# growing frame copies it each time.
frames = []

# year -> array of accepted team names; the lookup depends only on the year,
# so it is computed once per distinct year instead of once per row.
teams_by_year = {}

try:
    for batch_no in batches:

        print("Started batch " + batch_no)
        start = time.time()

        batch = pd.read_csv("../data/processed/text_batches_inter-team_filtered/" + batch_no + ".csv")

        row_values = zip(batch['year'], batch['text'], batch['team'], batch['page'])
        for row_no, (year, text, source_team, page) in enumerate(row_values):

            if year not in teams_by_year:
                teams_by_year[year] = meta[(meta['Year'] == year) & (meta['Status'] == 'Accepted')]['Team'].unique()

            temp = compute_collab(text, teams_by_year[year], source_team)

            if len(temp) > 0:
                temp['chunk_id'] = row_no
                temp['year'] = year
                temp['batch_id'] = batch_no
                temp['page'] = page
                frames.append(temp)

        time_elapsed = time.time() - start

        print("Completed batch " + batch_no + " in " + str(time_elapsed) + " seconds")
finally:
    # Build `df` even if the loop is interrupted (e.g. KeyboardInterrupt), so
    # the matches gathered so far remain available, as they were when `df` was
    # grown incrementally.
    df = pd.concat(frames) if frames else pd.DataFrame()
        

Started batch 1
Completed batch 1 in 2.066100835800171 seconds
Started batch 10
Completed batch 10 in 2.0395123958587646 seconds
Started batch 100
Completed batch 100 in 1.9664368629455566 seconds
Started batch 1000
Completed batch 1000 in 2.3852691650390625 seconds
Started batch 1001
Completed batch 1001 in 2.4787824153900146 seconds
Started batch 1002
Completed batch 1002 in 2.3541195392608643 seconds
Started batch 1003
Completed batch 1003 in 2.5510053634643555 seconds
Started batch 1004
Completed batch 1004 in 2.7026267051696777 seconds
Started batch 1005
Completed batch 1005 in 2.723708391189575 seconds
Started batch 1006
Completed batch 1006 in 2.884155035018921 seconds
Started batch 1007
Completed batch 1007 in 3.125232458114624 seconds
Started batch 1008
Completed batch 1008 in 2.924687385559082 seconds
Started batch 1009


KeyboardInterrupt: 

In [4]:
# Write the collected matches to disk. DataFrame.to_csv does not create missing
# parent directories, so make sure the output directory exists first.
out_path = "../data/processed/fuzzy_curated/inter-team_all_batches_fuzzy_curation.csv"
os.makedirs(os.path.dirname(out_path), exist_ok=True)
df.to_csv(out_path, index=False)