In [1]:
import pandas as pd
import os
from transformers import GPT2TokenizerFast
import tiktoken
import json

# Encoding that tiktoken maps to the "gpt-4" model name; count_token and
# process_chunks both read this module-level object.
tokenizer = tiktoken.encoding_for_model("gpt-4")

from langchain.text_splitter import RecursiveCharacterTextSplitter

In [2]:
def count_token(text):
    """Return how many tokens `text` encodes to with the module-level `tokenizer`."""
    encoded = tokenizer.encode(text)
    return len(encoded)

#file = open("../data/raw/full_text/2018/Aachen/Team:Aachen#Attributions_-_-_TEXT.html")
#text = file.read().rstrip()

# Splitter whose chunk lengths are measured in tokens (via count_token) rather
# than characters. It tries the separators in the order given.
# NOTE(review): chunk_size here is 10000 while the loop cell defines
# max_chunk_size = 500 -- confirm which limit is intended.
splitter = RecursiveCharacterTextSplitter(
    separators=['\n\n', '\n', "."],
    chunk_size=10000,
    chunk_overlap=30,
    length_function=count_token,
)

In [3]:
def process_chunks(text, team, year, page, chunk_size = 500, overlap_size = 30):
    """Split `text` into overlapping token windows and return them as a DataFrame.

    The text is encoded with the module-level `tokenizer`, sliced into windows,
    and each window is decoded back to a string.

    Window layout: a window starts every `chunk_size` tokens and is extended
    backwards by `overlap_size` tokens, so a window holds up to
    `chunk_size + overlap_size` tokens and consecutive windows share
    `overlap_size` tokens.

    Parameters
    ----------
    text : str
        Text to split.
    team, year, page : scalar
        Values copied into the "team", "year" and "page" columns of every row.
    chunk_size : int, default 500
        Stride between window starts, in tokens. Must be positive.
    overlap_size : int, default 30
        Number of tokens shared between consecutive windows. Must be >= 0.

    Returns
    -------
    pandas.DataFrame
        Columns "team", "year", "page", "text"; one row per window. Empty when
        `text` encodes to zero tokens.

    Raises
    ------
    ValueError
        If `chunk_size` is not positive or `overlap_size` is negative.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if overlap_size < 0:
        raise ValueError("overlap_size must be non-negative")

    text_encoded = tokenizer.encode(text)
    token_count = len(text_encoded)

    windows = [
        text_encoded[start - overlap_size : start + chunk_size]
        for start in range(overlap_size, token_count, chunk_size)
    ]

    # The range above is empty whenever the text has `overlap_size` tokens or
    # fewer, which used to drop short texts entirely; keep them as one chunk.
    if not windows and token_count > 0:
        windows = [text_encoded]

    decoded = [tokenizer.decode(window) for window in windows]

    return pd.DataFrame({"team": team, "year": year, "page": page, "text": decoded})

In [4]:
terms_to_remove = ['css', "js", "javascript", "react", 'plotly', 'jquery']

# Substrings a page filename must contain (any one of them) to be processed.
# Each entry needs its own trailing comma: adjacent string literals are silently
# concatenated, which previously merged "member" and "Member" into the useless
# term "memberMember".
page_names_to_include = [
    "Attribution", "attribution", "#Team", "#team",
    "Acknowledgement", "acknowledgement", "member", "Member",
    "Mentor", "Contribution", "contribution", "People",
    "About Us", "Members", "Acknowledgements",
]

# Final filter: every listed spelling plus its lower-cased form, de-duplicated.
page_names_to_include = set(page_names_to_include) | {
    name.lower() for name in page_names_to_include
}

In [16]:
# Counts files that os.listdir reported but that no longer exist when opened.
count = 0
# Not used by the loop below (the splitter carries its own chunk_size); kept
# because the disabled batching code further down refers to it.
max_chunk_size = 500

for year in range(2008, 2019):

    year_dir = os.path.join("../data/raw/full_text", str(year))

    # Collect plain tuples and build the DataFrame once per year:
    # DataFrame.append was removed in pandas 2.0, and growing a frame row by
    # row is quadratic in the number of chunks.
    rows = []

    for team in os.listdir(year_dir):

        #Have to use all pages ~45% of teams don't list a "Collaboration" page

        team_dir = os.path.join(year_dir, team)

        # Keep only extracted-text files whose name contains one of the
        # selected page-name terms.
        text_files = [
            name for name in os.listdir(team_dir)
            if 'TEXT' in name and any(term in name for term in page_names_to_include)
        ]

        for text_file in text_files:

            path = os.path.join(team_dir, text_file)

            if not os.path.exists(path):
                count = count + 1
                continue

            # Context manager so the handle is closed even if read() fails.
            # NOTE(review): the file is read with the platform default
            # encoding, as before -- confirm the raw files match it.
            with open(path) as file:
                txt = file.read().rstrip()

            rows.extend(
                (team, year, text_file, item) for item in splitter.split_text(txt)
            )

    # Explicit columns mean a year with no matching pages still gets a header
    # row, so the CSV stays readable with pd.read_csv.
    pd_chunks = pd.DataFrame(rows, columns=["team", "year", "page", "chunk"])

    pd_chunks.to_csv("../data/processed/text_batches_intra-team/" + str(year) + ".csv", index=False)
    print("Completed Year:" + str(year))

Completed Year:2008
Completed Year:2009
Completed Year:2010
Completed Year:2011
Completed Year:2012
Completed Year:2013
Completed Year:2014
Completed Year:2015
Completed Year:2016
Completed Year:2017
Completed Year:2018


In [17]:
# Sum, over the yearly chunk CSVs for 2008-2018, the number of distinct values
# in each file's "team" column.
count = 0
for year in range(2008, 2019):
    csv_path = "../data/processed/text_batches_intra-team/" + str(year) + ".csv"
    t = pd.read_csv(csv_path)
    # dropna=False keeps the tally identical to len(unique()), which counts NaN.
    count += t['team'].nunique(dropna=False)

In [18]:
"""
batch_size = 50000
batch_capacity = int(batch_size/max_chunk_size)

df = pd_chunks #.sample(frac = 1)

for var in range(1,len(pd_chunks), batch_capacity):
    
    temp = df[var:var+batch_capacity]
    temp.to_csv("../data/processed/batches_intra-team_select/" + str(int(var/batch_capacity) + 1) + ".csv", index = False)
"""

'\nbatch_size = 50000\nbatch_capacity = int(batch_size/max_chunk_size)\n\ndf = pd_chunks #.sample(frac = 1)\n\nfor var in range(1,len(pd_chunks), batch_capacity):\n    \n    temp = df[var:var+batch_capacity]\n    temp.to_csv("../data/processed/batches_intra-team_select/" + str(int(var/batch_capacity) + 1) + ".csv", index = False)\n'

In [19]:
#pd_chunks.to_csv("../data/processed/text_08_18_fuzzy_select_freq.csv", index=False)
# Display `count`; its most recent assignment above adds up the number of
# distinct teams found in each yearly CSV.
count

2173