Converts per-user score data from a SQLite database into a Parquet file of `beatmap_id-mods` token lists.

In [2]:
import pandas as pd
import sqlite3
import sys
sys.path.insert(0, "../") # Required for database in different directory. 


In [3]:
# Run configuration. Both values are interpolated into the source table name
# (`{top_or_recent}_scores_{mode}`) and into the output parquet file name.
top_or_recent, mode = 'top', 'std'

class ScoreGen:
    """Iterate over per-user score "sentences" read from the SQLite score database.

    Each yielded sentence is the list of one user's scores, newest first, with
    every score rendered as the token ``"<beatmap_id>-<mods>"``. Before a score
    is rendered its mods bitmask is normalised: the bits in ``mods_removed`` are
    cleared and the NC bit is replaced by the DT bit.

    The database connection is opened in ``__init__`` and closed when iteration
    ends, so an instance can be iterated only once.
    """

    # Mod bit flags (values follow the osu! API mods enumeration).
    NF = 1
    HD = 8  # Removed only when keepHD is False
    SD = 32
    DT = 64
    NC = 512
    SO = 4096
    PF = 16384
    SV2 = 536870912

    def __init__(self, top_or_recent='top', mode='std', keepHD=True):
        """Open the score database and choose which mod bits get stripped.

        Args:
            top_or_recent: Prefix of the score table to read
                (``{top_or_recent}_scores_{mode}``).
            mode: Suffix of the score table to read.
            keepHD: When true the HD bit is kept in the tokens; when false it
                is stripped together with the other removed mods.
        """
        self.conn = sqlite3.connect('../data/UserScores.db')
        self.cursor = self.conn.cursor()
        self.top_or_recent = top_or_recent
        self.mode = mode

        self.mods_removed = self.NF | self.SD | self.SO | self.PF | self.SV2
        if not keepHD:
            self.mods_removed |= self.HD

    def replace_nc_with_dt(self, mods):
        """Return ``mods`` with the NC bit cleared and the DT bit set.

        Bitmasks without the NC bit are returned unchanged.
        """
        if mods & self.NC:
            mods = (mods & ~self.NC) | self.DT
        return mods

    def __iter__(self):
        """Yield one list of ``"<beatmap_id>-<mods>"`` tokens per distinct user.

        Scores inside a list are ordered by ``created_at`` descending. The
        connection is closed once iteration finishes, including when the
        generator is closed early or an error is raised.
        """
        # Table names cannot be bound as SQL parameters, so the name is
        # interpolated; only the user id is passed as a bound parameter.
        table = f"{self.top_or_recent}_scores_{self.mode}"
        try:
            user_rows = self.cursor.execute(
                f"SELECT DISTINCT user_id FROM {table}"
            ).fetchall()
            for (user_id,) in user_rows:
                scores = self.cursor.execute(
                    f"SELECT beatmap_id, mods FROM {table} "
                    "WHERE user_id = ? ORDER BY created_at DESC",
                    (user_id,),
                ).fetchall()

                yield [
                    f"{bm_id}-{self.replace_nc_with_dt(mods & ~self.mods_removed)}"
                    for bm_id, mods in scores
                ]
        finally:
            self.conn.close()

# Read every user's score list from the configured table and hold the
# resulting sentences in memory.
gen = ScoreGen(top_or_recent=top_or_recent, mode=mode)
sentences = [*gen]

In [4]:
# Store the sentences in a one-column frame and write it next to the notebook
# as a parquet file named after the run configuration.
output_path = f'{top_or_recent}_sentences_{mode}.parquet'
df = pd.DataFrame({'sentences': sentences})
df.to_parquet(output_path, engine='pyarrow')
