In [None]:
import os
import regex as re
import csv
import pandas as pd

from pathlib import Path
from glob import glob
from tqdm import tqdm

# Root folder that holds one sub-folder per scraped channel.
path = r"PATH/TO/DOWNLOADED/EMOTES/CHANNEL/FOLDERS"

# All channel folders containing their emotes, with file names matching emote code (ex: "PATH\TO\PROJECT\Scrapes\HasanAbi\twitch\hasL.png")
# Only directories are kept (a stray file at the top level would otherwise become a bogus
# "channel" column), and the list is sorted so the column order of the registry is reproducible.
dirs = sorted(entry for entry in glob(path + "/*") if os.path.isdir(entry))

print('Folders found:', len(dirs))

# Register Emotes

In [4]:
def _collect_channel_emotes(folder, source, channel):
    """Return a one-column frame (named after the channel) of emote codes found in folder/source.

    The emote code is the file name without its extension. The frame has zero rows
    when the sub-folder is missing or empty.
    """
    files = sorted(glob(os.path.join(folder, source, "*")))
    # Build the column in one go: DataFrame.append was removed in pandas 2.0 and
    # growing a frame row by row is quadratic.
    return pd.DataFrame({channel: [Path(f).stem for f in files]})


# Create Twitch + BTTV emote registry (FFZ emotes were documented after the scrape)
twitch_frames = []
bttv_frames = []
for folder in dirs:
    channel = os.path.basename(folder)

    # Twitch emotes: every channel gets a column, even when it has no emotes
    twitch_frames.append(_collect_channel_emotes(folder, "twitch", channel))

    # BTTV emotes: only channels that actually have BTTV emotes get a column
    ch_bttv = _collect_channel_emotes(folder, "bttv", channel)
    if len(ch_bttv) > 0:
        bttv_frames.append(ch_bttv)

# One column per channel; shorter columns are padded with NaN by the outer join on the row index.
# pd.concat raises on an empty list, so fall back to an empty frame.
emotes = pd.concat(twitch_frames, axis=1) if twitch_frames else pd.DataFrame()            # Twitch emotes
bttv_emotes = pd.concat(bttv_frames, axis=1) if bttv_frames else pd.DataFrame()          # BTTV emotes

In [5]:
# These channels in the corpus have emotes whose names are problematic as file names, such as multiple emotes with the same spelling but different capitalization.
# The EmoteDownloader module may need to be adjusted to handle these cases if your corpus has the same issue. For this project, the module was altered to prepend a '!' to the file name in these cases.
# This cell removes the '!' from the registered emote names while preserving their capitalization.

# Channels whose emote file names were written with a '!' marker by the downloader.
annoying_emotes = ['1XFRAZX1','Yogscast','Zemie','ZiggyDLive','zwebackhd']

# Only touch channels that are actually present in this corpus; indexing with a
# missing column label would raise a KeyError.
present_annoying = [channel for channel in annoying_emotes if channel in emotes.columns]
if present_annoying:
    # Strip the '!' marker from the emote codes; the rest of the code (including case) is kept.
    emotes[present_annoying] = emotes[present_annoying].replace({'!':''}, regex=True)

List of global emotes from each source

In [6]:
# Twitch global emote codes, one per line, kept as a raw string so the
# emoticon entries at the end (e.g. `:)`, `<3`) need no escaping.
# A later cell turns this string into a list with .splitlines().
twitch_global_emotes = r"""NewRecord
Awwdible
Lechonk
Getcamped
SUBprise
FallHalp
FallCry
FallWinning
MechaRobot
ImTyping
Shush
MyAvatar
PizzaTime
LaundryBasket
ModLove
PotFriend
Jebasted
PogBones
PoroSad
KEKHeim
CaitlynS
HarleyWink
WhySoSerious
DarkKnight
FamilyMan
RyuChamp
HungryPaimon
TransgenderPride
PansexualPride
NonbinaryPride
LesbianPride
IntersexPride
GenderFluidPride
GayPride
BisexualPride
AsexualPride
PogChamp
GlitchNRG
GlitchLit
StinkyGlitch
GlitchCat
FootGoal
FootYellow
FootBall
BlackLivesMatter
ExtraLife
VirtualHug
BOP
SingsNote
SingsMic
TwitchSings
SoonerLater
HolidayTree
HolidaySanta
HolidayPresent
HolidayLog
HolidayCookie
GunRun
PixelBob
FBPenalty
FBChallenge
FBCatch
FBBlock
FBSpiral
FBPass
FBRun
MaxLOL
TwitchRPG
MercyWing1
PinkMercy
MercyWing2
PartyHat
EarthDay
TombRaid
PopCorn
FBtouchdown
TPFufun
TwitchVotes
DarkMode
HSWP
HSCheers
PowerUpL
PowerUpR
LUL
EntropyWins
TPcrunchyroll
TwitchUnity
Squid1
Squid2
Squid3
Squid4
CrreamAwk
CarlSmile
TwitchLit
TehePelo
TearGlove
SabaPing
PunOko
KonCha
Kappu
InuyoFace
BigPhish
BegWan
ThankEgg
MorphinTime
TheIlluminati
TBAngel
MVGame
NinjaGrumpy
PartyTime
RlyTho
UWot
YouDontSay
KAPOW
ItsBoshyTime
CoolStoryBob
TriHard
SuperVinlin
FreakinStinkin
Poooound
CurseLit
BatChest
BrainSlug
PrimeMe
StrawBeary
RaccAttack
UncleNox
WTRuck
TooSpicy
Jebaited
DogFace
BlargNaut
TakeNRG
GivePLZ
imGlitch
pastaThat
copyThis
UnSane
DatSheffy
TheTarFu
PicoMause
TinyFace
DxCat
RuleFive
VoteNay
VoteYea
PJSugar
DoritosChip
OpieOP
FutureMan
ChefFrank
StinkyCheese
NomNom
SmoocherZ
cmonBruh
KappaWealth
MikeHogu
VoHiYo
KomodoHype
SeriousSloth
OSFrog
OhMyDog
KappaClaus
KappaRoss
MingLee
SeemsGood
twitchRaid
bleedPurple
duDudu
riPepperonis
NotLikeThis
DendiFace
CoolCat
KappaPride
ShadyLulu
ArgieB8
CorgiDerp
PraiseIt
TTours
mcaT
NotATK
HeyGuys
Mau5
PRChase
WutFace
BuddhaBar
PermaSmug
panicBasket
BabyRage
HassaanChop
TheThing
EleGiggle
RitzMitz
YouWHY
PipeHype
BrokeBack
ANELE
PanicVis
GrammarKing
PeoplesChamp
SoBayed
BigBrother
Keepo
Kippa
RalpherZ
TF2John
ThunBeast
WholeWheat
DAESuppy
FailFish
HotPokket
4Head
ResidentSleeper
FUNgineer
PMSTwin
ShazBotstix
BibleThump
AsianGlow
DBstyle
BloodTrail
OneHand
FrankerZ
SMOrc
ArsonNoSexy
PunchTrees
SSSsss
Kreygasm
KevinTurtle
PJSalt
SwiftRage
DansGame
GingerPower
BCWarrior
MrDestructoid
JonCarnage
Kappa
RedCoat
TheRinger
StoneLightning
OptimizePrime
JKanStyle
R)
;P
:P
;)
:/
<3
:O
B)
O_o
:|
>(
:D
:(
:)"""

In [7]:
# BetterTTV (BTTV) global emote codes, one per line, as a raw string.
# A later cell turns this string into a list with .splitlines().
bttv_global_emotes = r""":tf:
CiGrip
DatSauce
ForeverAlone
GabeN
HailHelix
ShoopDaWhoop
M&Mjc
bttvNice
TwaT
WatChuSay
tehPoleCat
AngelThump
TaxiBro
BroBalt
CandianRage
D:
VisLaud
KaRappa
FishMoley
Hhhehehe
KKona
PoleDoge
sosGame
CruW
RarePepe
haHAA
FeelsBirthdayMan
RonSmug
KappaCool
FeelsBadMan
bUrself
ConcernDoge
FeelsGoodMan
FireSpeed
NaM
SourPls
FeelsSnowMan
FeelsSnowyMan
LuL
SoSnowy
SaltyCorn
monkaS
VapeNation
ariW
notsquishY
FeelsAmazingMan
DuckerZ
IceCold
SqShy
Wowee
WubTF
cvR
cvL
cvHazmat
cvMask
DogChamp"""

In [8]:
# FrankerFaceZ (FFZ) global emote codes, one per line, as a raw string.
# The string ends with a newline (closing quotes are on their own line);
# str.splitlines() does not produce an empty trailing entry for it.
ffz_global_emotes = r"""ZrehplaR
YooHoo
ManChicken
BeanieHipster
CatBag
ZreknarF
LilZ
ZliL
LaterSooner
BORT
OBOY
OiMinna
AndKnuckles
"""

In [9]:
# 7TV global emote codes, one per line, as a raw string.
# The string ends with a newline (closing quotes are on their own line);
# str.splitlines() does not produce an empty trailing entry for it.
tv7_global_emotes = r"""reckH
Stare
RainTime
PETPET
SteerR
PartyParrot
ApuApustaja
Gayge
YEAHBUT7TV
PepePls
BillyApprove
WAYTOODANK
peepoHappy
peepoSad
nymnCorn
GuitarTime
CrayonTime
Clap
Clap2
PianoTime
knaDyppaHopeep
RoxyPotato
AlienDance
AYAYA
TeaTime
BasedGod
RebeccaBlack
FeelsDankMan
FeelsOkayMan
WineTime
forsenPls
gachiGASM
FeelsStrongMan
RareParrot
EZ
FeelsWeirdMan
gachiBASS
ppL
(7TV)
"""

In [10]:
def _as_emote_list(codes):
    """Return emote codes as a list.

    Accepts either the raw newline-separated string or an already-split list, so
    re-running this cell does not fail with "'list' object has no attribute 'splitlines'".
    """
    return codes.splitlines() if isinstance(codes, str) else list(codes)


# Convert each newline-separated block of global emote codes into a list of codes.
twitch_global_emotes = _as_emote_list(twitch_global_emotes)
bttv_global_emotes = _as_emote_list(bttv_global_emotes)
ffz_global_emotes = _as_emote_list(ffz_global_emotes)
tv7_global_emotes = _as_emote_list(tv7_global_emotes)

In [12]:
# Global (non-channel) columns that are placed in front of the per-channel columns.
# 'TEXT' is intentionally empty (all NaN): it never matches a word and, being the
# first column, acts as the default label for words that are not emotes.
# Building the frame from a dict of Series takes the union of their indexes, so no
# list is truncated to the length of another one.
df = pd.DataFrame({
    'GLOBAL_TWITCH': pd.Series(twitch_global_emotes, dtype=object),
    'GLOBAL_FFZ': pd.Series(ffz_global_emotes, dtype=object),
    'GLOBAL_7TV': pd.Series(tv7_global_emotes, dtype=object),
}).reindex(columns=['TEXT', 'GLOBAL_TWITCH', 'GLOBAL_FFZ', 'GLOBAL_7TV'])

bttv_global = pd.DataFrame({'GLOBAL_BTTV': pd.Series(bttv_global_emotes, dtype=object)})

# Drop any global columns left over from a previous run of this cell before
# prepending them again, so re-running does not create duplicate columns.
emotes = pd.concat([df, emotes.drop(columns=df.columns, errors='ignore')], axis=1)
bttv_emotes = pd.concat([bttv_global, bttv_emotes.drop(columns=bttv_global.columns, errors='ignore')], axis=1)

In [18]:
import pickle

# Persist both registries to the working directory: the Twitch/global registry
# goes to 'emote_dict' and the BTTV registry to 'bttv_dict'.
for pickle_name, registry in (('emote_dict', emotes), ('bttv_dict', bttv_emotes)):
    with open(pickle_name, 'wb') as f:
        pickle.dump(registry, f)

# Label Metadata

In [9]:
metadata_files = [r"PATH\TO\MODELS\MODEL_NAME\!metadata.tsv"] # List of metadata files to label
metadata_all = []

# Read in metadata
for metadata_path in metadata_files:
    metadata = pd.read_csv(
        metadata_path,
        sep='\t',
        header=None,
        quoting=csv.QUOTE_NONE,   # tokens may contain a bare " character
        dtype={0: str},           # column 0 holds the token; keep it as text
        keep_default_na=False,    # otherwise tokens like "null", "nan" or "NA" are read as NaN
    )
    # Every word starts out labeled as plain text; the labeling cell overwrites this.
    metadata["source"] = "TEXT"
    metadata_all.append(metadata)

In [None]:
# Label metadata based on Emote source
# Build a case-folded lookup {emote code -> column label} once instead of comparing
# the whole registry against every word. Columns are visited left to right and
# setdefault keeps the first hit, so the left-most column containing an emote wins.
# Both sides are lowercased: lowercasing only the word (as before) made every emote
# containing an uppercase letter impossible to match.
source_by_emote = {}
for column, codes in emotes.items():
    for code in codes.dropna():
        source_by_emote.setdefault(str(code).lower(), column)

for metadata in metadata_all:
    # Assign the whole column at once; writing to a row yielded by iterrows() may
    # only modify a copy and does not address the "source" column by name.
    # Words that match no emote keep the default label 'TEXT'.
    metadata["source"] = [
        source_by_emote.get(str(word).lower(), "TEXT")
        for word in tqdm(metadata[0])
    ]

In [None]:
# Save manually line-by-line because pd.to_csv() fails on a single " token
# Labels that are written unchanged; any other label is a channel name and is
# collapsed to 'TWITCH_CHANNEL'.
kept_sources = {'TEXT', 'GLOBAL_TWITCH', 'GLOBAL_BTTV', 'GLOBAL_FFZ', 'GLOBAL_7TV'}

for save_file, metadata in zip(metadata_files, metadata_all):
    # Write next to the input file, prefixing its name with '!label_'.
    # pathlib replaces the manual backslash splitting (which contained the invalid
    # escape '\!' and pointed at the drive root when the path had no folder part).
    metadata_path = Path(save_file)
    label_path = metadata_path.with_name('!label_' + metadata_path.name)
    with open(label_path, 'w', encoding='utf-8-sig') as f:
        f.write('word'+'\t'+'source'+'\n')
        for word, source in tqdm(zip(metadata[0], metadata['source']), total=len(metadata)):
            label = source if source in kept_sources else 'TWITCH_CHANNEL'
            f.write(str(word)+'\t'+label+'\n')