In [None]:
import os
import re
import numpy as np
import pandas as pd

import xgboost as xgb

from sklearn.metrics import classification_report, f1_score

# Directory that receives the per-language submission files (pred_<lang>.tsv).
SUB_DIR = '/kaggle/working/submission'
# Directory that receives per-language prediction tables and score reports.
OUTPUT_DIR = '/kaggle/working/ensemble_outputs'

# exist_ok=True keeps this cell safe to re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(SUB_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Dataset splits, in the order they are processed throughout the notebook.
SPLITS = ['train','dev','test']

In [None]:
import regex
import re
from collections import Counter
sample = 'رنكة أقسم بالله 😂😂😂تبهليل ما بعد منتصف الليل'

def emoji_counter(text):
    """Count the emoji characters that occur in ``text``.

    Only code points in the range U+1F600..U+1F650 are counted; every other
    character (including emoji outside that range) is ignored.

    Args:
        text: the string to scan.

    Returns:
        A ``collections.Counter`` mapping each matched character to the number
        of times it appears in ``text`` (empty when nothing matches).
    """
    # The pattern is a plain code-point class, which the standard-library `re`
    # module handles, so the third-party `regex` package is not needed here.
    # NOTE(review): U+1F650 lies just past the Unicode "Emoticons" block
    # (U+1F600..U+1F64F); the bound is kept unchanged to preserve the features.
    return Counter(re.findall(r'[\U0001f600-\U0001f650]', text))

emoji_counter(sample)

In [None]:
def extract_emoji_stats(tb,alpha=0.01):
    """Build per-split emoji count features.

    The emoji vocabulary is chosen from the training split only: an emoji is
    kept when its total number of occurrences in ``tb['train']`` divided by the
    number of training texts is greater than ``alpha``.

    Args:
        tb: mapping from split name to a list of text strings. It must contain
            an entry for every name in ``SPLITS``.
        alpha: frequency threshold (occurrences per training text) an emoji
            must exceed to become a feature column.

    Returns:
        A dict mapping each split name in ``SPLITS`` to a DataFrame with one
        row per text and one column per kept emoji, holding that emoji's count
        in the text.
    """
    train_texts = tb['train']
    # Use the argument here, not the notebook-global `text_body`; otherwise
    # the function ignores its input (or fails when the global is undefined).
    train_counts = emoji_counter(' '.join(train_texts))
    vocab = [emo for emo, total in train_counts.items()
             if total / len(train_texts) > alpha]
    # Set for O(1) membership tests; `vocab` itself keeps the column order.
    vocab_lookup = set(vocab)

    emodf = {}
    for split in SPLITS:
        rows = []
        for s in tb[split]:
            emo_count = dict.fromkeys(vocab, 0)
            for emo, count in emoji_counter(s).items():
                if emo in vocab_lookup:
                    emo_count[emo] = count
            rows.append(emo_count)
        emodf[split] = pd.DataFrame(rows)
    return emodf

In [None]:
# Model-output directory name (as found under BASE_DIR) -> short prefix used
# for that model's feature columns.
dp_map = dict(
    afriberta="abrt",
    afroxlmr="axlm",
    logistic_regression="logr",
    naijasenti="naij",
    random_forest='rndf',
    svm="_svm",
)

# Sentiment label string -> integer class id.
label_map = {name: idx for idx, name in enumerate(("negative", "neutral", "positive"))}

# Root directory holding one sub-directory of output files per base model.
BASE_DIR = "../input/afrisentiensemble/full_outputs/"

# Language codes to process; shorten this list (e.g. to a single code) for a
# quick debugging run.
langs = "am dz ha ig ma pcm pt sw yo twi kr ts multilingual".split()
# Inverse of `label_map` (integer class id -> label string), used when
# writing predictions back out as text labels.
id_to_label = {idx: name for name, idx in label_map.items()}

# For every language: stack the base models' class scores and the emoji counts
# into one feature table per split, fit an XGBoost classifier on the train
# split, then write per-split predictions, train/dev score reports, and the
# test-split submission file.
for LANG in langs:
    print(LANG)

    # ---- 1. Collect every base model's class scores for this language ----
    text_body = {}    # split -> list of tweet texts (from the first file seen)
    ensemble_df = {}  # split -> {"<model prefix>_<neg|neu|pos>": list of scores}
    labels = {}       # split -> integer gold labels (train and dev only)

    # os.listdir returns entries in arbitrary order; sorting keeps the
    # feature-column order identical from run to run and machine to machine.
    for dp in sorted(os.listdir(BASE_DIR)):
        for fp in sorted(os.listdir(os.path.join(BASE_DIR, dp))):
            if not (fp.startswith(LANG) and fp.endswith('output.tsv')):
                continue

            df = pd.read_csv(os.path.join(BASE_DIR, dp, fp), sep='\t')
            # The split is recognised by its name appearing in the file name.
            split = [s for s in SPLITS if s in fp][0]
            if split not in ensemble_df:
                ensemble_df[split] = {}
                text_body[split] = list(df['tweet'])

            for col, lab in [('negative', 'neg'), ('neutral', 'neu'), ('positive', 'pos')]:
                ensemble_df[split][f"{dp_map[dp]}_{lab}"] = list(df[col])

            # Gold labels come from the first train/dev file encountered.
            # NOTE(review): this assumes every model directory lists the
            # tweets of a split in the same row order - confirm.
            for labelled_split in ('train', 'dev'):
                if labelled_split in fp and labelled_split not in labels:
                    labels[labelled_split] = [label_map[v] for v in df['label']]

    # ---- 2. Append emoji-count features to the model scores ----
    emo_df = extract_emoji_stats(text_body, alpha=0.01)
    for split in SPLITS:
        ensemble_df[split] = pd.concat(
            (pd.DataFrame(ensemble_df[split]), emo_df[split]), axis=1
        )

    # ---- 3. Fit the stacking classifier ----
    # NOTE(review): learning_rate=1e-6 with 100 trees is unusually small;
    # left untouched because it is a modelling choice - confirm it is intended.
    xgb_cls = xgb.XGBClassifier(
        learning_rate=1e-6,
        max_depth=32,
        colsample_bytree=0.1,
        colsample_bylevel=0.2,
        colsample_bynode=0.2,
        n_estimators=100,
    )

    print(xgb_cls.fit(ensemble_df['train'], labels['train']))

    preds = {split: xgb_cls.predict(ensemble_df[split]) for split in SPLITS}

    # ---- 4. Write predictions, score reports and the submission file ----
    save_dir = f'{OUTPUT_DIR}/{LANG}/'
    os.makedirs(save_dir, exist_ok=True)

    for split in SPLITS:
        # IDs and tweet texts are read back from the svm model's output file.
        df = pd.read_csv(f"{BASE_DIR}svm/{LANG}_{split}_output.tsv", sep='\t')
        output_df = df[['ID', 'tweet']].copy()

        if split != 'test':
            # Gold labels exist only for train/dev, so only those get a report.
            score_path = os.path.join(save_dir, f"{LANG}_{split}_score.txt")
            with open(score_path, 'w') as report_file:
                report_file.write(classification_report(labels[split], preds[split]))
            output_df['label'] = df['label']

        output_df['pred'] = [id_to_label[v] for v in preds[split]]
        # Write a real tab-separated file without the positional index, to
        # match the .tsv extension and the submission file below (the default
        # to_csv call produced comma-separated content plus an index column).
        output_df.to_csv(
            os.path.join(save_dir, f"{LANG}_{split}_prediction.tsv"),
            sep='\t',
            index=False,
        )

        if split == 'test':
            sub_df = output_df[['ID', 'pred']].copy()
            sub_df.columns = ['ID', 'label']
            sub_df.to_csv(f'{SUB_DIR}/pred_{LANG}.tsv', sep='\t', index=False)


In [None]:
from IPython.display import FileLink

# Bundle the submission directory into submissions.zip and show a download link.
# The relative path assumes the notebook's working directory is the parent of
# SUB_DIR (/kaggle/working).
# Alternative archive format: !tar -czvf submissions.tar.gz submission
!zip -r submissions.zip submission/
FileLink(r'submissions.zip')

In [None]:
# Print every language's dev-split classification report. `tail -n +1` on
# several files prints each file in full, preceded by a header naming the file
# (plain `cat` would concatenate them without saying which language is which).
!tail -n +1 ensemble_outputs/*/*_dev_score.txt

In [None]:
# Spot check: list the files written to the output directory of the 'ig' language.
!ls ensemble_outputs/ig/