In [None]:
import json as js
import sys
from collections import Counter

import numpy as np
import pandas as pd
from datasets import DatasetDict

In [None]:
sys.path.append('../code/')

In [None]:
from training_data_handler import TrainingDataHandler

In [None]:
tdh = TrainingDataHandler()

- Raw training data

## The fields

- One of the tasks is to find out which text input fields suffice for good predictions
- The important text fields can be in German, Italian or French and are:
  - `title`: Title of each document
  - `text`: Text of the document. This field will get a further division on the basis of the field `text_name_de` which describes whether it is an attachment, an answer etc.
- In addition to that, we have the field `language` which can be de, fr, it. This field is important to filter the entries by language
- The dataframe contains the original fields, which are not used 

In [None]:
tdh.raw_data_for_training.head().columns.tolist()

In [None]:
language = 'all'
tdh.get_training_data(language=language, affair_text_scope=['zh','ch'], affair_attachment_category='all')

In [None]:
tdh.training_data_df[tdh.training_data_df.text.str.contains('Infolge der geänderten Gesetzgebung betreffend die Quellenbesteuerung des Einkommens aus Erwerbstätigkeit wird in Zukunft auch Grenzgängerinnen und Grenzgängern als "Quasi-An')].affair_topic_codes_as_labels.tolist()

In [None]:
', '.join(list(tdh.label2id.keys())[:10])

## 1. Check if everything is all right

In [None]:
# Load the affairs export; the cells below use it as the reference for topic codes and labels.
# A path relative to the notebook (the same file the "Old Code" section reads) replaces the
# user-specific absolute path, so the cell also runs on other machines.
affairs = pd.read_excel('../data/Data/affairs.xlsx')
affairs.head()

- We will check if the topic labels per affair srcid are equal in all dataframes
- We will take the affair_srcid from `affairs` and filter the training data by this affair_srcid
- The field `affair_topic_codes` in the training data and `affairs` should be equal in both cases
- The field `affair_topic_codes_as_labels` in the training data should be equal to `affair_topic_de` in affairs

In [None]:
# Cross-check the topic annotations in the training data against `affairs`.
# For every affair that also occurs in the training data (affair_text_srcid == affair_srcid),
# the codes and the German labels are compared as sets; each deviating set found in the
# training data is printed and collected in `miss_match`.
miss_match = []
for affair in affairs.to_dict(orient='records'):
    affair_srcid = affair['affair_srcid']
    # `affairs` keeps several topics in one ';'-separated string.
    expected_codes = set(str(affair['affair_topic_codes']).split(';'))
    expected_labels = set(str(affair['affair_topic_de']).split(';'))

    # Training rows that belong to this affair
    matching_rows = tdh.training_data_df[tdh.training_data_df.affair_text_srcid == affair_srcid]
    if matching_rows.shape[0] == 0:
        continue

    # Only the first matching row is compared.
    found_codes = set(matching_rows.affair_topic_codes.tolist()[0])
    if found_codes != expected_codes:
        print(affair_srcid)
        miss_match.append(found_codes)

    found_labels = set(matching_rows.affair_topic_codes_as_labels.tolist()[0])
    if found_labels != expected_labels:
        print(affair_srcid)
        print('Actual value:', expected_labels, expected_codes)
        print('Found value:', found_labels, found_codes)
        print('#############################')
        miss_match.append(found_labels)
            
    

- It seems like if affair_text_srcid==affair_srcid, then the affair_topic_codes are equal


In [None]:
from collections import defaultdict
# Column totals of `df` and `df_new` for every entry of `columns` except 'title'.
# NOTE(review): `columns` and `df_new` are not defined anywhere in this notebook and `df` is
# only assigned further down in the "Old Code" section - confirm where they come from.
df_count = defaultdict(int, {c: df[c].sum() for c in columns if c != 'title'})

df_new_count = defaultdict(int, {c: df_new[c].sum() for c in columns if c != 'title'})


In [None]:
df_count

In [None]:
df_new_count

In [None]:
# Number of raw rows whose `text` / `title` is exactly the empty string.
empty_counts = {
    field: len(tdh.raw_data_for_training[tdh.raw_data_for_training[field] == ''])
    for field in ('text', 'title')
}
print('It has cases with an empty text field: ', empty_counts['text'], 'cases')
print('\n')
print('But it has no cases with an empty title field: ', empty_counts['title'], 'cases')

## Filtering

- The filtering is done on the basis of the following fields from `tdh.raw_data_for_training`: `language`, `title`, `text`, `affair_text_scope`
- All of these fields have default values. By default, all available data is selected
- Having default values means that you do not have to specify every field for filtering
- The filtering is done with the following methods:
  - `tdh.get_training_data(languages, affair_text_scope, inputs)`: will return a dataset that can be used for training with the transformers library. The dataset can be accessed via tdh.training_data
  - `tdh.filter_training_data(languages, affair_text_scope, inputs)`: will return a pandas dataframe
- The following arguments are important:
  - `languages`

In [None]:
# Rebuild the training data for language 'de' (scopes zh/ch, text=False), then report the
# number of examples, the row counts per language/split and one label barplot per split.
# NOTE(review): the following cells repeat this code with only `language` changed;
# a small helper taking `language` would remove the duplication.
language = 'de'
tdh.get_training_data(language=language, affair_text_scope=['zh','ch'], text=False)
print(f'Number of examples for {language}: ', len(tdh.training_data_df))
print(tdh.training_data_df.groupby(['language','split'])[['title']].count())
for split in ('train', 'validation', 'test'):
    split_rows = tdh.training_data_df[tdh.training_data_df['split'] == split]
    tdh.create_barplot(split_rows, split)

In [None]:
# Rebuild the training data for language 'fr' (scopes zh/ch, text=False), then report the
# number of examples, the row counts per language/split and one label barplot per split.
language = 'fr'
tdh.get_training_data(language=language, affair_text_scope=['zh','ch'], text=False)
print(f'Number of examples for {language}: ', len(tdh.training_data_df))
print(tdh.training_data_df.groupby(['language','split'])[['title']].count())
for split in ('train', 'validation', 'test'):
    split_rows = tdh.training_data_df[tdh.training_data_df['split'] == split]
    tdh.create_barplot(split_rows, split)

In [None]:
# Rebuild the training data for language 'it' (scopes zh/ch, text=False), then report the
# number of examples, the row counts per language/split and one label barplot per split.
language = 'it'
tdh.get_training_data(language=language, affair_text_scope=['zh','ch'], text=False)
print(f'Number of examples for {language}: ', len(tdh.training_data_df))
print(tdh.training_data_df.groupby(['language','split'])[['title']].count())
for split in ('train', 'validation', 'test'):
    split_rows = tdh.training_data_df[tdh.training_data_df['split'] == split]
    tdh.create_barplot(split_rows, split)

In [None]:
# Rebuild the training data for the combination 'de,fr' (scopes zh/ch, text=False), then report
# the number of examples, the row counts per language/split and one label barplot per split.
language = 'de,fr'
tdh.get_training_data(language=language, affair_text_scope=['zh','ch'], text=False)
print(f'Number of examples for {language}: ', len(tdh.training_data_df))
print(tdh.training_data_df.groupby(['language','split'])[['title']].count())
for split in ('train', 'validation', 'test'):
    split_rows = tdh.training_data_df[tdh.training_data_df['split'] == split]
    tdh.create_barplot(split_rows, split)

In [None]:
# Rebuild the training data for the combination 'de,fr,it' (scopes zh/ch, text=False), then report
# the number of examples, the row counts per language/split and one label barplot per split.
language = 'de,fr,it'
tdh.get_training_data(language=language, affair_text_scope=['zh','ch'], text=False)
print(f'Number of examples for {language}: ', len(tdh.training_data_df))
print(tdh.training_data_df.groupby(['language','split'])[['title']].count())
for split in ('train', 'validation', 'test'):
    split_rows = tdh.training_data_df[tdh.training_data_df['split'] == split]
    tdh.create_barplot(split_rows, split)

In [None]:
tdh.training_data_df[['text_de','affair_topic_codes_as_labels','split']]

In [None]:
tdh.filter_training_data(languages='all', affair_text_scope='all', inputs='all')

In [None]:
tdh.raw_data_for_training.shape


In [None]:
# Read the raw export (one JSON record per line) and keep a second frame restricted to
# rows whose affair_text_scope is 'zh' or 'ch'.
raw_data_for_training = pd.read_json('../data/raw_data_for_training.jsonl', lines=True)
raw_data_for_training_ch_zh = raw_data_for_training.loc[
    raw_data_for_training['affair_text_scope'].isin(['zh','ch'])
]
raw_data_for_training.head()

In [None]:
raw_data_for_training.columns.tolist()

## 1. Data analysis


In [None]:
# Row counts: all rows, unique affair_text_srcid values, and unique values within ch/zh.
n_rows = raw_data_for_training.shape[0]
n_unique = raw_data_for_training.drop_duplicates('affair_text_srcid').shape[0]
n_unique_ch_zh = raw_data_for_training_ch_zh.drop_duplicates('affair_text_srcid').shape[0]

print("The raw training data set has ", n_rows, " examples.")
print("Each affair_text_srcid can have several texts, such as title, Vorstoss etc.")
print("If we count only the unique values of affair_text_srcid we get ", n_unique, " examples")
print(n_unique, " is the total number of examples we can work with.")
print("If we keep only the data from Zurich and Switzerland (Bund), we have ", n_unique_ch_zh, " examples.")


- Counting how often each label occurs
- First we do a simple count: How often does each label occur per text or row

In [None]:
# Creating overview of the frequency of all labels

def create_barplot(df, title=''):
    """Draw a bar chart showing how often each label occurs in ``df``.

    Every entry of ``df.affair_topic_codes_as_labels`` is iterated as a
    collection of labels. The occurrences are tallied over all rows and drawn
    as one bar per label, most frequent first, with the count written on top
    of each bar.

    Parameters
    ----------
    df : object exposing ``affair_topic_codes_as_labels`` (e.g. a DataFrame)
    title : str
        Title handed to the plot.
    """
    # Tally every label occurrence across all rows.
    label_counts = Counter(
        label
        for row_labels in df.affair_topic_codes_as_labels
        for label in row_labels
    )
    # most_common() orders by descending count and keeps first-seen order for ties.
    ordered_counts = dict(label_counts.most_common())

    # One-row frame: each label becomes a column, i.e. one bar in the chart.
    counts_frame = pd.DataFrame([ordered_counts])
    counts_frame = counts_frame.sort_values(by=0, ascending=False, axis=1)

    ax = counts_frame.plot.bar(figsize=(20, 15), width=2.2, title=title)
    # Write the value on top of every bar.
    for bars in ax.containers:
        ax.bar_label(bars)
        
create_barplot(raw_data_for_training,'Frequency of each label per text.')

- However, each affair_text_srcid can have several texts, such as title, Vorstoss etc.
- We will keep only unique rows based on affair_text_srcid and then make a count


In [None]:
create_barplot(raw_data_for_training.drop_duplicates('affair_text_srcid'),'Frequency of each label per unique affair_text_srcid.')

- Below are the same plots for the data filtered to ch and zh


In [None]:
create_barplot(raw_data_for_training_ch_zh,'Frequency of each label per text only for ch and zh.')
create_barplot(raw_data_for_training_ch_zh.drop_duplicates('affair_text_srcid'),'Frequency of each label per unique affair_text_srcid only for ch and zh.')

- Co-occurrence analysis

In [None]:
# Pairwise correlation between the one-hot label columns, shown as a colour-graded table.
# NOTE(review): `data_for_split_df` and `label2id` are only created in section "2. Create split"
# further down, so this cell fails on a fresh top-to-bottom run.
corr = data_for_split_df.loc[:, list(label2id)].corr()
corr.style.background_gradient(cmap='coolwarm')

## 2. Create split


- Next we need to create a train, validation, test split
- According to [this post](https://medium.com/gumgum-tech/creating-balanced-multi-label-datasets-for-model-training-and-evaluation-16b6a3a2d912) this package is good: https://github.com/trent-b/iterative-stratification
- Therefore, I will use this python package: https://github.com/trent-b/iterative-stratification
- The splits will be done only on the unique affair_text_srcid
- We will do the first split only on the basis of the data for *ch* and *zh*


In [None]:
# Label name -> numeric id mapping used by create_df_for_split below.
# The file is read explicitly as UTF-8: JSON files are UTF-8 encoded and the label names may
# contain non-ASCII characters, which the platform default encoding does not always decode.
with open('../utils/label2id.json', encoding='utf-8') as f:
    label2id = js.load(f)


def create_df_for_split(initial_df, colum_for_duplicate_removal='affair_text_srcid'):
    """Build the one-hot encoded frame that the stratified split works on.

    Rows of ``initial_df`` are de-duplicated on ``colum_for_duplicate_removal``.
    For every remaining row the result holds ``affair_text_srcid``,
    ``affair_topic_codes_as_labels`` and ``affair_topic_codes`` unchanged, one
    0/1 column per key of the module-level ``label2id`` (keys in sorted order),
    and ``one_hot_affair_topic_codes`` - the same 0/1 values collected in a list.

    Returns a new DataFrame with a fresh 0..n-1 index.
    """
    unique_rows = initial_df.drop_duplicates(colum_for_duplicate_removal)
    ordered_labels = sorted(label2id)

    records = []
    for row in unique_rows.to_dict(orient='records'):
        row_labels = row['affair_topic_codes_as_labels']
        # 1 where the row carries the label, 0 otherwise - aligned with ordered_labels.
        flags = [1 if label in row_labels else 0 for label in ordered_labels]

        record = {
            'affair_text_srcid': row['affair_text_srcid'],
            'affair_topic_codes_as_labels': row_labels,
            'affair_topic_codes': row['affair_topic_codes'],
        }
        record.update(zip(ordered_labels, flags))
        record['one_hot_affair_topic_codes'] = flags
        records.append(record)

    return pd.DataFrame(records).reset_index(drop=True)
    


In [None]:
create_df_for_split(raw_data_for_training_ch_zh)

In [None]:
# https://github.com/trent-b/iterative-stratification



X = np.array(['Bla', 'bla', 'bla', 'bla', 'bla', 'bla', 'bla', 'bla'])
y = np.array([[0,0,1], [0,0,0], [1,0,1], [1,0,1], [1,1,1], [0,1,1], [1,0,0], [1,0,0]])



def create_split(dataframe, test_size=0.4):
    """Assign every row to a 'train', 'validation' or 'test' split.

    Two multilabel-stratified shuffle splits (both with ``random_state=0``) are
    chained: the first sets aside a ``test_size`` share of the rows, the second
    halves that share into validation and test.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Needs the columns ``affair_text_srcid`` and
        ``one_hot_affair_topic_codes`` (one list of 0/1 values per row).
    test_size : float, default 0.4
        Share of the rows that does NOT go to 'train'. It was previously
        ignored in favour of a hard-coded 0.4.

    Returns
    -------
    pandas.DataFrame
        The object that was passed in; its ``split`` column is created or
        overwritten in place.

    Notes
    -----
    ``MultilabelStratifiedShuffleSplit`` has to be in scope.
    NOTE(review): the notebook never imports it - presumably
    ``from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit``
    (the iterative-stratification package linked in the markdown); confirm.
    """
    X = dataframe.affair_text_srcid.values
    y = np.array(dataframe.one_hot_affair_topic_codes.tolist())

    # Stage 1: train vs. held-out rows. The splitter yields positional indices.
    first_stage = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=0)
    train_pos, heldout_pos = next(first_stage.split(X, y))
    # Keep the held-out rows in their original row order for stage 2.
    heldout_pos = np.sort(heldout_pos)

    # Stage 2: halve the held-out rows into validation and test.
    second_stage = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=0)
    validation_rel, test_rel = next(second_stage.split(X[heldout_pos], y[heldout_pos]))

    # Positional assignment works for any index (not only 0..n-1) and replaces the
    # row-by-row loop with three vectorised writes.
    split_labels = np.empty(len(dataframe), dtype=object)
    split_labels[train_pos] = 'train'
    split_labels[heldout_pos[validation_rel]] = 'validation'
    split_labels[heldout_pos[test_rel]] = 'test'
    dataframe['split'] = split_labels

    return dataframe
    


In [None]:
data_for_split_df = create_df_for_split(raw_data_for_training_ch_zh)

data_for_split_df = create_split(data_for_split_df)
data_for_split_df

In [None]:
data_for_split_df[['affair_text_srcid', 'split']].groupby('split').count().plot.bar()

- Checking if the labels have an equal distribution over all splits


In [None]:
# One label barplot per split, to compare the label distributions.
# The splits were computed on unique affair_text_srcid values, so the raw ch/zh rows are
# de-duplicated the same way here; this also defines `raw_data_for_training_ch_zh_unique`,
# which was referenced without ever being assigned.
raw_data_for_training_ch_zh_unique = raw_data_for_training_ch_zh.drop_duplicates('affair_text_srcid')

for split in ['train', 'validation', 'test']:
    split_srcids = data_for_split_df[data_for_split_df.split==split].affair_text_srcid.tolist()
    split_srcids = [int(x) for x in split_srcids]
    create_barplot(
        raw_data_for_training_ch_zh_unique[raw_data_for_training_ch_zh_unique.affair_text_srcid.isin(split_srcids)],
        title=split,
    )

## 3. Train model


In [None]:
from datasets import Dataset


In [None]:
Dataset.from_list

In [None]:
from datasets import load_dataset
from sentence_transformers.losses import CosineSimilarityLoss

from setfit import SetFitModel, SetFitTrainer, sample_dataset


# End-to-end SetFit example: load a dataset, fit a multi-label SetFit model, evaluate it,
# push it to the Hub and run inference with a Hub checkpoint.

# Load the "covid19_emergency_event" configuration of joelito/lextreme from the Hugging Face Hub
# (the SST-2 line below is a disabled alternative).
# dataset = load_dataset("sst2")



dataset = load_dataset("joelito/lextreme", "covid19_emergency_event")

# The complete train and validation splits are used; no few-shot sampling happens in this cell.
# Only the `language` column is removed from both splits.
train_dataset = dataset["train"] #[:20]
eval_dataset = dataset["validation"]
train_dataset = train_dataset.remove_columns('language')
eval_dataset = eval_dataset.remove_columns('language')

# Load the sentence-transformer body from the Hub; "one-vs-rest" selects the multi-target strategy.
model = SetFitModel.from_pretrained(
    "sentence-transformers/paraphrase-mpnet-base-v2",
    multi_target_strategy="one-vs-rest",
)

# Create trainer
trainer = SetFitTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    loss_class=CosineSimilarityLoss,
    metric="accuracy",
    batch_size=16,
    num_iterations=20, # The number of text pairs to generate for contrastive learning
    num_epochs=1, # The number of epochs to use for contrastive learning
    column_mapping={"input": "text", "label": "label"} # Map dataset columns to text/label expected by trainer
)

# Train and evaluate
trainer.train()
metrics = trainer.evaluate()

# NOTE(review): uploads the trained model to the Hub under a placeholder name - presumably a
# leftover from the SetFit example; remove or guard before running the whole notebook.
trainer.push_to_hub("my-awesome-setfit-model")

# NOTE(review): `model` is replaced by a different Hub checkpoint ("lewtun/...") here, not the
# model trained above, and the inference inputs are English movie/food sentences - confirm intent.
model = SetFitModel.from_pretrained("lewtun/my-awesome-setfit-model")
# Run inference
preds = model(["i loved the spiderman movie!", "pineapple on pizza is the worst 🤮"])

In [None]:
train_dataset

In [None]:
sample_dataset?

In [None]:
train_dataset = sample_dataset(dataset["train"], label_column="label", num_samples=8)

In [None]:
dataset['train'] #.features['label'].feature.names

## Old Code

In [None]:
# Load the affairs table and keep the rows whose raw `affair_topic_codes` value is exactly
# '1696' or 'other'; rows with a missing value are dropped as well.
df = pd.read_excel('../data/Data/affairs.xlsx')
#df = df[df.affair_scope=="zh"]
df = df.loc[df['affair_topic_codes'].isin(['1696','other'])]
df = df.loc[df['affair_topic_codes'].notnull()]
df.shape

In [None]:
# Build the label name -> topic id mapping and store it as JSON.
label2id = dict()

topic_labels = pd.read_excel('../data/topics_politmonitor.xlsx')

# Official topics: German keyword -> numeric id
for r in topic_labels.to_dict(orient='records'):
    label2id[r["keyword_de"]] = int(r['keyword_id'])

# Topic codes that occur in the data but not in the topic list get a placeholder label.
# Fixes relative to the previous version:
#   * raw values are ';'-separated strings, so they are split instead of iterated character by character
#   * the placeholder is stored as name -> id, like every other entry (it was id -> name)
#   * non-numeric codes (e.g. '' or 'other') are skipped instead of crashing int()
known_ids = set(label2id.values())
for raw_codes in df.affair_topic_codes.tolist():
    if isinstance(raw_codes, (list, tuple, set)):
        codes = raw_codes
    else:
        codes = str(raw_codes).split(';')
    for code in codes:
        code = str(code).strip()
        if not code.isdigit():
            continue
        if int(code) not in known_ids:
            label2id['not_specified_label_' + code] = int(code)
            known_ids.add(int(code))

# ensure_ascii=False writes non-ASCII characters verbatim, so the encoding is fixed to UTF-8.
with open('../data/label2id.json', 'w', encoding='utf-8') as f:
    js.dump(label2id, f, indent=2, ensure_ascii=False)

In [None]:
len(label2id.keys())

In [None]:
def process_topic_codes(affair_topic_codes):
    """Turn a ';'-separated string of topic codes into a sorted list of ints.

    Empty segments (e.g. from an empty string or a trailing ';') are ignored.
    """
    codes = []
    for chunk in affair_topic_codes.split(';'):
        if chunk:
            codes.append(int(chunk))
    codes.sort()
    return codes
   
df = df.fillna('')
df["affair_topic_codes"]=df.affair_topic_codes.apply(process_topic_codes)


In [None]:
df.head()

In [None]:
# Build one training record per affair that has at least one topic code:
#   input            - the German title
#   <label columns>  - one 0/1 column per key of label2id
#   labels_as_string - the codes joined with ';' (the label combination as one string)
#   label            - the list of codes
MIN_COMBINATION_COUNT = 5  # label combinations seen fewer times are pooled into 'other'

records = list()
for r in df.to_dict(orient="records"):
    topic_codes = r['affair_topic_codes']
    if len(topic_codes)>0:
        item = {'input': r['title_de']}
        for label, _id in label2id.items():
            item[label] = 1 if _id in topic_codes else 0
        item['labels_as_string'] = ';'.join([str(x) for x in topic_codes])
        item['label'] = topic_codes
        records.append(item)

dataset = pd.DataFrame(records)
dataset['language']='de'

# The original note says a stratified split cannot handle label combinations that occur only
# once; the threshold actually applied is "fewer than MIN_COMBINATION_COUNT occurrences".
# Such combinations are replaced by the single bucket 'other'.
combination_counts = dataset.labels_as_string.map(dataset.labels_as_string.value_counts())
dataset.loc[combination_counts < MIN_COMBINATION_COUNT, 'labels_as_string'] = 'other'

# Frequency of every remaining label combination (displayed as the cell output)
frequency_count = Counter(dataset.labels_as_string)
frequency_count

In [None]:
from sklearn.model_selection import train_test_split




X = dataset.input
y = dataset.labels_as_string

# 60/20/20 split: hold out 40% of the rows first, then halve the hold-out into validation and test.
# NOTE(review): no `stratify=` argument is passed, so `labels_as_string` does not influence
# the split - confirm that this is intended.
X_train, X_1, y_train, y_1 = train_test_split(X, y, test_size=0.4, random_state=42, shuffle=True)
X_validation, X_test, y_validation, y_test = train_test_split(X_1, y_1, test_size=0.5, random_state=42, shuffle=True)

split_indices = {
    'train': X_train.index,
    'validation': X_validation.index,
    'test': X_test.index,
}

# Tag every row with the name of the split whose index contains it.
dataset['split']=''
for split_name, row_index in split_indices.items():
    dataset.loc[row_index, 'split'] = split_name
        

In [None]:
dataset[['input','split']].groupby('split').count().plot.bar()

In [None]:
dataset[dataset.split=='train'][['input','labels_as_string']].groupby('labels_as_string').count().plot.bar()

In [None]:
dataset[dataset.split=='validation'][['input','labels_as_string']].groupby('labels_as_string').count().plot.bar()

In [None]:
dataset[dataset.split=='test'][['input','labels_as_string']].groupby('labels_as_string').count().plot.bar()

In [None]:
dataset.to_json('../data/dataset_for_training.jsonl',lines=True, force_ascii=False, orient="records")

In [None]:
# Wrap each split as a Hugging Face Dataset holding the input text, the label list and the language.
ds = DatasetDict({
    split: Dataset.from_pandas(dataset[dataset.split==split][['input','label','language']])
    for split in ['train','validation','test']
})

In [None]:
ds['validation']

In [None]:
pd.read_json('../data/dataset_for_training.jsonl', lines=True)