In [1]:
import json

import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_validate
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier

from gensim.models import FastText

### Load the data generated using `convert_to_bio.py`

**Note:** Following bit of code was modified in the `convert_to_bio.py`

original code:
```python
bio_output += bio_tokens + '\n__END_ESSAY__\n'
```

modified code:
```python
bio_output += bio_tokens + '\n__END_ESSAY__\tO\n'
```

In [2]:
with open("data/output.txt") as f:
    d = f.read()

# some corruption with direct pandas reading led to this workaround
df = pd.DataFrame([x.split('\t') for x in d.split('\n')], columns=["word", "label"])
df = df.iloc[:-1, :]  # remove the extra last row

df.word = df.word.str.strip()
df.label = df.label.str.strip()
df.word.fillna("", inplace=True)
df.label.fillna("O", inplace=True)
df.head()

Unnamed: 0,word,label
0,Way,O
1,to,O
2,reduce,O
3,the,O
4,amount,O


### Load the train test split

In [3]:
train_test = pd.read_csv("data/train-test-split.csv", sep=";", index_col=0)
train_test.head()

Unnamed: 0_level_0,SET
ID,Unnamed: 1_level_1
essay001,TRAIN
essay002,TRAIN
essay003,TRAIN
essay004,TEST
essay005,TEST


### Load the JSON data

In [4]:
with open("data/essay_corpus.json") as f:
    data = f.read()
    data = json.loads(data)

In [5]:
# extract and correct the essay names
essay_names = ["essay" + str(essay["id"]).zfill(3) for essay in data]
exp_len = 8
assert all([len(x) == exp_len for x in essay_names])

### Assign the essay number to each word in the BIO format

In [6]:
df.loc[df[df.word == "__END_ESSAY__"].index, "essay_number"] = essay_names
df.essay_number.fillna(method="bfill", inplace=True)
df.head()

Unnamed: 0,word,label,essay_number
0,Way,O,essay365
1,to,O,essay365
2,reduce,O,essay365
3,the,O,essay365
4,amount,O,essay365


### Assign a train/test flag to each word in the BIO format

In [7]:
df = df.merge(train_test, how="left", left_on="essay_number", right_index=True)
df.reset_index(drop=True, inplace=True)  # reset index for easier train/test split
df.head()

Unnamed: 0,word,label,essay_number,SET
0,Way,O,essay365,TRAIN
1,to,O,essay365,TRAIN
2,reduce,O,essay365,TRAIN
3,the,O,essay365,TRAIN
4,amount,O,essay365,TRAIN


### Create a column with lowercase words and add the word's count (within the essay) as a column

In [8]:
df["word_lower"] = df.word.str.lower()

dfgrpby = df.groupby(by=["essay_number", "word_lower"])
word_counts = dfgrpby.agg({"word_lower": "count"})
word_counts = word_counts.rename({"word_lower": "word_count"}, axis=1)
word_counts.reset_index(inplace=True)

df = df.merge(word_counts, how="left", on=["essay_number", "word_lower"])
assert df.word_count.notna().all()
df.head()

Unnamed: 0,word,label,essay_number,SET,word_lower,word_count
0,Way,O,essay365,TRAIN,way,2
1,to,O,essay365,TRAIN,to,6
2,reduce,O,essay365,TRAIN,reduce,1
3,the,O,essay365,TRAIN,the,22
4,amount,O,essay365,TRAIN,amount,1


### Create a column of position within an essay for each word

In [9]:
total_counts = df.groupby("essay_number", as_index=False, sort=False).SET.count().values.tolist()
positions = [np.arange(x[1]) for x in total_counts]
df["positions"] = np.hstack(positions)
df.head()

Unnamed: 0,word,label,essay_number,SET,word_lower,word_count,positions
0,Way,O,essay365,TRAIN,way,2,0
1,to,O,essay365,TRAIN,to,6,1
2,reduce,O,essay365,TRAIN,reduce,1,2
3,the,O,essay365,TRAIN,the,22,3
4,amount,O,essay365,TRAIN,amount,1,4


### Train word embeddings on the corpora using FastText and add the word vector as a column

Originally the plan was to use **BERT embeddings** but due to the restriction of using nltk's tokenizer, it tended to be a little bit trickier to achieve it in the given time.

<div class="alert alert-warning">

**Note:** Also the embeddings were trained over the entire. This could arguably introduce some "leakage" i.e. information from the test set seeping into the training set but I have often seen it in practice.
    
</div>

In [10]:
fasttext_train_data = [x["text"].split() for x in data]
ft = FastText(fasttext_train_data, size=50, min_count=1, workers=-1)
df["ft_vec"] = df.word.apply(ft.wv.get_vector)
df.head()

Unnamed: 0,word,label,essay_number,SET,word_lower,word_count,positions,ft_vec
0,Way,O,essay365,TRAIN,way,2,0,"[-0.001202546, -0.008073888, 0.0014490829, 0.0..."
1,to,O,essay365,TRAIN,to,6,1,"[0.0014644335, 0.0026838281, -0.004891229, -0...."
2,reduce,O,essay365,TRAIN,reduce,1,2,"[0.0026979328, -0.0014656027, 0.0022566058, 0...."
3,the,O,essay365,TRAIN,the,22,3,"[-0.005500579, 0.0011195417, -0.0015077062, 0...."
4,amount,O,essay365,TRAIN,amount,1,4,"[-0.0034768363, -0.0022226984, -0.0023712418, ..."


### Combine all the features

In [11]:
counts = df.word_count.values
vectors  = np.stack(df.ft_vec.values)
positions = df.positions

X = np.c_[counts, vectors, positions]

### Encode the labels

In [12]:
label_enc = LabelEncoder()
y = label_enc.fit_transform(df.label.values)

### Split them into train and test

In [13]:
train_indices = df[df.SET == "TRAIN"].index.tolist()
test_indices = df[df.SET == "TEST"].index.tolist()
X_train = X[train_indices]
X_test = X[test_indices]
y_train = y[train_indices]
y_test = y[test_indices]

### Define the classifier and run a 10-fold cross-validation

The parameters have already been optimized using trial and error. Same goes for the embedding dimensions (50).

In [14]:
rf_clf = RandomForestClassifier(n_estimators=100, min_samples_leaf=10, random_state=42, n_jobs=-1)
score_dict = cross_validate(rf_clf, X_train, y_train, cv=10, n_jobs=-1, return_train_score=True, scoring=["f1_macro", "f1_micro"])

In [15]:
print()
print("F1 Macro scores:")
print(f"F1 macro score mean for the train set {score_dict['train_f1_macro'].mean():.4f} with std. dev. {score_dict['train_f1_macro'].std():.4f}")
print(f"F1 macro score mean for the validation set {score_dict['test_f1_macro'].mean():.4f} with std. dev. {score_dict['test_f1_macro'].std():.4f}")
print()
print("F1 Micro scores:")
print(f"F1 macro score mean for the train set {score_dict['train_f1_micro'].mean():.4f} with std. dev. {score_dict['train_f1_micro'].std():.4f}")
print(f"F1 macro score mean for the validation set {score_dict['test_f1_micro'].mean():.4f} with std. dev. {score_dict['test_f1_micro'].std():.4f}")


F1 Macro scores:
F1 macro score mean for the train set 0.3147 with std. dev. 0.0014
F1 macro score mean for the validation set 0.2750 with std. dev. 0.0053

F1 Micro scores:
F1 macro score mean for the train set 0.6765 with std. dev. 0.0009
F1 macro score mean for the validation set 0.6459 with std. dev. 0.0064


<div class="alert alert-warning">

There is a **some overfitting** (both f1 macro and micro are greater for train sets than test sets) specially on the macro level but the relatively low standard deviation shows that the model generalizes with consistency.
    
</div>

### Retrain the model with the entire training set and generate predictions for the test set

In [16]:
rf_clf = RandomForestClassifier(n_estimators=100, min_samples_leaf=10, random_state=42, n_jobs=-1)
rf_clf.fit(X_train, y_train)

RandomForestClassifier(min_samples_leaf=10, n_jobs=-1, random_state=42)

In [17]:
y_pred = rf_clf.predict(X_test)
y_train_pred = rf_clf.predict(X_train)

### Check performance on the final train and test sets

In [22]:
print(f"F1 macro train set: {f1_score(y_train, y_train_pred, average='macro'):.4f}")
print(f"F1 micro train set: {f1_score(y_train, y_train_pred, average='micro'):.4f}")

F1 macro train set: 0.3172
F1 micro train set: 0.6771


In [21]:
print(f"F1 macro test set: {f1_score(y_test, y_pred, average='macro'):.4f}")
print(f"F1 micro test set: {f1_score(y_test, y_pred, average='micro'):.4f}")

F1 macro test set: 0.2892
F1 micro test set: 0.6598


The final train and test scores follow the trend established in the cross-validation very closely.

In the FAQ, the following was mentioned:

<blockquote>For reference, the F1-score of our baseline is 0.216 (macro F1-score) and 0.456 (weighted F1-score), so try to achieve as higher as possible than this.</blockquote>

The scores of **0.289 (macro F1-score)** and **0.660 (weighted F1-score)** on the test set achieve this!

I tried originally to only do it with the embeddings but got a real boost from the `positions` feature.

### Create a final BIO format file with all predictions (train and test dataset) combined for `evaluation.py`

In [20]:
bio_labels = label_enc.inverse_transform(rf_clf.predict(X))
df["pred_label"] = bio_labels
export = df.loc[:, ["word", "pred_label"]]
export.to_csv("data/pred-output.txt", sep="\t", index=False, header=None)

In [23]:
with open("data/output.txt") as f:
    d = f.read()

# some corruption with direct pandas reading led to this workaround
df_orig = pd.DataFrame([x.split('\t') for x in d.split('\n')], columns=["word", "label"])
df_orig = df.iloc[:-1, :]  # remove the extra last row

### Execute `evaluation.py`

In [31]:
!python evaluation.py --gt_bio_path data/output.txt --pred_bio_path data/pred-output.txt

Macro F1-Score:  0.31
Weighted F1-Score:  0.597
