# ML4NLP1
## Starting Point for Exercise 1, part II

This notebook is supposed to serve as a starting point and/or inspiration when starting exercise 1, part II.

One of the goals of this exercise is to make you acquainted with **skorch**. You will probably need to consult the [documentation](https://skorch.readthedocs.io/en/stable/).

# Installing skorch and loading libraries

In [None]:
import subprocess
import sys

# Installation on Google Colab
try:
    import google.colab  # only used to detect whether we are running on Colab
except ImportError:
    # Not on Colab: nothing is installed here, skorch must already be available.
    pass
else:
    # Run pip through the kernel's own interpreter so the package lands in the
    # environment this notebook imports from; a bare 'python' on PATH may be a
    # different interpreter.
    install = subprocess.run([sys.executable, '-m', 'pip', 'install', 'skorch'])
    if install.returncode != 0:
        # Surface the failure here instead of as a confusing ImportError later.
        print(f'pip install skorch failed with exit code {install.returncode}')

In [None]:
import torch
from torch import nn
import torch.nn.functional as F
from skorch import NeuralNetClassifier

In [None]:
# Fix the seed of PyTorch's default generator and of the CUDA generator to 0.
for seed_fn in (torch.manual_seed, torch.cuda.manual_seed):
    seed_fn(0)

In [None]:
import pandas as pd
import numpy as np
import csv
import re
import string
from collections import defaultdict

## Training a classifier and making predictions

In [None]:
# Download the dataset: four files fetched from Google Drive with gdown, one
# file ID per command. The trailing note on each line names the file being
# fetched; the next cell opens them as x_train.txt, x_test.txt, y_train.txt
# and y_test.txt from the working directory.
!gdown 1QP6YuwdKFNUPpvhOaAcvv2Pcp4JMbIRs # x_train
!gdown 1QVo7PZAdiZKzifK8kwhEr_umosiDCUx6 # x_test
!gdown 1QbBeKcmG2ZyAEFB3AKGTgSWQ1YEMn2jl # y_train
!gdown 1QaZj6bI7_78ymnN8IpSk4gVvg-C9fA6X # y_test

Downloading...
From: https://drive.google.com/uc?id=1QP6YuwdKFNUPpvhOaAcvv2Pcp4JMbIRs
To: /content/x_train.txt
100% 64.1M/64.1M [00:00<00:00, 190MB/s]
Downloading...
From: https://drive.google.com/uc?id=1QVo7PZAdiZKzifK8kwhEr_umosiDCUx6
To: /content/x_test.txt
100% 65.2M/65.2M [00:00<00:00, 154MB/s]
Downloading...
From: https://drive.google.com/uc?id=1QbBeKcmG2ZyAEFB3AKGTgSWQ1YEMn2jl
To: /content/y_train.txt
100% 480k/480k [00:00<00:00, 107MB/s]
Downloading...
From: https://drive.google.com/uc?id=1QaZj6bI7_78ymnN8IpSk4gVvg-C9fA6X
To: /content/y_test.txt
100% 480k/480k [00:00<00:00, 23.5MB/s]


In [None]:
def _read_lines(path):
    """Return the lines of the text file at ``path`` without line terminators.

    The file is decoded as UTF-8 explicitly: the data is multilingual, so
    relying on the platform's default encoding could fail or mis-decode the
    text on systems whose default is not UTF-8.
    """
    with open(path, encoding='utf-8') as f:
        return f.read().splitlines()


x_train = _read_lines('x_train.txt')
y_train = _read_lines('y_train.txt')
x_test = _read_lines('x_test.txt')
y_test = _read_lines('y_test.txt')

# Every text needs exactly one label; fail early if the files are misaligned.
assert len(x_train) == len(y_train), 'x_train/y_train length mismatch'
assert len(x_test) == len(y_test), 'x_test/y_test length mismatch'

In [None]:
import pandas as pd
def _as_frame(texts, labels):
    """Pair texts with their labels in a two-column frame ('text', 'label')."""
    return pd.DataFrame({'text': texts, 'label': labels})


# One frame per split, built from the raw line lists read above.
train_df = _as_frame(x_train, y_train)
test_df = _as_frame(x_test, y_test)

In [None]:
# T: Please use again the train/test data that includes English, German, Dutch, Danish, Swedish and Norwegian, plus 20 additional languages of your choice (the labels can be found in the file labels.csv)
# and adjust the train/test split if needed

from sklearn.model_selection import train_test_split

# Combine train and test for initial merge.
# ignore_index=True gives every row a unique index label. Without it both
# frames keep their own 0..n-1 index, so the merged frame would contain each
# label twice and label-based lookups (.loc) would be ambiguous.
combined_df = pd.concat([train_df, test_df], ignore_index=True)

# Split combined_df into 80% training and 20% testing while stratifying by label.
# NOTE: this rebinds train_df / test_df. Re-run the cell that builds them before
# re-running this one, otherwise the previous split is what gets re-split.
train_df, test_df = train_test_split(
    combined_df,
    test_size=0.2,
    random_state=42,
    stratify=combined_df['label'],
)

# The six required languages followed by 20 additional ones (26 labels in total).
selected_labels = ['eng', 'deu', 'nld', 'dan', 'swe', 'nno', 'ace', 'afr', 'als', 'amh', 'ang', 'ara', 'arg', 'arz', 'asm',
                   'ast', 'ava', 'aym', 'azb', 'aze', 'bak', 'bar', 'bcl', 'kom', 'bel', 'jpn']

# Keep only the rows whose label is one of the selected languages.
train_subset = train_df[train_df['label'].isin(selected_labels)]
test_subset = test_df[test_df['label'].isin(selected_labels)]

# Define X_train, y_train, X_test, y_test
X_train = train_subset['text']
y_train = train_subset['label']
X_test = test_subset['text']
y_test = test_subset['label']

In [None]:
# T: use your adjusted code to encode the labels here

from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVC

# Preprocessing: map the string language codes to integer class ids.
#
# The encoder reads the label columns of train_subset / test_subset instead of
# y_train / y_test. This cell rebinds y_train / y_test to integer arrays, so
# reading them back would fit the encoder on already-encoded ids when the cell
# is re-run, and classes_ would no longer hold the language codes.
label_encoder = LabelEncoder()
le_fitted = label_encoder.fit(train_subset['label'])  # fit() returns the encoder itself

y_train = le_fitted.transform(train_subset['label'])
y_test = le_fitted.transform(test_subset['label'])

In [None]:
# Number of distinct classes the fitted label encoder knows about.
num_classes = len(le_fitted.classes_)
print(num_classes)

26


In [None]:
# T: In the following, you can find a small (almost) working example of a neural network. Unfortunately, again, the cat messed up some of the code. Please fix the code such that it is executable.

In [None]:
# Simple input features for the neural network: per-text counts of single
# characters (character unigrams, non-binary).
# An alternative that was tried before: binary character bigrams capped at
# 100 features (ngram_range=(2, 2), max_features=100, binary=True).
char_unigram_options = dict(analyzer='char', ngram_range=(1, 1), binary=False)
vectorizer = CountVectorizer(**char_unigram_options)

# Learn the vocabulary from the training texts and vectorize them in one step.
train_texts = X_train.to_numpy()
X = vectorizer.fit_transform(train_texts)

In [None]:
# Cast to the dtypes used for training: int64 class ids and float32 features.
y = y_train.astype(np.int64)
X = X.astype(np.float32)

# Show (number of samples, number of features).
X.shape

(20800, 3773)

In the following, we define a vanilla neural network with two hidden layers. The output layer should have as many outputs as there are classes. In addition, it should have a nonlinearity function.

In [None]:
class ClassifierModule(nn.Module):
    """Feed-forward classifier with two hidden layers that returns raw class scores.

    Args:
        num_units: Width of the first hidden layer.
        nonlin: Activation applied after the first hidden layer.
        input_dim: Number of input features per sample. The default (3773)
            is the feature count of the vectorized training data used in
            this notebook.
        hidden_dim: Width of the second hidden layer.
        num_classes: Number of scores produced per sample. The default (26)
            is the number of selected languages.
    """

    def __init__(
            self,
            num_units=200,
            nonlin=F.relu,
            input_dim=3773,
            hidden_dim=50,
            num_classes=26,
    ):
        super().__init__()
        self.num_units = num_units
        self.nonlin = nonlin

        self.dense0 = nn.Linear(input_dim, num_units)
        self.dense1 = nn.Linear(num_units, hidden_dim)
        self.output = nn.Linear(hidden_dim, num_classes)

    def forward(self, X, **kwargs):
        """Map a batch of feature vectors to one score per class.

        Extra keyword arguments are accepted and ignored.
        """
        X = self.nonlin(self.dense0(X))
        # The second hidden layer always uses ReLU, independent of `nonlin`.
        X = F.relu(self.dense1(X))
        # No softmax is applied: the scores are returned as-is. In this
        # notebook the module is trained with nn.CrossEntropyLoss.
        X = self.output(X)
        # squeeze(dim=1) only drops dimension 1 when it has size 1; for
        # (batch, num_classes) scores with more than one class it is a no-op.
        return X.squeeze(dim=1)

In [None]:
# Wrap the module in skorch's scikit-learn compatible classifier.
net = NeuralNetClassifier(
    ClassifierModule,
    max_epochs=20,
    # Pass the criterion class rather than an instance; skorch instantiates it.
    criterion=nn.CrossEntropyLoss,
    lr=0.1,
    # Use the GPU when one is available and fall back to the CPU otherwise,
    # instead of failing on machines without CUDA.
    device='cuda' if torch.cuda.is_available() else 'cpu',
)

In [None]:
# Train on the vectorized training texts; the per-epoch log is printed below.
net.fit(X=X, y=y)

  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m1.9458[0m       [32m0.5474[0m        [35m1.4507[0m  2.6593
      2        [36m1.2858[0m       [32m0.5767[0m        1.7106  2.2737
      3        [36m1.2798[0m       0.5361        1.6135  3.0214
      4        [36m0.8908[0m       [32m0.7755[0m        [35m0.7108[0m  2.1752
      5        [36m0.8529[0m       0.6851        0.9602  2.2089
      6        1.0722       0.6288        1.1909  2.2045
      7        1.2545       0.6325        1.0642  2.1938
      8        0.9495       [32m0.8202[0m        [35m0.6266[0m  3.1278
      9        [36m0.7006[0m       0.8034        0.6292  2.2572
     10        [36m0.5889[0m       0.8175        [35m0.5894[0m  2.2957
     11        [36m0.5795[0m       [32m0.8365[0m        [35m0.5370[0m  2.2505
     12        0.7443       [32m0.8495[0m        [35m0.5285[0m  2.2892
     13        [36m0.

<class 'skorch.classifier.NeuralNetClassifier'>[initialized](
  module_=ClassifierModule(
    (dense0): Linear(in_features=3773, out_features=200, bias=True)
    (dense1): Linear(in_features=200, out_features=50, bias=True)
    (output): Linear(in_features=50, out_features=26, bias=True)
  ),
)

In [None]:
from sklearn.metrics import accuracy_score

# X / y are the data that was passed to net.fit, so this is training-set
# accuracy and not an estimate of how well the model generalises.
y_pred = net.predict(X)
accuracy = accuracy_score(y, y_pred)

print(f'Train accuracy: {accuracy}')

# Held-out evaluation: vectorize the test texts with the vocabulary learned
# from the training texts (transform only, no refit), cast to the same dtype
# as the training features, and compare against the encoded test labels.
X_test_features = vectorizer.transform(X_test.to_numpy()).astype(np.float32)
test_accuracy = accuracy_score(y_test, net.predict(X_test_features))

print(f'Test accuracy: {test_accuracy}')


Accuracy: 0.5506730769230769


In [None]:
from skorch.callbacks import EarlyStopping

class Module1(nn.Module):
    """Wider two-hidden-layer classifier with a tanh second layer; returns raw class scores.

    Args:
        num_units: Width of the first hidden layer.
        nonlin: Activation applied after the first hidden layer.
        input_dim: Number of input features per sample. The default (3773)
            is the feature count of the vectorized training data used in
            this notebook.
        hidden_dim: Width of the second hidden layer.
        num_classes: Number of scores produced per sample. The default (26)
            is the number of selected languages.
    """

    def __init__(
            self,
            num_units=2500,
            nonlin=F.relu,
            input_dim=3773,
            hidden_dim=256,
            num_classes=26,
    ):
        super().__init__()
        self.num_units = num_units
        self.nonlin = nonlin

        self.dense0 = nn.Linear(input_dim, num_units)
        self.dense1 = nn.Linear(num_units, hidden_dim)
        self.output = nn.Linear(hidden_dim, num_classes)

    def forward(self, X, **kwargs):
        """Map a batch of feature vectors to one score per class.

        Extra keyword arguments are accepted and ignored.
        """
        X = self.nonlin(self.dense0(X))
        # The second hidden layer always uses tanh, independent of `nonlin`.
        # torch.tanh computes the same function as F.tanh.
        X = torch.tanh(self.dense1(X))
        # No softmax is applied: the scores are returned as-is. In this
        # notebook the module is trained with nn.CrossEntropyLoss.
        X = self.output(X)
        # squeeze(dim=1) only drops dimension 1 when it has size 1; for
        # (batch, num_classes) scores with more than one class it is a no-op.
        return X.squeeze(dim=1)

# Early-stopping callback: watches the validation loss (lower is better) with
# a patience of 10 epochs and a relative improvement threshold of 0.0001.
early_stopping_options = {
    'monitor': 'valid_loss',
    'patience': 10,
    'threshold': 0.0001,
    'threshold_mode': 'rel',
    'lower_is_better': True,
}
early_stopping = EarlyStopping(**early_stopping_options)

# Second configuration: the wider Module1, RMSprop with a small learning rate,
# and the early-stopping callback defined above.
net1 = NeuralNetClassifier(
    Module1,
    max_epochs=20,
    # Pass the criterion class rather than an instance; skorch instantiates it.
    criterion=nn.CrossEntropyLoss,
    lr=0.001,
    optimizer=torch.optim.RMSprop,
    # Use the GPU when one is available and fall back to the CPU otherwise,
    # instead of failing on machines without CUDA.
    device='cuda' if torch.cuda.is_available() else 'cpu',
    callbacks=[early_stopping],
)

# Train the second network on the same vectorized training data.
net1.fit(X=X, y=y)

  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m1.5324[0m       [32m0.8115[0m        [35m0.5470[0m  2.8338
      2        [36m0.3680[0m       [32m0.9161[0m        [35m0.2823[0m  3.5324
      3        [36m0.2529[0m       0.9161        [35m0.2645[0m  3.4568
      4        [36m0.2169[0m       0.9156        0.2722  6.2322
      5        [36m0.1913[0m       [32m0.9209[0m        [35m0.2574[0m  3.5753
      6        [36m0.1702[0m       [32m0.9219[0m        0.2671  2.7637
      7        [36m0.1540[0m       0.9202        0.2621  2.6958
      8        [36m0.1386[0m       [32m0.9281[0m        [35m0.2509[0m  2.7752
      9        [36m0.1288[0m       [32m0.9325[0m        [35m0.2400[0m  3.4730
     10        [36m0.1146[0m       [32m0.9327[0m        0.2434  2.8422
     11        [36m0.1048[0m       0.9269        0.2553  2.8619
     12        [36m0.0896[0m       [32m0

<class 'skorch.classifier.NeuralNetClassifier'>[initialized](
  module_=Module1(
    (dense0): Linear(in_features=3773, out_features=2500, bias=True)
    (dense1): Linear(in_features=2500, out_features=256, bias=True)
    (output): Linear(in_features=256, out_features=26, bias=True)
  ),
)

In [None]:
# X / y are the data that was passed to net1.fit, so this is training-set
# accuracy and not an estimate of how well the model generalises.
y_pred_1 = net1.predict(X)
accuracy_1 = accuracy_score(y, y_pred_1)

print(f'Train accuracy1: {accuracy_1}')

# Held-out evaluation: vectorize the test texts with the vocabulary learned
# from the training texts (transform only, no refit), cast to the same dtype
# as the training features, and compare against the encoded test labels.
X_test_features = vectorizer.transform(X_test.to_numpy()).astype(np.float32)
test_accuracy_1 = accuracy_score(y_test, net1.predict(X_test_features))

print(f'Test accuracy1: {test_accuracy_1}')

Accuracy1: 0.9711057692307692
