In [None]:
# Colab-only setup: mount Google Drive at /content/drive so files stored there are reachable.
from google.colab import drive
drive.mount('/content/drive')

# os is needed by the following cell to change the working directory.
import os

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Use the absolute path of the Drive mount point so this cell can be re-run:
# a relative './drive/MyDrive' stops resolving as soon as the working directory
# has been changed by a previous execution of this very cell.
os.chdir('/content/drive/MyDrive')
os.listdir()

## Import

In [None]:
# Standard library imports
from collections import Counter

# Third-party imports
import numpy as np
from scipy.sparse import hstack

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_score, accuracy_score, f1_score, classification_report
from sklearn.utils import compute_class_weight
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

## Load data

In [None]:
# Read the tab-separated data set: one "<word>\t<integer label>" pair per line.
X, Y = [], []
words = []
with open('data_set.txt', 'rt', encoding='utf8') as fr:
    for line in fr:
        line = line.rstrip('\n')
        if not line.strip():
            # Skip blank lines (e.g. a trailing newline at end of file); they
            # would otherwise make the two-way unpacking below raise ValueError.
            continue
        w, label = line.split('\t')
        X.append(w)
        Y.append(int(label))
        words.append(w)  # plain-list copy of the words, used for the hand-crafted features
X, Y = np.array(X), np.array(Y)
print(f'total examples: {len(X), X.shape, len(Y), Y.shape}')
print('-' * 50)

total examples: (10336, (10336,), 10336, (10336,))
--------------------------------------------------


### split dataset (train / dev / test)

In [None]:
# Options shared by both splits: fixed seed so the partition is reproducible.
split_options = dict(random_state=1, shuffle=True)

# Step 1: hold out 20% of all examples as the test set.
train_examples, test_examples, train_labels, test_labels = train_test_split(
    X, Y, train_size=.8, test_size=.2, **split_options)

# Step 2: carve 10% of the remaining training data out as the dev set.
train_examples, dev_examples, train_labels, dev_labels = train_test_split(
    train_examples, train_labels, train_size=.9, test_size=.1, **split_options)

split_summary = '\n'.join([
    f'train examples: {len(train_examples), len(train_labels)}',
    f'dev examples: {len(dev_examples), len(dev_labels)}',
    f'test examples: {len(test_examples), len(test_labels)}',
])
print(split_summary)



train examples: (7441, 7441)
dev examples: (827, 827)
test examples: (2068, 2068)


### Extract Features

In [None]:
# Peek at the held-out test words (the array repr is abbreviated for long arrays).
test_examples

array(['sprungdeckel', 'esoteriker', 'mtv', ..., 'kunstschaffender',
       'arezzo', 'lösungsmenge'], dtype='<U33')

In [None]:
def to_features(examples):
  """Return the length of every example as a single-column 2-D array.

  Args:
    examples: iterable of sized items (here: words).

  Returns:
    NumPy array of shape (n_examples, 1) holding len(example) per row.
  """
  lengths = list(map(len, examples))
  return np.array(lengths).reshape(-1, 1)

# Character 1- to 3-gram TF-IDF features; the vocabulary is learned on the training split only.
vectorizer = TfidfVectorizer(analyzer = 'char', ngram_range = (1, 3))


def _with_length_feature(tfidf_matrix, examples):
  """Append the word-length column to a TF-IDF matrix.

  The result is requested in CSR format: hstack otherwise returns a COO
  matrix, which supports neither row indexing nor slicing.
  """
  return hstack([tfidf_matrix, to_features(examples)], format = 'csr')


# fit_transform on train only; dev and test reuse the fitted vocabulary.
train_vectors = _with_length_feature(vectorizer.fit_transform(train_examples), train_examples)
dev_vectors = _with_length_feature(vectorizer.transform(dev_examples), dev_examples)
test_vectors = _with_length_feature(vectorizer.transform(test_examples), test_examples)

print(train_vectors.shape, dev_vectors.shape, test_vectors.shape)

(7441, 6591) (827, 6591) (2068, 6591)


Extracting Features with pipeline

In [None]:
# Hand-crafted per-word features, one row per word:
#   [word length, number of vowel characters (incl. 'y' and umlauts), hyphen flag]
X_feats = np.array([
    [len(w), sum(c in 'aeiouyäöü' for c in w), int('-' in w)]
    for w in words
])

# Label vector under the conventional lowercase name.
y = Y

In [None]:
class WordLengthTransformer(BaseEstimator, TransformerMixin):
  """Stateless transformer that turns each word into a single feature: its length."""

  def fit(self, X, y = None):
    """Nothing to learn; returns self so the transformer can be chained."""
    return self

  def transform(self, X):
    """Return an integer array with one [length] row per word in X."""
    length_rows = [[len(word)] for word in X]
    return np.array(length_rows).astype(int)

class VowelRatioTransformer(BaseEstimator, TransformerMixin):
  """Stateless transformer producing each word's vowel-to-consonant ratio.

  Vowels are the characters in 'aeiouäöü' (after stripping whitespace and
  lower-casing); consonants are all other alphabetic characters. A word with
  no consonants gets a ratio of 0.
  """

  def fit(self, X, y = None):
    """Nothing to learn; returns self so the transformer can be chained."""
    return self

  def transform(self, X):
    """Return an array with one [ratio] row per word in X."""
    features = []
    for word in X:
      letters = word.strip().lower()
      vowels = sum(1 for ch in letters if ch in 'aeiouäöü')
      consonants = sum(1 for ch in letters if ch.isalpha() and ch not in 'aeiouäöü')
      # Guard against division by zero for words without consonants.
      ratio = vowels / consonants if consonants > 0 else 0
      features.append([ratio])

    return np.array(features)

  # Backward-compatible alias: the method was originally named `transformer`,
  # which left the class without the `transform` method that pipelines call.
  transformer = transform


# Backward-compatible alias for the original, misspelled class name.
VowelRatioTransforer = VowelRatioTransformer

class WordEndingTransformer(BaseEstimator, TransformerMixin):
  """Encode each word's ending (its last three characters) with a DictVectorizer."""

  def _suffix_dicts(self, X):
    # word[-3:] already yields the whole word when it has fewer than 3 characters.
    return [{'suffix': word[-3:]} for word in X]

  def fit(self, X, y = None):
    """Learn the set of endings seen in X; returns self."""
    self.vec = DictVectorizer()
    self.vec.fit(self._suffix_dicts(X))
    return self

  def transform(self, X):
    """Return the fitted DictVectorizer's encoding of the endings of the words in X."""
    return self.vec.transform(self._suffix_dicts(X))

class Hyphen(BaseEstimator, TransformerMixin):
  """Stateless transformer emitting 1 for words containing a hyphen, else 0."""

  def fit(self, X, y = None):
    """Nothing to learn; returns self so the transformer can be chained."""
    return self

  def transform(self, X):
    """Return an integer array with one [0 or 1] row per word in X."""
    return np.array([[1 if '-' in word else 0] for word in X]).astype(int)

  # Backward-compatible alias: the method was originally named `transformer`,
  # which left the class without the `transform` method that pipelines call.
  transformer = transform

250620

------------------------------
BaseEstimator, TransformerMixin
Recap


In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

In [None]:
class FeatureExtractor(BaseEstimator, TransformerMixin):
  """Recap example of a custom transformer: maps every word to its length."""

  def fit(self, X, y = None):
    """Stateless: there is nothing to fit, so just return self."""
    return self

  def transform(self, X):
    """Return an integer array with one [length] row per word in X."""
    rows = [[len(word)] for word in X]
    return np.array(rows).astype(int)

I still find it hard to remember how the DictVectorizer() class works.

In [None]:
from sklearn.feature_extraction import DictVectorizer

In [None]:
# Sparse demo: each dict is one row, each distinct key becomes one column.
# The result is stored under its own name so that X (the data-set examples
# loaded earlier) is not overwritten by this scratch example.
v = DictVectorizer(sparse = True)
D = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}]
X_sparse_demo = v.fit_transform(D)
X_sparse_demo

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 4 stored elements and shape (2, 3)>

In [None]:
# Dense demo: with sparse=False, fit_transform returns a plain NumPy array
# (one row per dict, one column per distinct key).
# NOTE(review): this rebinds X, which earlier held the data-set examples; the
# following cells read this X, so a rename would have to cover them as well.
v = DictVectorizer(sparse = False)
D = [{'vinegar': 3, 'sesame oil': 2, 'soy sauce': 5},
     {'vinegar': 4, 'sesame oil': 1, 'soy paste': 3}]
X = v.fit_transform(D)
X

array([[2., 0., 5., 3.],
       [1., 3., 0., 4.]])

In [None]:
# Names of the matrix columns, in column order.
v.get_feature_names_out()

array(['sesame oil', 'soy paste', 'soy sauce', 'vinegar'], dtype=object)

In [None]:
# NOTE(review): no call parentheses here, so this displays the bound method object
# rather than the feature names — probably meant v.get_feature_names_out().
v.get_feature_names_out

In [None]:
# Map the dense matrix back to one dict per row; zero-valued entries are left out (see output).
v.inverse_transform(X)

[{'sesame oil': np.float64(2.0),
  'soy sauce': np.float64(5.0),
  'vinegar': np.float64(3.0)},
 {'sesame oil': np.float64(1.0),
  'soy paste': np.float64(3.0),
  'vinegar': np.float64(4.0)}]

In [None]:
from sklearn.feature_selection import SelectKBest, chi2

# Score the columns of X against the toy labels [0, 1] with chi-squared and keep the best two.
selector = SelectKBest(chi2, k = 2)
support = selector.fit(X, [0, 1])

# restrict() modifies v in place so that it only knows the selected features.
selected_mask = support.get_support()
v.restrict(selected_mask)

In [None]:
# After restrict(), only the selected features remain in the vectorizer.
v.get_feature_names_out()

array(['soy paste', 'soy sauce'], dtype=object)

In [None]:
class RandomFeature(BaseEstimator, TransformerMixin):
  """Practice transformer that wraps a dense DictVectorizer.

  fit() learns the feature names from the dicts it is given and transform()
  encodes the dicts it is given. (Previously both methods ignored their
  argument and silently used the notebook-level variable D instead.)
  """

  def fit(self, X, y = None):
    """Fit the internal DictVectorizer on X, a list of feature dicts; returns self."""
    self.vec = DictVectorizer(sparse = False)
    self.vec.fit(X)
    return self

  def transform(self, X):
    """Return the dense encoding of X, a list of feature dicts."""
    return self.vec.transform(X)

In [None]:
# Instantiate the practice transformer and fit it on the example dicts D.
Random = RandomFeature()
Random.fit(D)

In [None]:
# Encode the example dicts with the fitted transformer.
Random.transform(D)

array([[2., 0., 5., 3.],
       [1., 3., 0., 4.]])

In [None]:
# Show the raw input dicts for comparison with the encoded matrix.
D

[{'vinegar': 3, 'sesame oil': 2, 'soy sauce': 5},
 {'vinegar': 4, 'sesame oil': 1, 'soy paste': 3}]

### Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Hyper-parameter combinations explored for the logistic-regression model.
# NOTE(review): the small max_iter values stop the solver before convergence
# (see the "ITERATIONS REACHED LIMIT" warnings in this cell's output).
lr_param_grid = {
    'C': [0.01, 0.05, 0.1, 0.5, 1],
    'max_iter': [30, 50, 100, 200],
    'class_weight': ['balanced'],
}

# 5-fold cross-validated search, ranking candidates by macro-averaged F1.
grid = GridSearchCV(
    estimator = LogisticRegression(random_state = 1, penalty = 'l2'),
    param_grid = lr_param_grid,
    cv = 5,
    scoring = 'f1_macro',
)
grid.fit(train_vectors, train_labels)


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

### Another way to do the validation (using the held-out dev set)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, classification_report

In [None]:
# Manual model selection: train one model per parameter combination from the
# grid and keep the one with the highest macro-F1 on the dev set.
best_lr, best_pr, best_r, best_f1 = None, 0.0, 0.0, 0.0

for param in grid.cv_results_['params']:
  model = LogisticRegression(**param, random_state = 1, penalty = 'l2')
  model.fit(train_vectors, train_labels)
  y_dev_preds = model.predict(dev_vectors)

  f1 = f1_score(dev_labels, y_dev_preds, average = 'macro')
  if f1 > best_f1:
    # New best candidate: remember the model together with its dev-set scores.
    best_f1 = f1
    best_pr = precision_score(dev_labels, y_dev_preds, average = 'macro')
    best_r = recall_score(dev_labels, y_dev_preds, average = 'macro')
    best_lr = model

print(best_lr)
# Use the '%' format type instead of round(x, 4) * 100, which leaks float
# artefacts such as 92.67999999999999% into the report.
print(f'precision-macro: {best_pr:.2%}\n'
      f'recall-macro: {best_r:.2%}\n'
      f'f1-macro: {best_f1:.2%}')


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

LogisticRegression(C=1, class_weight='balanced', max_iter=50, random_state=1)
precision-macro: 91.83%
recall-macro: 92.67999999999999%
f1-macro: 92.23%


In [None]:
# Final evaluation of the selected model on the held-out test set.
y_pred = best_lr.predict(test_vectors)


def _precision_recall_f1(average):
    """Return (precision, recall, F1) of y_pred against test_labels for one averaging mode."""
    return (precision_score(test_labels, y_pred, average = average),
            recall_score(test_labels, y_pred, average = average),
            f1_score(test_labels, y_pred, average = average))


pmacro, rmacro, f1macro = _precision_recall_f1('macro')
pmicro, rmicro, f1micro = _precision_recall_f1('micro')

# The '%' format type avoids the float artefacts that round(x, 4) * 100 can print.
print(
    f'precision-micro: {pmicro:.2%} recall-micro: {rmicro:.2%} f1-micro: {f1micro:.2%}')
print('-' * 50)
print(
    f'precision-macro: {pmacro:.2%} recall-macro: {rmacro:.2%} f1-macro: {f1macro:.2%}')


precision-micro: 93.42% recall-micro: 93.42% f1-micro: 93.42%
--------------------------------------------------
precision-macro: 92.83% recall-macro: 92.88% f1-macro: 92.85%


Ongoing(LogisticRegression, SVM, RandomForest, Perceptron), Bayesian


### Bayesian
#### Feature Union


In [None]:
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion

class WordLengthTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that maps each word to a single feature: its length."""

    def fit(self, X, y=None):
        """Nothing to learn; returns self so the transformer can be chained."""
        return self

    def transform(self, X):
        """Return an integer array with one [length] row per word in X."""
        length_rows = [[len(word)] for word in X]
        return np.array(length_rows).astype(int)

class VowelRatioTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer producing each word's vowel-to-consonant ratio.

    Vowels are the characters in 'aeiouäöü' (after lower-casing); consonants
    are all other alphabetic characters. Words without consonants get 0.
    """

    def fit(self, X, y=None):
        """Nothing to learn; returns self so the transformer can be chained."""
        return self

    def transform(self, X):
        """Return an array with one [ratio] row per word in X."""

        def ratio_of(word):
            lowered = word.lower()
            n_vowels = sum(1 for ch in lowered if ch in 'aeiouäöü')
            n_consonants = sum(1 for ch in lowered if ch.isalpha() and ch not in 'aeiouäöü')
            if n_consonants > 0:
                return n_vowels / n_consonants
            return 0

        return np.array([[ratio_of(word)] for word in X])

class WordEndingTransformer(BaseEstimator, TransformerMixin):
    """Encode each word's ending (its last three characters) with a DictVectorizer."""

    def _suffix_dicts(self, X):
        # word[-3:] already yields the whole word when it has fewer than 3 characters.
        return [{'suffix': word[-3:]} for word in X]

    def fit(self, X, y=None):
        """Learn the set of endings seen in X; returns self."""
        self.vec = DictVectorizer()
        self.vec.fit(self._suffix_dicts(X))
        return self

    def transform(self, X):
        """Return the fitted DictVectorizer's encoding of the endings of the words in X."""
        return self.vec.transform(self._suffix_dicts(X))

class HyphenFeatureTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer emitting 1 for words containing a hyphen, else 0."""

    def fit(self, X, y=None):
        """Nothing to learn; returns self so the transformer can be chained."""
        return self

    def transform(self, X):
        """Return an integer array with one [0 or 1] row per word in X."""
        hyphen_flags = [[int('-' in word)] for word in X]
        return np.array(hyphen_flags).astype(int)

In [None]:
# Named feature extractors whose outputs are concatenated column-wise.
# NOTE(review): despite its name, the 'length>8' step emits the raw word
# length, not a "longer than 8" indicator.
feature_steps = [
    ('length>8', WordLengthTransformer()),
    ('vowel_ratio', VowelRatioTransformer()),
    ('suffix', WordEndingTransformer()),
    ('hyphen', HyphenFeatureTransformer()),
]
combined_features = FeatureUnion(feature_steps)

In [None]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import BayesianRidge


In [None]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import BayesianRidge

def _to_dense(matrix):
    """Return a dense array; input that is already dense is passed through unchanged.

    A named function is used instead of a lambda so the fitted pipeline stays
    picklable, and the hasattr check avoids failing on dense input.
    """
    return matrix.toarray() if hasattr(matrix, 'toarray') else matrix


# Feature extraction -> densify -> BayesianRidge fitted on the 0/1 labels as floats.
bayesian_pipeline = Pipeline([
    ('features', combined_features),
    ('to_dense', FunctionTransformer(_to_dense, accept_sparse=True)),  # Convert to dense
    ('clf', BayesianRidge())
])

bayesian_pipeline.fit(train_examples, np.array(train_labels).astype(float))


In [None]:
# The pipeline's predictions are thresholded at 0.5 to obtain binary class labels.
bayesian_scores = bayesian_pipeline.predict(test_examples)
bayesian_preds = (bayesian_scores >= 0.5).astype(int)  # threshold for binary classification

In [None]:
# Make sure both label vectors hold plain integers before scoring.
test_labels = np.array(list(map(int, test_labels)))
bayesian_preds = np.array(list(map(int, bayesian_preds)))

In [None]:
# Evaluation: macro-averaged F1, recall and precision on the test set for each model.
macro_metrics = (
    ('F1 Macro:', f1_score),
    ('Recall Macro:', recall_score),
    ('Precision Macro:', precision_score),
)

for name, preds in [('BayesianRidge', bayesian_preds)]:
    print(f'\n{name} Evaluation:')
    for caption, score_fn in macro_metrics:
        print(caption, score_fn(test_labels, preds, average='macro'))



BayesianRidge Evaluation:
F1 Macro: 0.9154429852155803
Recall Macro: 0.9019670357273974
Precision Macro: 0.936294898116873
