<a href="https://colab.research.google.com/github/LUMII-AILab/NLP_Course/blob/main/notebooks/TextClassification.ipynb" target="_new"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open in Colab"/></a>

# Using existing libraries for language detection

In [None]:
# Language identification with the langid package
# LangID (2016): lightweight, trained on Wikipedia etc., character n-grams, Naive Bayes (NB) classifier
# See also fastText, etc.

# Install the package into the notebook environment before importing it.
!pip install langid

import langid

In [None]:
# Classify two short samples without restricting the candidate languages.
for sample in ("this is a test", "šis ir tests"):
	print(langid.classify(sample))

In [None]:
# Restrict the candidate languages, then classify the same two samples again.
candidate_languages = ['en','lv']
langid.set_languages(candidate_languages)

for sample in ("this is a test", "šis ir tests"):
	print(langid.classify(sample))

In [None]:
# Widen the candidate set and compare a headline with a slightly longer variant of it.
candidate_languages = ['en','lv','uk','ru']
langid.set_languages(candidate_languages)

for headline in (
	"Краматорськ зазнав ракетного удару агресора",
	"Краматорськ зазнав ракетного удару агресора: є загиблі",
):
	print(langid.classify(headline))

In [None]:
# fastText:
# * a library for text classification and word embeddings
# * pre-trained models for 176 languages
# * robust language detection (incl. short texts)

!pip install fasttext

!wget https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin

import fasttext

In [None]:
# Load the pre-trained language detection model from the local file
model = fasttext.load_model('lid.176.bin')

text = "Краматорськ зазнав ракетного удару агресора"
#text = "šis ir tests"

# k=1: ask only for the single best prediction
predictions = model.predict(text, k=1)
labels, scores = predictions

print("Language:", labels[0])
print("Confidence:", scores[0])

# Training and using a Naive Bayes (NB) classifier

Hands-on dataset: *20 Newsgroups*, assembled by Ken Lang (CMU).

https://www.kaggle.com/datasets/au1206/20-newsgroup-original

We will use a format-converted, single-file version available from the course GitHub repo.

In [None]:
!wget https://raw.githubusercontent.com/LUMII-AILab/NLP_Course/main/notebooks/resources/news20/20_newsgroup.tsv
!wget https://raw.githubusercontent.com/LUMII-AILab/NLP_Course/main/notebooks/resources/news20/20_newsgroup-freq.tsv

!wget https://raw.githubusercontent.com/LUMII-AILab/NLP_Course/main/notebooks/resources/news20/stoplist.txt

In [None]:
!pip install nltk
!pip install scipy
!pip install sklearn

nltk.download('punkt')

In [None]:
import re
import os
import sys
import datetime
import pickle
import json

import nltk
import scipy
import numpy

from sklearn.model_selection import KFold
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [None]:
def initialise(stop_txt, freq_tsv, min_freq=5):
	"""Load the stop-word list and the frequency-based whitelist into globals.

	Args:
		stop_txt: Path to a text file with one stop word per line.
		freq_tsv: Path to a TSV file with one `frequency<TAB>word` entry per line.
		min_freq: Minimum frequency a word needs in order to be whitelisted.
			Defaults to 5; worth experimenting with (e.g., 3 / 5 / 10).

	Side effects:
		Rebinds the globals `stoplist` and `whitelist` (sets of normalised
		words) and prints the size of each.

	Raises:
		ValueError: If a non-blank line of `freq_tsv` does not consist of
			exactly two tab-separated fields, or its frequency is not an integer.
	"""
	global stoplist, whitelist

	stoplist = set()

	with open(stop_txt) as txt:
		for word in txt:
			stoplist.add(normalize_text(word.strip()))

	print("[I] Word stoplist is read (" + str(len(stoplist)) + ").")

	whitelist = set()

	with open(freq_tsv) as tsv:
		for entry in tsv:
			entry = entry.strip()

			if not entry:
				# Tolerate blank lines (e.g., a trailing newline at the end of the file)
				continue

			freq, word = entry.split("\t")

			if int(freq) < min_freq:
				# Ignore the long tail: 2/3 of words occur fewer than N times
				continue

			whitelist.add(normalize_text(word))

	print("[I] Word whitelist is read (" + str(len(whitelist)) + ").")

In [None]:
def normalize_text(text):
	"""Return `text` lower-cased, with every run of digits replaced by "100" and outer whitespace removed."""
	lowered = text.lower()
	# Collapse all numbers into one placeholder token so they share a single feature
	without_numbers = re.sub(r"\d+", "100", lowered)
	return without_numbers.strip()

def normalize_vector(vector):
	"""Filter a feature dictionary in place and return it.

	A key is kept only if it is longer or shorter than one character (i.e., its
	length is not 1), is present in the global `whitelist`, and is absent from
	the global `stoplist`; every other key is deleted from `vector`.
	"""
	# Snapshot the keys first: the dictionary is modified while we walk over them
	for token in tuple(vector.keys()):
		keep = token not in stoplist and len(token) != 1 and token in whitelist
		if not keep:
			del vector[token]

	return vector

def vectorize_text(text):
	"""Turn raw text into a filtered feature dictionary mapping each kept token to True."""
	tokens = nltk.word_tokenize(normalize_text(text))
	# Presence-only features: repeated tokens collapse into a single key
	features = dict.fromkeys(tokens, True)
	return normalize_vector(features)

In [None]:
def read_data(file):
	"""Read annotated examples from a `topic<TAB>text` file, grouped by topic.

	Args:
		file: Path to the data file with one example per line.

	Returns:
		A dict mapping each topic to a list of `(features, topic)` tuples,
		where `features` is the result of `vectorize_text(text)`.

	Raises:
		ValueError: If a non-blank line contains no tab separator.
	"""
	data_set = {}  # topic => annotated examples

	with open(file) as data:
		for entry in data:
			entry = entry.strip()

			if not entry:
				# Tolerate blank lines (e.g., a trailing newline at the end of the file)
				continue

			# Split on the first tab only, so a tab inside the text stays part of the text
			topic, text = entry.split("\t", 1)

			data_set.setdefault(topic, []).append((vectorize_text(text), topic))

	return data_set

def join_data(data_set):
	"""Flatten a `topic => examples` mapping into one list, keeping the mapping's iteration order."""
	return [example for cat in data_set for example in data_set[cat]]

In [None]:
def validate_accuracy(data_set, k, random_state=None):
	"""Run one round of k-fold cross-validation of an NLTK Naive Bayes classifier.

	Each class is split into k folds separately, so that every training and
	test set contains examples of every class in similar proportions.

	Args:
		data_set: Dict mapping a topic to a list of `(features, topic)` examples.
		k: Number of folds; every class must have at least k examples.
		random_state: Optional seed passed to `KFold` to make the shuffled
			split reproducible. The default (None) keeps the split random.

	Returns:
		A tuple `(validations, gold_result, silver_result)`: the accuracy of
		each fold, and the expected / predicted labels of all test examples
		over all folds, in matching order.
	"""
	kfold = KFold(n_splits=k, shuffle=True, random_state=random_state)

	data_split = {}

	for cat, examples in data_set.items():
		# K-Fold split for each class to ensure balanced training and test data sets
		data_split[cat] = [
			{
				"train": [examples[j] for j in train],
				"test": [examples[j] for j in test],
			}
			for train, test in kfold.split(examples)  # k loops
		]

	validations = []

	gold_result = []
	silver_result = []

	for i in range(k):
		# Join the i-th fold of every class into one training and one test set
		train_data = []
		test_data = []

		for folds in data_split.values():
			train_data.extend(folds[i]["train"])
			test_data.extend(folds[i]["test"])

		# Naive Bayes classifier: training and evaluation
		nb = nltk.NaiveBayesClassifier.train(train_data)

		# Classify every test example once and reuse the predictions for both
		# the fold accuracy and the overall gold/silver lists
		gold = [label for _, label in test_data]
		silver = nb.classify_many([features for features, _ in test_data])

		hits = sum(g == s for g, s in zip(gold, silver))
		validations.append(hits / len(gold) if gold else 0)

		gold_result.extend(gold)
		silver_result.extend(silver)

	return (validations, gold_result, silver_result)

In [None]:
def run_validation(data_path, k, n):
	"""Repeat k-fold cross-validation n times and print the results.

	Prints the per-fold and mean accuracy of every iteration, the total run
	time, the average accuracy over all iterations, a classification report
	and a confusion matrix computed over all iterations.

	Args:
		data_path: Path to the `topic<TAB>text` data file.
		k: Number of folds per iteration.
		n: Number of cross-validation iterations.
	"""
	print("\n\t" + str(k) + "-fold cross-validation:\n")

	iterations = []
	gold_total = []
	silver_total = []

	start_time = datetime.datetime.now().replace(microsecond=0)

	# Reading and vectorising the data does not depend on the iteration, so do
	# it once; only the random fold split differs between iterations
	data_set = read_data(data_path)

	for i in range(n):
		validations, gold, silver = validate_accuracy(data_set, k)
		mean_accuracy = numpy.mean(validations)
		iterations.append(mean_accuracy)

		gold_total += gold
		silver_total += silver

		print("\t{0}.\t".format(i+1), end='')
		for v in validations:
			print("{0:.2f}  ".format(v), end='')
		print("\t{0:.0%}".format(mean_accuracy))

	end_time = datetime.datetime.now().replace(microsecond=0)
	print("\n\tTotal validation time: " + str(end_time - start_time))

	print("\n\tAverage accuracy in {0} iterations: {1:.0%}\n".format(n, numpy.mean(iterations)))

	print(classification_report(gold_total, silver_total))

	print("Confusion matrix:")
	print(nltk.ConfusionMatrix(gold_total, silver_total))
	# Alternative (scikit-learn): print(confusion_matrix(gold_total, silver_total))

In [None]:
def run_training(data_path, verbose):
	"""Train a Naive Bayes classifier on the whole data set and pickle it.

	Args:
		data_path: Path to the `topic<TAB>text` data file.
		verbose: If true, print the 100 most informative features.

	Side effects:
		Writes the trained classifier to "nb_classifier.pickle" in the current
		working directory and prints progress messages.
	"""
	print("[I] Training an NB classifier...")

	start_time = datetime.datetime.now().replace(microsecond=0)

	# The final (production) model is trained by using all available data (train+test)
	nb = nltk.NaiveBayesClassifier.train(join_data(read_data(data_path)))

	end_time = datetime.datetime.now().replace(microsecond=0)
	print("[I] Training time: " + str(end_time - start_time))

	if verbose:
		nb.show_most_informative_features(n=100)

	# The context manager closes the file even if serialisation fails
	with open("nb_classifier.pickle", "wb") as dmp:
		pickle.dump(nb, dmp)

	print("[I] NB classifier trained and serialised in a file.")

In [None]:
def run():
	"""Load the pickled classifier and classify texts typed in by the user.

	Reads "nb_classifier.pickle" from the current working directory, then
	prompts for texts until an empty line is entered. For every text it prints
	the extracted features, the probability of each topic and the best guess.
	"""
	# NOTE: unpickling can execute arbitrary code; only load a pickle file that
	# was produced by run_training() or that comes from a trusted source.
	# The context manager closes the file even if loading fails.
	with open("nb_classifier.pickle", "rb") as dmp:
		nb = pickle.load(dmp)

	print("[I] NB classifier loaded from a file.")

	while True:
		message = input("\nEnter a text:\n")

		if len(message) == 0:
			# An empty input ends the session
			break

		features = vectorize_text(message)
		topic = nb.prob_classify(features)

		print("\n{0}\n".format(list(features.keys())))

		for t in topic.samples():
			print("{0}: {1:.3f}".format(t, topic.prob(t)))

		print("\nGuess: " + nb.classify(features))

In [None]:
# Load the stop-word list and the word-frequency whitelist
initialise(stop_txt='stoplist.txt', freq_tsv='20_newsgroup-freq.tsv')

In [None]:
# Experiment with 'hyperparameters': k = number of folds, n = number of iterations
run_validation("20_newsgroup.tsv", k=5, n=2)

In [None]:
# Train the final model on all data and save it (verbose: also list the most informative features)
run_training("20_newsgroup.tsv", verbose=True)

In [None]:
# Run the pre-trained model interactively (enter an empty line to stop)
run()

# Sample inputs to try at the prompt (the bracketed name appears to be the intended topic):
# Test [alt.atheism]: This was a conflict between the atheists and the religious.
# Test [soc.religion.christian]: Whoever humbles himself like this child is the greatest in the kingdom of heaven.
# Test [sci.med]: My wife had hives during the first two months of her pregnancy.
# Test [sci.space]: Correct, we have no parallax measurements on the bursts.