# Machine Learning for NLP
*This project requires Python 3.9+*
### 1. Parsing and saving the data

In [None]:
from src.dataset_parser import parse_data_to_csv

# Input and output directories for parse_data_to_csv (see the disabled call below).
raw_path = "./original_data"
parsed_path = "./parsed_data"
# Parsing is deliberately left disabled: do NOT run it unless the parsed data
# has been lost, or point parsed_path at another directory first.
# parse_data_to_csv(raw_path, parsed_path)

### 2. Loading parsed data & Dummy Classifier

In [None]:
from sklearn.dummy import DummyClassifier

from src.dataset import DataSet

# Build the project DataSet from the parsed-data directory.
ds = DataSet(parsed_path)

# One majority-class baseline per prediction target; every target gets its own
# independent estimator instance.
dummy_domain, dummy_polarity, dummy_rating_str = (
    DummyClassifier(strategy="most_frequent") for _ in range(3)
)

#### Fitting Dummy classifier with training data

In [None]:
# Debugging aid: print(ds.training, ds.training.dtypes)
train_frame = ds.training
train_text = train_frame["review_text"]

dummy_domain.fit(train_text, train_frame["domain"])
dummy_polarity.fit(train_text, train_frame["polarity"])
# The rating column seems to be converted back to float automatically, so it is
# cast to str explicitly here.
dummy_rating_str.fit(train_text, train_frame["rating_str"].astype(str))

#### Predictions and scores with Dummy classifier

In [None]:
# Print each baseline's predictions for the test reviews, in the order
# domain, polarity, rating.
for baseline in (dummy_domain, dummy_polarity, dummy_rating_str):
    print(baseline.predict(ds.testing["review_text"]))

In [None]:
# Pair every baseline with the gold labels it is scored against; ratings are
# cast to str to match the labels the rating baseline was fitted on.
test_frame = ds.testing
baseline_targets = (
    (dummy_domain, test_frame["domain"]),
    (dummy_polarity, test_frame["polarity"]),
    (dummy_rating_str, test_frame["rating_str"].astype(str)),
)
for baseline, gold in baseline_targets:
    print(baseline.score(test_frame["review_text"], gold))

### 3. Example of Vectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer  # CountVectorizer will be used later

# TF-IDF over lowercased word tokens, with the "english" stop-word list removed.
vectorizer = TfidfVectorizer(lowercase=True, analyzer="word", stop_words="english")

# Learn the vocabulary and IDF weights on the training reviews only.
X_training = vectorizer.fit_transform(ds.training["review_text"])
print(X_training.shape)
# Densify only the first few rows for display: calling .toarray() on the whole
# sparse matrix allocates n_documents x vocabulary_size floats just to print a
# truncated view, which can exhaust memory on a large vocabulary.
print(X_training[:5].toarray())
print(vectorizer.get_params(), vectorizer.get_stop_words(), sep='\n')

# Reuse the fitted vocabulary, unchanged, to encode the test reviews.
X_testing = vectorizer.transform(ds.testing["review_text"])
print(X_testing.shape)
print(X_testing)

### 4. Example of Perceptron Classifier

In [None]:
from sklearn.linear_model import Perceptron

# fit() returns the estimator itself, so construction and training are chained.
perceptron = Perceptron(shuffle=False).fit(X_training, ds.training["polarity"])
# Score on the test reviews; as the last expression it is the cell's result.
perceptron.score(X_testing, ds.testing["polarity"])

### 5. Preprocessing experiments

We are going to use 2 different vectorizer types: TF-IDF and Count.  
N-grams will be word based (= built from word tokens rather than from characters).  
We are going to try different n-gram lengths: 1 to 3.  

In [None]:
# (n, n) bounds: each vectorizer extracts n-grams of exactly one length.
ngram_ranges = ((1, 1), (2, 2), (3, 3))

tfidf_unigram_vectorizer, tfidf_bigram_vectorizer, tfidf_trigram_vectorizer = (
    TfidfVectorizer(lowercase=True, analyzer="word", ngram_range=bounds)
    for bounds in ngram_ranges
)

count_unigram_vectorizer, count_bigram_vectorizer, count_trigram_vectorizer = (
    CountVectorizer(lowercase=True, analyzer="word", ngram_range=bounds)
    for bounds in ngram_ranges
)

In [None]:
from copy import deepcopy

# Template estimator: its parameters are written once and every experiment
# trains its own independent deep copy.
base_perceptron = Perceptron(shuffle=False)

(
    tfidf_unigram_perceptron,
    tfidf_bigram_perceptron,
    tfidf_trigram_perceptron,
    count_unigram_perceptron,
    count_bigram_perceptron,
    count_trigram_perceptron,
) = (deepcopy(base_perceptron) for _ in range(6))

In [None]:
# Columns shared by every vectorizer below.
train_text = ds.training["review_text"]
train_polarity = ds.training["polarity"]
test_text = ds.testing["review_text"]

# Each vectorizer is fitted on the training reviews, then applied as-is to the
# test reviews.
X_training_tfidf_unigram = tfidf_unigram_vectorizer.fit_transform(train_text, train_polarity)
X_testing_tfidf_unigram = tfidf_unigram_vectorizer.transform(test_text)
X_training_tfidf_bigram = tfidf_bigram_vectorizer.fit_transform(train_text, train_polarity)
X_testing_tfidf_bigram = tfidf_bigram_vectorizer.transform(test_text)
X_training_tfidf_trigram = tfidf_trigram_vectorizer.fit_transform(train_text, train_polarity)
X_testing_tfidf_trigram = tfidf_trigram_vectorizer.transform(test_text)

X_training_count_unigram = count_unigram_vectorizer.fit_transform(train_text, train_polarity)
X_testing_count_unigram = count_unigram_vectorizer.transform(test_text)
X_training_count_bigram = count_bigram_vectorizer.fit_transform(train_text, train_polarity)
X_testing_count_bigram = count_bigram_vectorizer.transform(test_text)
X_training_count_trigram = count_trigram_vectorizer.fit_transform(train_text, train_polarity)
X_testing_count_trigram = count_trigram_vectorizer.transform(test_text)

In [None]:
# Every perceptron learns the same target; only the feature matrix differs.
train_polarity = ds.training["polarity"]

tfidf_unigram_perceptron.fit(X_training_tfidf_unigram, train_polarity)
tfidf_bigram_perceptron.fit(X_training_tfidf_bigram, train_polarity)
tfidf_trigram_perceptron.fit(X_training_tfidf_trigram, train_polarity)

count_unigram_perceptron.fit(X_training_count_unigram, train_polarity)
count_bigram_perceptron.fit(X_training_count_bigram, train_polarity)
count_trigram_perceptron.fit(X_training_count_trigram, train_polarity)

In [None]:
from sklearn.metrics import precision_recall_fscore_support

print("Macro average values:")
# Row template for one result line; later cells reuse it under the name `s`.
s = "\t\t\tPrecision: {0}\tRecall: {1}\tF-Score: {2}"

# (section header, ((row label, fitted model, test feature matrix), ...))
sections = (
    ("\tTFIDF vectorizer:", (
        ("\t\tUnigrams:", tfidf_unigram_perceptron, X_testing_tfidf_unigram),
        ("\t\tBigrams: ", tfidf_bigram_perceptron, X_testing_tfidf_bigram),
        ("\t\tTrigrams:", tfidf_trigram_perceptron, X_testing_tfidf_trigram),
    )),
    ("\tCounter vectorizer:", (
        ("\t\tUnigrams:", count_unigram_perceptron, X_testing_count_unigram),
        ("\t\tBigrams: ", count_bigram_perceptron, X_testing_count_bigram),
        ("\t\tTrigrams:", count_trigram_perceptron, X_testing_count_trigram),
    )),
)

for position, (header, rows) in enumerate(sections):
    if position:
        # Blank line between sections, none before the first one.
        print()
    print(header)
    for label, model, features in rows:
        print(label)
        print(s.format(*precision_recall_fscore_support(ds.testing["polarity"], model.predict(features), average="macro")))

It seems that both vectorizer types behave similarly. Trigrams are clearly less accurate than unigrams or bigrams, but the difference between the latter two is harder to interpret.  
Apparently, bigrams work better with the TF-IDF vectorizer, whereas unigrams are the best choice when paired with a Count vectorizer, although the difference between the two is small (a delta of less than 2% for each vectorizer).


### Perceptron

In [None]:
def perceptron_calc(domain, it=1000):
    """Train and evaluate a polarity Perceptron on a single review domain.

    A TF-IDF vectorizer (lowercased word tokens, English stop words removed)
    is fitted on the domain's training reviews and reused for its test
    reviews. The macro-averaged precision, recall and F-score on the test
    reviews are printed with the notebook-level format string ``s``; nothing
    is returned.

    Args:
        domain: Key into ``ds.training_by_domain`` and ``ds.testing_by_domain``.
        it: Value passed to ``Perceptron(max_iter=...)``. Defaults to 1000,
            scikit-learn's own default, so the argument may be omitted.
    """
    train_frame = ds.training_by_domain[domain]
    test_frame = ds.testing_by_domain[domain]

    vec = TfidfVectorizer(lowercase=True, analyzer="word", stop_words="english")
    training = vec.fit_transform(train_frame["review_text"])
    testing = vec.transform(test_frame["review_text"])

    percep = Perceptron(shuffle=False, max_iter=it)
    percep.fit(training, train_frame["polarity"])

    # Predict once; a separate score() call whose result is thrown away would
    # only repeat the prediction pass.
    predicted = percep.predict(testing)
    print(s.format(*precision_recall_fscore_support(test_frame["polarity"], predicted, average="macro")))


#### Books

In [None]:
# Sweep the iteration cap over powers of ten, from 10**2 up to 10**9.
for exponent in range(2, 10):
    perceptron_calc('books', 10 ** exponent)

#### DVD

In [None]:
# Iteration cap passed explicitly (1000 is scikit-learn's Perceptron default).
perceptron_calc('dvd', 1000)

#### Electronics

In [None]:
# Iteration cap passed explicitly (1000 is scikit-learn's Perceptron default).
perceptron_calc('electronics', 1000)

#### Kitchen And Housewares

In [None]:
# Iteration cap passed explicitly (1000 is scikit-learn's Perceptron default).
perceptron_calc('kitchen & housewares', 1000)


### Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
vec_nb = TfidfVectorizer(lowercase=True, analyzer="word", stop_words="english")
# A vectorizer must be fitted before it can transform: learn the vocabulary on
# the training reviews, then reuse it for the test reviews.
# GaussianNB does not accept sparse matrices, hence the dense conversion; note
# that memory use grows with n_documents x vocabulary_size.
nb_train_data = vec_nb.fit_transform(ds.training['review_text']).toarray()
# Cast ratings to str, as done for the dummy rating baseline, so they are
# treated as class labels rather than floats.
nb_train_result = ds.training['rating_str'].astype(str)
books_nb_test = vec_nb.transform(ds.testing_by_domain['books']['review_text']).toarray()

gnb = GaussianNB()
pred = gnb.fit(nb_train_data, nb_train_result).predict(books_nb_test)
print(pred)


### Decision Trees

In [None]:
from sklearn import tree
# TF-IDF features fitted on the training reviews and reused for the test reviews.
vec_dt = TfidfVectorizer(lowercase=True, analyzer="word", stop_words="english")
training_dt = vec_dt.fit_transform(ds.training["review_text"])
testing_dt = vec_dt.transform(ds.testing["review_text"])

clf = tree.DecisionTreeClassifier()
clf.fit(training_dt, ds.training["polarity"])

# Predict once; a chained score() call whose result is thrown away would only
# repeat the prediction pass.
predicted_dt = clf.predict(testing_dt)
print(s.format(*precision_recall_fscore_support(ds.testing["polarity"], predicted_dt, average="macro")))

- max_depth: maximum depth of the tree (number of node levels)
    - the higher the value, the higher and closer the scores are
-

### Support Vector Machines

#### Linear

In [None]:
from sklearn import svm
# TF-IDF features fitted on the training reviews and reused for the test reviews.
vec_svm = TfidfVectorizer(lowercase=True, analyzer="word", stop_words="english")
training_svm = vec_svm.fit_transform(ds.training["review_text"])
testing_svm = vec_svm.transform(ds.testing["review_text"])

clf = svm.SVC(kernel="linear")
clf.fit(training_svm, ds.training["polarity"])

# Predict once: an extra score() call whose result is thrown away would run a
# second full prediction pass over the test set.
predicted_svm = clf.predict(testing_svm)
print(s.format(*precision_recall_fscore_support(ds.testing["polarity"], predicted_svm, average="macro")))


#### RBF

In [None]:
from sklearn import svm
# TF-IDF features fitted on the training reviews and reused for the test reviews.
vec_svm = TfidfVectorizer(lowercase=True, analyzer="word", stop_words="english")
training_svm = vec_svm.fit_transform(ds.training["review_text"])
testing_svm = vec_svm.transform(ds.testing["review_text"])

clf = svm.SVC(kernel="rbf")
clf.fit(training_svm, ds.training["polarity"])

# Predict once: an extra score() call whose result is thrown away would run a
# second full prediction pass over the test set.
predicted_svm = clf.predict(testing_svm)
print(s.format(*precision_recall_fscore_support(ds.testing["polarity"], predicted_svm, average="macro")))