# Machine Learning for NLP

### 1. Parsing and saving the data

In [None]:
from src.dataset_parser import parse_data_to_csv

# Directories handed to parse_data_to_csv: the raw corpus location and the
# location of the parsed output (presumably CSV files, going by the function name).
raw_path, parsed_path = "./original_data", "./parsed_data"

# WARNING: keep the call below commented out. Run it only if the parsed data
# has been lost, or after pointing parsed_path at a different directory.
# parse_data_to_csv(raw_path, parsed_path)

### 2. Loading the parsed data

In [None]:
from sklearn.dummy import DummyClassifier

from src.dataset import DataSet

# Build the dataset wrapper from the parsed-data directory; the following
# cells read its `training` and `testing` tables.
ds = DataSet(parsed_path)

# One majority-class baseline per prediction target (domain, polarity, rating).
# Three separate instances are created so each can be fitted on its own labels.
dummy_domain, dummy_polarity, dummy_rating_str = (
    DummyClassifier(strategy="most_frequent") for _ in range(3)
)

In [None]:
# Debugging aid: uncomment to inspect the training table and its column dtypes.
# print(ds.training, ds.training.dtypes)

# Fit every baseline on the same review texts, each against its own label
# column. DummyClassifier ignores the feature values, so the raw text is
# passed as X without any vectorisation.
dummy_domain.fit(X=ds.training["review_text"], y=ds.training["domain"])
dummy_polarity.fit(X=ds.training["review_text"], y=ds.training["polarity"])
# The rating labels are cast to str explicitly: according to the original
# author's observation the column otherwise ends up as float again.
dummy_rating_str.fit(
    X=ds.training["review_text"],
    y=ds.training["rating_str"].astype(str),
)

In [None]:
# Print the labels each baseline predicts for the test reviews, one result per
# line, in the order: domain, polarity, rating.
print(
    dummy_domain.predict(ds.testing["review_text"]),
    dummy_polarity.predict(ds.testing["review_text"]),
    dummy_rating_str.predict(ds.testing["review_text"]),
    sep="\n",
)

In [None]:
# Print each baseline's score on the test split, one value per line, in the
# order: domain, polarity, rating. These are the numbers any real model should
# beat. The rating labels get the same str cast that was used when fitting.
print(
    dummy_domain.score(ds.testing["review_text"], ds.testing["domain"]),
    dummy_polarity.score(ds.testing["review_text"], ds.testing["polarity"]),
    dummy_rating_str.score(
        ds.testing["review_text"],
        ds.testing["rating_str"].astype(str),
    ),
    sep="\n",
)

In [None]:
from copy import deepcopy
from sklearn.feature_extraction.text import TfidfVectorizer

# Unfitted template holding the TF-IDF settings. Working vectorizers are
# deep-copied from it, so the parameters only ever need changing in this one
# place.
base_vectorizer = TfidfVectorizer(
    stop_words="english",
    analyzer="word",
    lowercase=True,
)
vectorizer = deepcopy(base_vectorizer)

# Learn the vocabulary and IDF weights from the training reviews only, then
# reuse that fitted state to encode the test reviews.
X_training = vectorizer.fit_transform(ds.training["review_text"])
X_testing = vectorizer.transform(ds.testing["review_text"])

# Report: training matrix shape, vectorizer configuration and stop-word list,
# then the test matrix shape and contents.
print(X_training.shape)
print(vectorizer.get_params(), vectorizer.get_stop_words(), sep='\n')
print(X_testing.shape)
print(X_testing)

In [None]:
from sklearn.linear_model import Perceptron

# Train a default-parameter Perceptron on the TF-IDF features to predict
# polarity. fit() returns the fitted estimator, so it is bound directly.
perceptron = Perceptron().fit(X_training, ds.training["polarity"])

# Score on the test split; left as the final bare expression so that a
# notebook displays the value as the cell output.
perceptron.score(X_testing, ds.testing["polarity"])