## Define local constants
Change these constants based on your needs.

In [75]:
import numpy

# Input data path
INPUT_TRAINING_FILE = "../data/preprocessed_data/training_dataset.csv"
# Evaluation dataset should always stay the same
INPUT_EVALUATION_FILE = "../data/preprocessed_data/evaluation_dataset.csv"

# Output parameters
METHOD_NAME = "tf_idf_multinomial_nb"
PREPROCESSOR_NAME = "baseline_and_bayess_specific"
OUTPUT_MODEL = f"../data/models/{METHOD_NAME}_model.pkl"
OUTPUT_RESULTS = f"../data/results/{METHOD_NAME}_model.txt"

# Hyper parameter alternatives
# An integer min_df is an absolute document count and must be at least 1:
# recent scikit-learn releases reject 0, and on older releases 0 filtered
# nothing, exactly like 1. The grid therefore starts at 1 instead of 0.
HYPER_PARAMETER_MIN_DF = [1, *range(10, 100, 10)]
# Store max_df as plain, rounded Python floats. numpy.arange yields
# numpy.float64 values with rounding noise, which end up in best_params_
# and cannot be written as plain numbers by yaml.dump.
HYPER_PARAMETER_MAX_DF = [
    round(float(value), 2) for value in numpy.arange(0.01, 0.10, 0.02)
]
HYPER_PARAMETER_MAX_FEATURES = list(range(5000, 100000, 5000))
HYPER_PARAMETER_BINARY = [False, True]
HYPER_PARAMETER_USE_IDF = [False, True]

# Hyper parameter optimization parameters
HYPER_PARAMETER_OPTIMIZATION_SCORING = "accuracy"
HYPER_PARAMETER_OPTIMIZATION_CV = 5

# Other constants
# Display names for the sentiment classes, passed to classification_report
LABELS = ["negative", "positive"]

## Import necessary libraries for your machine learning method

In [76]:
import pickle
import yaml
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

## Load the datasets
Note that the preprocessed data should contain at least the following fields:
[prep_text],[sentiment]

Loading the training and evaluation datasets.

In [77]:
def _load_tweets_and_targets(csv_path):
    """Read one preprocessed CSV file.

    Returns the full data frame, the "prep_text" column converted to str
    and the "sentiment" column converted to int.
    """
    dataset = pd.read_csv(csv_path, engine="python", delimiter=",")
    tweets = dataset["prep_text"].apply(str)
    sentiment_targets = dataset["sentiment"].apply(int)
    return dataset, tweets, sentiment_targets


# Training data
(
    training_dataset,
    training_tweets,
    training_sentiment_targets,
) = _load_tweets_and_targets(INPUT_TRAINING_FILE)

# Evaluation data
(
    evaluation_dataset,
    evaluation_tweets,
    evaluation_sentiment_targets,
) = _load_tweets_and_targets(INPUT_EVALUATION_FILE)

## Define the preprocessor and do some preprocessing for the training dataset
The preprocessing part should only include the conversion techniques required by the algorithm. General preprocessing should be done in a separate file.

In [78]:
# Keep only the training tweets longer than 30 characters, and the
# sentiment targets that belong to them
mask = training_tweets.str.len().gt(30)
training_tweets, training_sentiment_targets = (
    training_tweets[mask],
    training_sentiment_targets[mask],
)

# TF-IDF vectorizer created with default settings; the grid search
# overrides its min_df, max_df, max_features, binary and use_idf options
preprocessor = TfidfVectorizer()

## Define rest of the pipeline
The definition should include splitting the data with a cross-validator and hyperparameter optimization.

In [79]:
# Chain the vectorizer and the Naive Bayes classifier into one estimator
classifier = MultinomialNB()
pipeline = Pipeline(
    steps=[
        ("preprocessing", preprocessor),
        ("classification", classifier),
    ]
)

# Vectorizer settings explored by the grid search; the "preprocessing__"
# prefix routes each value to the pipeline step of that name
parameters = {
    "preprocessing__min_df": HYPER_PARAMETER_MIN_DF,
    "preprocessing__max_df": HYPER_PARAMETER_MAX_DF,
    "preprocessing__max_features": HYPER_PARAMETER_MAX_FEATURES,
    "preprocessing__binary": HYPER_PARAMETER_BINARY,
    "preprocessing__use_idf": HYPER_PARAMETER_USE_IDF,
}

# Shuffled, stratified folds with a fixed seed so the splits are repeatable
cv = StratifiedKFold(
    n_splits=HYPER_PARAMETER_OPTIMIZATION_CV,
    shuffle=True,
    random_state=42,
)

# Exhaustive search over the grid, using all available CPU cores
estimator = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    scoring=HYPER_PARAMETER_OPTIMIZATION_SCORING,
    cv=cv,
    n_jobs=-1,
)

## Do training

In [80]:
# Run the grid search on the filtered training tweets
estimator.fit(training_tweets, y=training_sentiment_targets)

KeyboardInterrupt: 

## Calculate metric values

In [None]:
def _evaluate_predictions(tweets, sentiment_targets):
    """Predict sentiments for tweets and score them against the targets.

    Returns the estimates, the accuracy, the confusion matrix and the
    classification report (as a dictionary), in that order.
    """
    estimates = estimator.predict(tweets)
    accuracy = accuracy_score(sentiment_targets, estimates)
    matrix = confusion_matrix(sentiment_targets, estimates)
    report = classification_report(
        sentiment_targets,
        estimates,
        target_names=LABELS,
        output_dict=True,
    )
    return estimates, accuracy, matrix, report


# Metrics on the data the model was trained on
(
    training_estimates,
    training_accuracy,
    training_confusion_matrix,
    training_classification_report,
) = _evaluate_predictions(training_tweets, training_sentiment_targets)

# Metrics on the manually labeled evaluation tweets
(
    evaluation_estimates,
    evaluation_accuracy,
    evaluation_confusion_matrix,
    evaluation_classification_report,
) = _evaluate_predictions(evaluation_tweets, evaluation_sentiment_targets)

## Save trained model for future reference

In [None]:
# Pickle the whole fitted grid search object with the newest protocol
with open(OUTPUT_MODEL, mode="wb") as model_file:
    pickle.dump(estimator, model_file, protocol=pickle.HIGHEST_PROTOCOL)

## Save result statistics
These should always be saved in the same format, so that the results can be compared between different models.

In [None]:
def _to_builtin(value):
    """Return NumPy scalars as plain Python values; pass anything else through.

    yaml.dump cannot write NumPy scalars as plain numbers, so grid values
    that came from NumPy are converted before serialisation.
    """
    if isinstance(value, numpy.generic):
        return value.item()
    return value


def _confusion_matrix_to_dict(matrix):
    """Name the four cells of a binary confusion matrix.

    scikit-learn puts the true classes on the rows and the predicted
    classes on the columns. Cell [0][1] therefore counts true negatives
    predicted as positive (false positives) and cell [1][0] counts true
    positives predicted as negative (false negatives).
    NOTE(review): assumes the negative class sorts before the positive one,
    as the order of LABELS suggests - confirm against the dataset labels.
    """
    return {
        "true_negative": int(matrix[0][0]),
        "true_positive": int(matrix[1][1]),
        "false_negative": int(matrix[1][0]),
        "false_positive": int(matrix[0][1]),
    }


# Dictionary object, where results will be accumulated
result_dict = {}

# Metadata section
metadata_dict = {
    "preprocessor_name": PREPROCESSOR_NAME,
    "method_name": METHOD_NAME,
    "estimator_name": str(classifier),
}
result_dict["metadata"] = metadata_dict

# Hyper parameter optimization values
hyper_parameter_optimization_dict = {
    "scoring": HYPER_PARAMETER_OPTIMIZATION_SCORING,
    "cv": HYPER_PARAMETER_OPTIMIZATION_CV,
}
parameter_optimization_dict = {
    "hyper_parameter_optimization": hyper_parameter_optimization_dict,
}
metadata_dict["parameter_optimization"] = parameter_optimization_dict

# Save best parameters as plain Python values so the YAML stays readable
result_dict["best_parameters"] = {
    name: _to_builtin(value) for name, value in estimator.best_params_.items()
}

# Different kind of scores
training_confusion_matrix_dict = _confusion_matrix_to_dict(training_confusion_matrix)
training_scores_dict = {
    "accuracy": float(training_accuracy),
    "confusion_matrix": training_confusion_matrix_dict,
    "classification_report": training_classification_report,
}

evaluation_confusion_matrix_dict = _confusion_matrix_to_dict(evaluation_confusion_matrix)
evaluation_scores_dict = {
    "accuracy": float(evaluation_accuracy),
    "confusion_matrix": evaluation_confusion_matrix_dict,
    "classification_report": evaluation_classification_report,
}

scores_dict = {
    "training_scores": training_scores_dict,
    "evaluation_scores": evaluation_scores_dict,
}
result_dict["scores"] = scores_dict

# Convert statistics to pretty YAML
results = yaml.dump(result_dict)

# Print results
print(results)

# Save results to the file; the explicit encoding keeps the output
# independent of the platform default
with open(OUTPUT_RESULTS, "w", encoding="utf-8") as file:
    file.write(results)

metadata:
  estimator_name: MultinomialNB()
  method_name: naive_bayes
  parameter_optimization:
    hyper_parameter_optimization: &id001
      cv: 2
      scoring: accuracy
  preprocessor_name: baseline_and_bayess_specific
parameters:
  hyper_parameter_optimization_values: *id001
  selected_hyper_parameters:
    classification__fit_prior: true
scores:
  evaluation_scores:
    accuracy: 0.8793969849246231
    classification_report:
      accuracy: 0.8793969849246231
      macro avg:
        f1-score: 0.8793939393939394
        precision: 0.8795211153768439
        recall: 0.8794444444444445
        support: 199
      negative:
        f1-score: 0.88
        precision: 0.8712871287128713
        recall: 0.8888888888888888
        support: 99
      positive:
        f1-score: 0.8787878787878789
        precision: 0.8877551020408163
        recall: 0.87
        support: 100
      weighted avg:
        f1-score: 0.8793908938632556
        precision: 0.8795624921942506
        recall: 0.879