## Define local constants
Change these constants based on your needs.

In [49]:
import numpy

# Input data path
INPUT_TRAINING_FILE = "../data/preprocessed_data/training_dataset.csv"
# Evaluation dataset should always stay the same
INPUT_EVALUATION_FILE = "../data/preprocessed_data/evaluation_dataset.csv"

# Output parameters
METHOD_NAME = "tf_idf_multinomial_nb"
PREPROCESSOR_NAME = "baseline_and_bayess_specific"
OUTPUT_MODEL = f"../data/models/{METHOD_NAME}_model.pkl"
OUTPUT_RESULTS = f"../data/results/{METHOD_NAME}_model.txt"

# Hyper parameter alternatives
HYPER_PARAMETER_MIN_DF = list(range(0, 100, 5))
HYPER_PARAMETER_MAX_DF = list(numpy.arange(0.02, 0.12, 0.02))
HYPER_PARAMETER_ALPHA = list(numpy.arange(0.1, 1.1, 0.1))

# Hyper parameter optimization parameters
HYPER_PARAMETER_OPTIMIZATION_SCORING = "accuracy"
HYPER_PARAMETER_OPTIMIZATION_CV = 5

# Other constants
VERBOSITY = 3
LABELS = ["negative", "positive"]

## Import necessary libraries for your machine learning method

In [50]:
import pickle
import yaml
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

## Load the datasets
Note that the preprocessed data should contain at least the following fields:
[prep_text],[sentiment]

Loading training and testing datasets.

In [51]:
training_dataset = pd.read_csv(INPUT_TRAINING_FILE, engine="python", delimiter=",")
training_tweets = training_dataset["prep_text"].apply(
    lambda tweet: str(tweet))
training_sentiment_targets = training_dataset["sentiment"].apply(
    lambda sentiment: int(sentiment))

evaluation_dataset = pd.read_csv(INPUT_EVALUATION_FILE, engine="python", delimiter=",")
evaluation_tweets = evaluation_dataset["prep_text"].apply(
    lambda tweet: str(tweet))
evaluation_sentiment_targets = evaluation_dataset["sentiment"].apply(
    lambda sentiment: int(sentiment))

## Define the preprocessor and do some preprocessing for the training dataset
Preprocessing part should only include conversion techniques that are required by the algorithm. General preprocessing should be done in the separate file.

In [52]:
# Remove very short tweets from the training dataset
mask = training_tweets.str.len() > 30
training_tweets = training_tweets[mask]
training_sentiment_targets = training_sentiment_targets[mask]

# Define the count vectorizer with certain sanity limits
preprocessor = TfidfVectorizer()

## Define rest of the pipeline
Definition should include splitting of the data using cross validator and hyper parameter optimization.

In [53]:
best_estimator = None
best_accuracy = 0
for MIN_DF in HYPER_PARAMETER_MIN_DF:

    # Create internal pipeline
    classifier = MultinomialNB()
    pipeline = Pipeline(steps=[("preprocessing", preprocessor), ("classification", classifier)])

    # Specify the tunable hyper parameters
    parameters = {
        "preprocessing__min_df": [MIN_DF],
        "preprocessing__max_df": HYPER_PARAMETER_MAX_DF,
        "classification__alpha": HYPER_PARAMETER_ALPHA
    }

    # Define KFold parameters
    cv = StratifiedKFold(n_splits=HYPER_PARAMETER_OPTIMIZATION_CV, shuffle=True, random_state=42)

    estimator = GridSearchCV(pipeline, parameters,
        scoring=HYPER_PARAMETER_OPTIMIZATION_SCORING, cv=cv, n_jobs=-1, verbose=VERBOSITY)

    estimator.fit(training_tweets, training_sentiment_targets)

    training_estimates = estimator.predict(training_tweets)
    training_accuracy = accuracy_score(training_sentiment_targets, training_estimates)

    if best_accuracy < training_accuracy:
        best_accuracy = training_accuracy
        best_estimator = estimator

estimator = best_estimator

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV 1/5] END classification__alpha=0.1, preprocessing__max_df=0.02, preprocessing__min_df=0;, score=0.682 total time=  54.5s
[CV 2/5] END classification__alpha=0.1, preprocessing__max_df=0.02, preprocessing__min_df=0;, score=0.682 total time=  55.0s
[CV 3/5] END classification__alpha=0.1, preprocessing__max_df=0.02, preprocessing__min_df=0;, score=0.682 total time=  57.3s
[CV 4/5] END classification__alpha=0.1, preprocessing__max_df=0.02, preprocessing__min_df=0;, score=0.682 total time= 1.0min
[CV 3/5] END classification__alpha=0.1, preprocessing__max_df=0.04, preprocessing__min_df=0;, score=0.686 total time=  56.7s
[CV 2/5] END classification__alpha=0.1, preprocessing__max_df=0.04, preprocessing__min_df=0;, score=0.685 total time=  58.4s
[CV 5/5] END classification__alpha=0.1, preprocessing__max_df=0.02, preprocessing__min_df=0;, score=0.684 total time= 1.0min
[CV 1/5] END classification__alpha=0.1, preprocessing__max_df=0

## Calculate metric values

In [54]:
# Use all training data to calculate confusion matrix for training data
training_estimates = estimator.predict(training_tweets)
training_accuracy = accuracy_score(training_sentiment_targets, training_estimates)
training_confusion_matrix = confusion_matrix(training_sentiment_targets, training_estimates)
training_classification_report = classification_report(training_sentiment_targets, training_estimates, output_dict=True, target_names=LABELS)

# Use model to estimate manually labeled evaluation Tweets
evaluation_estimates = estimator.predict(evaluation_tweets)
evaluation_accuracy = accuracy_score(evaluation_sentiment_targets, evaluation_estimates)
evaluation_confusion_matrix = confusion_matrix(evaluation_sentiment_targets, evaluation_estimates)
evaluation_classification_report = classification_report(evaluation_sentiment_targets, evaluation_estimates, output_dict=True, target_names=LABELS)

## Save trained model for future reference

In [55]:
with open(OUTPUT_MODEL, "wb") as handle:
    pickle.dump(estimator, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Save result statistics
These should be always saved in the same fashion, so the results can be compared between different models.

In [56]:
# Dictionary object, where results will be accumulated
result_dict= {}

# Metadata section
metadata_dict = {}
metadata_dict["preprocessor_name"] = PREPROCESSOR_NAME
metadata_dict["method_name"] = METHOD_NAME
metadata_dict["estimator_name"] = str(classifier)
result_dict["metadata"] = metadata_dict

# Hyper parameter optimization values
hyper_parameter_optimization_dict = {}
hyper_parameter_optimization_dict["scoring"] = HYPER_PARAMETER_OPTIMIZATION_SCORING
hyper_parameter_optimization_dict["cv"] = HYPER_PARAMETER_OPTIMIZATION_CV

parameter_optimization_dict = {}
parameter_optimization_dict["hyper_parameter_optimization"] = hyper_parameter_optimization_dict
metadata_dict["parameter_optimization"] = parameter_optimization_dict

# Save best parameters
result_dict["best_parameters"] = estimator.best_params_

# Different kind of scores
scores_dict = {}

training_scores_dict = {}
training_scores_dict["accuracy"] = float(training_accuracy)
training_confusion_matrix_dict = {}
training_confusion_matrix_dict["true_negative"] = int(training_confusion_matrix[0][0])
training_confusion_matrix_dict["true_positive"] = int(training_confusion_matrix[1][1])
training_confusion_matrix_dict["false_negative"] = int(training_confusion_matrix[0][1])
training_confusion_matrix_dict["false_positive"] = int(training_confusion_matrix[1][0])
training_scores_dict["confusion_matrix"] = training_confusion_matrix_dict
training_scores_dict["classification_report"] = training_classification_report
scores_dict["training_scores"] = training_scores_dict

evaluation_scores_dict = {}
evaluation_scores_dict["accuracy"] = float(evaluation_accuracy)
evaluation_confusion_matrix_dict = {}
evaluation_confusion_matrix_dict["true_negative"] = int(evaluation_confusion_matrix[0][0])
evaluation_confusion_matrix_dict["true_positive"] = int(evaluation_confusion_matrix[1][1])
evaluation_confusion_matrix_dict["false_negative"] = int(evaluation_confusion_matrix[0][1])
evaluation_confusion_matrix_dict["false_positive"] = int(evaluation_confusion_matrix[1][0])
evaluation_scores_dict["confusion_matrix"] = evaluation_confusion_matrix_dict
evaluation_scores_dict["classification_report"] = evaluation_classification_report
scores_dict["evaluation_scores"] = evaluation_scores_dict

result_dict["scores"] = scores_dict

# Convert statistics to pretty YAML
results = yaml.dump(result_dict)

# Print results
print(results)

# Save results to the file
with open(OUTPUT_RESULTS, "w") as file:
    file.write(results)

best_parameters:
  classification__alpha: !!python/object/apply:numpy.core.multiarray.scalar
  - &id001 !!python/object/apply:numpy.dtype
    args:
    - f8
    - false
    - true
    state: !!python/tuple
    - 3
    - <
    - null
    - null
    - null
    - -1
    - -1
    - 0
  - !!binary |
    AAAAAAAA8D8=
  preprocessing__max_df: !!python/object/apply:numpy.core.multiarray.scalar
  - *id001
  - !!binary |
    exSuR+F6tD8=
  preprocessing__min_df: 0
metadata:
  estimator_name: MultinomialNB()
  method_name: tf_idf_multinomial_nb
  parameter_optimization:
    hyper_parameter_optimization:
      cv: 5
      scoring: accuracy
  preprocessor_name: baseline_and_bayess_specific
scores:
  evaluation_scores:
    accuracy: 0.8442211055276382
    classification_report:
      accuracy: 0.8442211055276382
      macro avg:
        f1-score: 0.8441581407098648
        precision: 0.8450404858299595
        recall: 0.8443434343434344
        support: 199
      negative:
        f1-score: 0.847290