## Define local constants
Change these constants based on your needs.

In [65]:
# Input data path
INPUT_TRAINING_FILE = "../data/preprocessed_data/tweets_combined_preprocessed_1190k.csv"
# Evaluation dataset should always stay the same
INPUT_EVALUATION_FILE = "../data/preprocessed_data/verification_dataset.csv"

# Output parameters
METHOD_NAME = "baseline"
PREPROCESSOR_NAME = "baseline"
OUTPUT_MODEL = f"../data/models/{METHOD_NAME}_model.pkl"
OUTPUT_RESULTS = f"../data/results/{METHOD_NAME}_model.txt"

# Hyper parameter alternatives
HYPER_PARAMETER_KERNEL = ["rfb"]
HYPER_PARAMETER_GAMMA = ["scale"]

# Hyper parameter optimization parameters
HYPER_PARAMETER_OPTIMIZATION_SCORING = "accuracy"
HYPER_PARAMETER_OPTIMIZATION_CV = 2

# Cross validation parameters
CROSS_VALIDATION_SCORING = "accuracy"
CROSS_VALIDATION_CV = 5

# Other constants
LABELS = ["negative", "positive"]

## Import necessary libraries for your machine learning method

In [66]:
import pickle
import yaml
import pandas as pd
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

## Load the datasets
Note that the preprocessed data should contain at least the following fields:
[prep_text],[sentiment]

Loading training and testing datasets.

In [67]:
training_dataset = pd.read_csv(INPUT_TRAINING_FILE, engine="python", delimiter=",")
training_tweets = training_dataset["prep_text"].apply(
    lambda tweet: str(tweet))
training_sentiment_targets = training_dataset["sentiment"].apply(
    lambda sentiment: int(sentiment))

evaluation_dataset = pd.read_csv(INPUT_EVALUATION_FILE, engine="python", delimiter=",")
evaluation_tweets = evaluation_dataset["prep_text"].apply(
    lambda tweet: str(tweet))
evaluation_sentiment_targets = evaluation_dataset["sentiment"].apply(
    lambda sentiment: int(sentiment))

## Define the preprocessor
Preprocessing part should only include conversion techniques that are required by the algorithm. General preprocessing should be done in the separate file.

In [68]:
preprocessor = CountVectorizer()

## Define rest of the pipeline
Definition should include splitting of the data using cross validator and hyper parameter optimization.

In [69]:
# Create internal pipeline
classifier = svm.SVC()
pipeline = Pipeline(steps=[("preprocessing", preprocessor), ("classification", classifier)])

# Specify the tunable hyper parameters
parameters = {
    "classification__kernel": HYPER_PARAMETER_KERNEL,
    "classification__gamma": HYPER_PARAMETER_GAMMA
}

# Define KFold parameters
cv = StratifiedKFold(n_splits=HYPER_PARAMETER_OPTIMIZATION_CV, shuffle=True, random_state=42)

estimator = GridSearchCV(pipeline, parameters,
    scoring=HYPER_PARAMETER_OPTIMIZATION_SCORING, cv=cv)

## Do training

In [70]:
training_accuracy = cross_val_score(estimator, training_tweets, training_sentiment_targets,
    scoring=CROSS_VALIDATION_SCORING, cv=CROSS_VALIDATION_CV, n_jobs=-1)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

## Calculate metric values

In [71]:
# Use all training data to calculate confusion matrix for training data
training_estimates = estimator.predict(training_tweets)
training_confusion_matrix = confusion_matrix(training_sentiment_targets, training_estimates)
training_classification_report = classification_report(training_sentiment_targets, training_estimates, target_names=LABELS)

# Use model to estimate manually labeled evaluation Tweets
evaluation_estimates = estimator.predict(evaluation_tweets)
evaluation_accuracy = accuracy_score(evaluation_sentiment_targets, evaluation_estimates)
evaluation_confusion_matrix = confusion_matrix(evaluation_sentiment_targets, evaluation_estimates)
evaluation_classification_report = classification_report(evaluation_sentiment_targets, evaluation_estimates, target_names=LABELS)

NotFittedError: This GridSearchCV instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

## Save trained model for future reference

In [None]:
with open(OUTPUT_MODEL, "wb") as handle:
    pickle.dump(estimator, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Save result statistics
These should be always saved in the same fashion, so the results can be compared between different models.

In [None]:
# Dictionary object, where results will be accumulated
result_dict= {}

# Metadata section
metadata_dict = {}
metadata_dict["preprocessor_name"] = PREPROCESSOR_NAME
metadata_dict["method_name"] = METHOD_NAME
metadata_dict["estimator_name"] = classifier.__name__
result_dict["metadata"] = metadata_dict

# Hyper parameter optimization values
hyper_parameter_optimization_dict = {}
hyper_parameter_optimization_dict["scoring"] = HYPER_PARAMETER_OPTIMIZATION_SCORING
hyper_parameter_optimization_dict["cv"] = HYPER_PARAMETER_OPTIMIZATION_CV

# Cross validation parameters
cross_validation_dict = {}
cross_validation_dict["scoring"] = CROSS_VALIDATION_SCORING
cross_validation_dict["cv"] = CROSS_VALIDATION_CV

parameter_optimization_dict = {}
parameter_optimization_dict["hyper_parameter_optimization"] = hyper_parameter_optimization_dict
parameter_optimization_dict["cross_validation_parameters"] = cross_validation_dict
metadata_dict["parameter_optimization"] = parameter_optimization_dict

# Combine parameters under parameters dict
parameters_dict = {}
parameters_dict["hyper_parameter_optimization_values"] = hyper_parameter_optimization_dict
parameters_dict["cross_validation_parameters"] =  cross_validation_dict
parameters_dict["selected_hyper_parameters"] = estimator.best_params_
result_dict["parameters"] = parameters_dict

# Different kind of scores
scores_dict = {}

training_scores_dict = {}
training_scores_dict["accuracy"] = training_accuracy
training_confusion_matrix_dict = {}
training_confusion_matrix_dict["true_negative"] = training_confusion_matrix[1][0]
training_confusion_matrix_dict["true_positive"] = training_confusion_matrix[1][1]
training_confusion_matrix_dict["false_negative"] = training_confusion_matrix[0][0]
training_confusion_matrix_dict["false_positive"] = training_confusion_matrix[0][1]
training_scores_dict["confusion_matrix"] = training_confusion_matrix_dict
training_scores_dict["classification_report"] = str(training_classification_report)
scores_dict["training_scores"] = training_scores_dict

evaluation_scores_dict = {}
evaluation_scores_dict["accuracy"] = training_accuracy
evaluation_confusion_matrix_dict = {}
evaluation_confusion_matrix_dict["true_negative"] = evaluation_confusion_matrix[1][0]
evaluation_confusion_matrix_dict["true_positive"] = evaluation_confusion_matrix[1][1]
evaluation_confusion_matrix_dict["false_negative"] = evaluation_confusion_matrix[0][0]
evaluation_confusion_matrix_dict["false_positive"] = evaluation_confusion_matrix[0][1]
evaluation_scores_dict["confusion_matrix"] = evaluation_confusion_matrix_dict
training_scores_dict["classification_report"] = str(evaluation_classification_report)
scores_dict["evaluation_scores"] = evaluation_scores_dict

result_dict["scores"] = scores_dict

# Convert statistics to pretty YAML
results = yaml.dump(result_dict)

# Print results
print(results)

# Save results to the file
with open(OUTPUT_RESULTS, "r") as file:
    file.write(results)