In [None]:
# Enable IPython's autoreload extension; mode 2 re-imports all modules before
# each cell executes, so edits to the project's `functions/` package are picked
# up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [1]:
import numpy as np
import pandas as pd

from functions.load_data import load_labelled_data
from functions.preprocessing import preprocess_tweets, concat_tweet_files
from functions.tokenizer import create_keras_tokenizer, apply_keras_tokenizer, save_model_and_tokenizer, load_model_and_tokenizer
from functions.models import train_rf, confusion_matrix
from functions.data_modification import aggregate_sentiment
from functions.apply_models import apply_model


Using TensorFlow backend.


In [None]:
# Read the manually labelled tweets from disk, then run them through the
# project's preprocessing step. The raw frame keeps its own name so the
# preprocessed result is clearly a separate stage.
df_tweets_raw = load_labelled_data('data/sentiment_data_1500_manual.csv')
df_tweets = preprocess_tweets(df_tweets_raw)

In [None]:
# Build a Keras tokenizer from the labelled tweets and apply it to them.
# create_keras_tokenizer returns a (maxlen, tokenizer) pair; both are passed on
# to apply_keras_tokenizer.
tokenizer_setup = create_keras_tokenizer(df_tweets)
maxlen, tokenizer = tokenizer_setup
tokenized_tweets = apply_keras_tokenizer(df_tweets, tokenizer, maxlen)

In [None]:
# Train a random forest on the tokenized, labelled tweets.
# train_rf hands back the accuracy, the fitted model and the train/test splits.
(
    acc,
    forest_mod,
    x_train,
    x_test,
    y_train,
    y_test,
) = train_rf(
    df_tweets,
    tokenized_tweets,
    test_size=0.25,
    n_estimators=1000,
)

In [None]:
# Save the trained random forest and its tokenizer to models_and_tokenizers/
save_model_and_tokenizer(
    'models_and_tokenizers/randomforest.sav',
    'models_and_tokenizers/tokenizer_1500_man.sav',
    forest_mod,  # the model returned by train_rf; `mod` was never defined (NameError on a fresh kernel)
    tokenizer,
)

In [None]:
# Create confusion matrix: compare the forest's predictions on x_test with the
# true labels in y_test.
my_pred = forest_mod.predict(x_test)
my_ground_truth = y_test.values

# NOTE(review): the trailing 3 is presumably the number of sentiment classes —
# confirm against functions.models.confusion_matrix.
conf_mat = confusion_matrix(my_pred, my_ground_truth, 3)


# How to read the matrix (Lab = true label on the rows, Pred on the columns).
# NOTE(review): this orientation is taken from the original sketch; verify that
# functions.models.confusion_matrix really lays the matrix out this way.
#
#           Pred
#         A  B  C
# L    A aa ab ac
# a    B ba bb bc
# b    C ca cb cc
#
# Seen from class A:
# class A: - aa:                True Positives
#          - ab + ac:           False Negatives (label A, predicted B or C)
#          - ba + ca:           False Positives (label B or C, predicted A)
#          - bb + bc + cb + cc: True Negatives  (every cell outside row A and column A)

In [None]:
# Test the spread: how often each class occurs among the predictions and among
# the ground-truth labels.
unique, counts = np.unique(my_pred, return_counts=True)
print('Prediction: ', unique, counts)

unique2, counts2 = np.unique(my_ground_truth, return_counts=True)
print('Ground Truth: ', unique2, counts2)

In [None]:
# In case you need to reload the model and tokenizer from models_and_tokenizers/.
# The tokenizer file must be the one written by the save cell
# (tokenizer_1500_man.sav). The previous path, tokenizer_900_man.sav, pointed at
# a different tokenizer whose encoding would not match what this random forest
# was trained on.
my_model, my_tokenizer = load_model_and_tokenizer(
    'models_and_tokenizers/randomforest.sav',
    'models_and_tokenizers/tokenizer_1500_man.sav',
)