In [91]:
#!pip install tensorflow_text==2.5.0.
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from scipy.stats import mode
import re
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow_hub as hub
import tensorflow_text as text

In [92]:
twitter = pd.read_csv("labeled_tweet_table.csv")
twitter = twitter[twitter["Race"].notna()]
twitter = twitter[twitter["Race"] != 5]
twitter["Race"] -= 1
twitter["Race"].value_counts()

3.0    242886
0.0     28720
1.0     17797
2.0      9885
Name: Race, dtype: int64

In [93]:
regexMap={r"<[\w'/'\s]*>": "",r"[\'\"\-]+": "",r"@[\w]+":"",r"http\S+" : ""}
def preprocess(datainput):
    t=datainput
    for regx in regexMap.keys():
        try:
            t = re.sub(regx, regexMap[regx], t)
        except TypeError:
            continue
    return t
twitter["Tweet"] = twitter["Tweet"].apply(preprocess)
twitter

Unnamed: 0,Tweet,Name,Screen Name,Description,Lang,img_path,Race,Under 21
0,"YKAR, a futuristic sans serif font by #Freeb...",Chad Boyce,djsnipa1,"Multimedia Developer, Graphic Designer, DJ, an...",,profile pics/60147.jpeg,3.0,0
1,Who can I contact about the very rude and poo...,Chad Boyce,djsnipa1,"Multimedia Developer, Graphic Designer, DJ, an...",,profile pics/60147.jpeg,3.0,0
2,I’d like to win!,Chad Boyce,djsnipa1,"Multimedia Developer, Graphic Designer, DJ, an...",,profile pics/60147.jpeg,3.0,0
3,,Chad Boyce,djsnipa1,"Multimedia Developer, Graphic Designer, DJ, an...",,profile pics/60147.jpeg,3.0,0
4,Now Im heading to B1000th Floor! #quickrogue,Chad Boyce,djsnipa1,"Multimedia Developer, Graphic Designer, DJ, an...",,profile pics/60147.jpeg,3.0,0
...,...,...,...,...,...,...,...,...
313398,Im raising money for Help Feed the Homeless. C...,Joseph Aragunde,thomasaragu,General Contractor plus construction worker go...,,profile pics/64221.jpeg,3.0,0
313399,Im raising money for Help Feed the Homeless. C...,Joseph Aragunde,thomasaragu,General Contractor plus construction worker go...,,profile pics/64221.jpeg,3.0,0
313400,Im raising money for Help Feed the Homeless. C...,Joseph Aragunde,thomasaragu,General Contractor plus construction worker go...,,profile pics/64221.jpeg,3.0,0
313401,Im raising money for Help Feed the Homeless. C...,Joseph Aragunde,thomasaragu,General Contractor plus construction worker go...,,profile pics/64221.jpeg,3.0,0


In [94]:
names = twitter["Name"].unique()
labels = []
tweet_dict = {}
for name in names:
    tweet_dict[name] = twitter[twitter["Name"] == name]["Tweet"].tolist()

In [110]:
label_dict = twitter.set_index("Name").to_dict()["Race"]

In [115]:
def split():
    np.random.shuffle(names)
    train_names = names[: int(len(names) * .8)]
    test_names = names[int(len(names) * .8) : ]
    test_user_labels = []

    #training tweets
    x_train = []
    #training labels
    y_train = []
    for train_name in train_names:
        x_train += tweet_dict[train_name]
        y_train += [label_dict[train_name]] * len(tweet_dict[train_name])

    x_train = np.array(x_train)
    y_train = np.array(y_train)

    #testing tweets
    x_test = []
    #testing labels
    y_test = []

    for test_name in test_names:
        x_test += tweet_dict[test_name]
        y_test += [label_dict[test_name]] * len(tweet_dict[test_name])
        test_user_labels.append(label_dict[test_name])

    x_test = np.array(x_test)
    y_test = np.array(y_test)
    test_user_labels = np.array(test_user_labels)
    
    return test_names, x_train, y_train, x_test, y_test, test_user_labels


In [116]:
def constructModel():
    bert_preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
    bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4")
    text_input = keras.layers.Input(shape=(), dtype=tf.string)
    preprocessed_text = bert_preprocess(text_input)
    outputs = bert_encoder(preprocessed_text)

    dense = keras.layers.Dense(4, activation='softmax')(outputs["pooled_output"])

    model = keras.Model(inputs=[text_input], outputs=[dense])
    model.compile(optimizer='adam', loss = 'sparse_categorical_crossentropy', metrics = ['accuracy'])
    return model

In [117]:
def evaluateModel(model, x_test, y_test, test_user_labels, test_names):
    raw_tweet_preds = np.argmax(model.predict(x_test), axis=1)
    overall_acc = accuracy_score(raw_tweet_preds, y_test)

    per_user_pred = []
    index = 0

    for name in test_names:
        voteRange = index + len(tweet_dict[name])
        candidates = raw_tweet_preds[index : voteRange]
        index = voteRange
        per_user_pred.append(mode(candidates)[0])
        
    aggregate_acc = accuracy_score(per_user_pred, test_user_labels)
    report = classification_report(test_user_labels, per_user_pred)
    report_dict = classification_report(test_user_labels, per_user_pred, output_dict=True)
    confusion = confusion_matrix(test_user_labels, per_user_pred, normalize='true')
    return overall_acc, aggregate_acc, report, report_dict, confusion

In [None]:
model_accs = []
report_dicts = []
confusions = []

for fold in range(1, 6):
    print("Fold %d:" % fold)
    test_names, x_train, y_train, x_test, y_test, test_user_labels = split()
    model = constructModel()
    model.fit(x_train, y_train, epochs=3)
    overall_acc, aggregate_acc, report, report_dict, confusion = evaluateModel(model, x_test, y_test, test_user_labels, test_names)
    model_accs.append(aggregate_acc)
    print("Overall Tweet Accuracy: ", overall_acc)
    print("Per User Accuracy: ", aggregate_acc)
    print()
    print("Classification Report Per User: ")
    print(report)
    print()
    print("Confusion Matrix Per User: ")
    print(confusion)
    report_dicts.append(report_dict)
    confusions.append(confusion)
    print()
    

Fold 1:
Overall Tweet Accuracy:  1.0
Per User Accuracy:  1.0

Classification Report Per User: 
              precision    recall  f1-score   support

         3.0       1.00      1.00      1.00        10

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10


Confusion Matrix Per User: 
[[1.]]

Fold 2:


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Overall Tweet Accuracy:  0.7823721436343852
Per User Accuracy:  0.8

Classification Report Per User: 
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00         1
         1.0       0.00      0.00      0.00         1
         3.0       0.80      1.00      0.89         8

    accuracy                           0.80        10
   macro avg       0.27      0.33      0.30        10
weighted avg       0.64      0.80      0.71        10


Confusion Matrix Per User: 
[[0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]]

Fold 3:

In [None]:
np.average(model_accs)

In [None]:
sum(confusions) / 5

In [None]:
for report in report_dicts:
    print(report)