In [11]:
#!pip install tensorflow_text==2.5.0.
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from scipy.stats import mode
import re
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow_hub as hub
import tensorflow_text as text
from sklearn.model_selection import KFold

In [92]:
df = pd.read_csv("labeled_tweet_table.csv")
df = df[df["Race"].notna()]
df = df[df["Race"] != 5]
df["Race"] -= 1
df["Race"].value_counts()

3.0    242886
0.0     28720
1.0     17797
2.0      9885
Name: Race, dtype: int64

In [93]:
x = df
y = df['Race']

In [94]:
x.drop(['Race'], inplace=True,axis=1)

In [96]:
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

rus = RandomUnderSampler(random_state=42, replacement=True)
x_rus, y_rus = rus.fit_resample(x, y)

print('original dataset shape:', Counter(y))
print('Resample dataset shape', Counter(y_rus))

original dataset shape: Counter({3.0: 242886, 0.0: 28720, 1.0: 17797, 2.0: 9885})
Resample dataset shape Counter({0.0: 9885, 1.0: 9885, 2.0: 9885, 3.0: 9885})


In [97]:
x_rus['Race'] = y_rus
x_rus.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39540 entries, 0 to 39539
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Tweet        39540 non-null  object 
 1   Name         39540 non-null  object 
 2   Screen Name  39538 non-null  object 
 3   Description  31986 non-null  object 
 4   Lang         0 non-null      float64
 5   img_path     39540 non-null  object 
 6   Under 21     39540 non-null  int64  
 7   Race         39540 non-null  float64
dtypes: float64(2), int64(1), object(5)
memory usage: 2.4+ MB


In [98]:
twitter = x_rus
twitter.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39540 entries, 0 to 39539
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Tweet        39540 non-null  object 
 1   Name         39540 non-null  object 
 2   Screen Name  39538 non-null  object 
 3   Description  31986 non-null  object 
 4   Lang         0 non-null      float64
 5   img_path     39540 non-null  object 
 6   Under 21     39540 non-null  int64  
 7   Race         39540 non-null  float64
dtypes: float64(2), int64(1), object(5)
memory usage: 2.4+ MB


In [99]:
regexMap={r"<[\w'/'\s]*>": "",r"[\'\"\-]+": "",r"@[\w]+":"",r"http\S+" : ""}
def preprocess(datainput):
    t=datainput
    for regx in regexMap.keys():
        try:
            t = re.sub(regx, regexMap[regx], t)
        except TypeError:
            continue
    return t
twitter["Tweet"] = twitter["Tweet"].apply(preprocess)
twitter

Unnamed: 0,Tweet,Name,Screen Name,Description,Lang,img_path,Under 21,Race
0,I have.,The Lean Show,ItsLeaninBaby,Catch me crapping on the news and wasting my b...,,profile pics/61792.jpeg,0,0.0
1,All of these “patriots” chiming in talking ab...,Angel,whoangelnever,32. Mom. Style. Horror Aficionado.,,profile pics/60770.jpeg,0,0.0
2,no really because WHAT is this lmfao,hot girl midsommar,verymimi,"gemini, screenwriter, pop culture slut, real d...",,profile pics/60277.jpeg,0,0.0
3,QueenGEe 1 Train Verse via,Queen.,Goddess__Queen,Tell em' all I love em and tell them niggas I'...,,profile pics/63107.jpeg,0,0.0
4,"Actually, it’s taking away the option for F...",JaSm,JNSmi,A Veteran who doesn't let Fear...Interfere! #S...,,profile pics/61625.jpeg,0,0.0
...,...,...,...,...,...,...,...,...
39535,A little heads up about the Europa league set...,TC,GoBlue71TC,,,profile pics/64022.jpeg,0,3.0
39536,My Hero Academia Blind Reaction Season 5 Episo...,Jessica Starks,kuramasgurl018,I'm a youtuber trying to provide entertaining ...,,profile pics/63862.jpeg,0,3.0
39537,I will!!,Too Fat,iwantwhatshehas,SW: 155 CW: let's not talk about that... GW:10...,,profile pics/64249.jpeg,0,3.0
39538,I cant log in or load the forgot password pag...,Shelby Kling,shellbellkling,"Nerd, writer, marketing pro - living and lovin...",,profile pics/63360.jpeg,0,3.0


In [100]:
twitter['Race'].value_counts()

0.0    9885
2.0    9885
3.0    9885
1.0    9885
Name: Race, dtype: int64

In [101]:
names = twitter["Name"].unique()
labels = []
tweet_dict = {}
for name in names:
    tweet_dict[name] = twitter[twitter["Name"] == name]["Tweet"].tolist()

In [102]:
label_dict = twitter.set_index("Name").to_dict()["Race"]

In [103]:
concatenated_tweets = []
labels = []
for name in names:
    concat = ''.join(str(tweet_dict[name]))
    concatenated_tweets.append(concat)
    labels.append(label_dict[name])

concat_df = pd.DataFrame({"User": np.array(names), "Tweet": concatenated_tweets, "Race": np.array(labels)})
len(names), len(concatenated_tweets), len(labels)

(2976, 2976, 2976)

In [104]:
concat_df.info()
concat_df["Race"].value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2976 entries, 0 to 2975
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   User    2976 non-null   object 
 1   Tweet   2976 non-null   object 
 2   Race    2976 non-null   float64
dtypes: float64(1), object(2)
memory usage: 69.9+ KB


3.0    2405
0.0     291
1.0     178
2.0     102
Name: Race, dtype: int64

In [105]:
def constructModel():
    bert_preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
    bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4")
    text_input = keras.layers.Input(shape=(), dtype=tf.string)
    preprocessed_text = bert_preprocess(text_input)
    outputs = bert_encoder(preprocessed_text)

    dense = keras.layers.Dense(4, activation='softmax')(outputs["pooled_output"])

    model = keras.Model(inputs=[text_input], outputs=[dense])
    model.compile(optimizer='adam', loss = 'sparse_categorical_crossentropy', metrics = ['accuracy'])
    model.summary()
    return model

model = constructModel()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            [(None,)]            0                                            
__________________________________________________________________________________________________
keras_layer_6 (KerasLayer)      {'input_type_ids': ( 0           input_4[0][0]                    
__________________________________________________________________________________________________
keras_layer_7 (KerasLayer)      {'default': (None, 7 109482241   keras_layer_6[0][0]              
                                                                 keras_layer_6[0][1]              
                                                                 keras_layer_6[0][2]              
____________________________________________________________________________________________

In [68]:
accs = []
reports = []
confusion_matrices = []

kf = KFold(n_splits = 5)
i = 1
for train, test in kf.split(concat_df):
    print("Fold %d:" % i)
    print()
    i += 1
    train_df = concat_df.iloc[train]
    test_df = concat_df.iloc[test]
    
    x_train, y_train = train_df["Tweet"], train_df["Race"]
    x_test, y_test = test_df["Tweet"], test_df["Race"]
    
    model.fit(x_train, y_train, epochs = 5)
    y_pred = np.argmax(model.predict(x_test), axis=1)
    acc = accuracy_score(y_test, y_pred)
    accs.append(acc)
    print("Accuracy: ", acc)
    print()
    report = classification_report(y_test, y_pred)
    print(report)
    print()
    reports.append(report)
    cm = confusion_matrix(y_test, y_pred, normalize='true')
    print(cm)
    print()
    confusion_matrices.append(cm)

Fold 1:

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy:  0.04194630872483222

              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00       291
         1.0       0.00      0.00      0.00       178
         2.0       0.00      0.00      0.00       102
         3.0       0.04      1.00      0.08        25

    accuracy                           0.04       596
   macro avg       0.01      0.25      0.02       596
weighted avg       0.00      0.04      0.00       596


[[0. 0. 0. 1.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]]

Fold 2:

Epoch 1/5
 1/75 [..............................] - ETA: 9s - loss: 2.5211 - accuracy: 0.7500

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy:  0.9764705882352941

              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00         0
         1.0       0.00      0.00      0.00         0
         3.0       1.00      0.98      0.99       595

    accuracy                           0.98       595
   macro avg       0.33      0.33      0.33       595
weighted avg       1.00      0.98      0.99       595


[[0.         0.         0.        ]
 [0.         0.         0.        ]
 [0.02184874 0.00168067 0.97647059]]

Fold 3:

Epoch 1/5
 1/75 [..............................] - ETA: 10s - loss: 0.6975 - accuracy: 0.7188

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy:  0.9495798319327731

              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00         0
         3.0       1.00      0.95      0.97       595

    accuracy                           0.95       595
   macro avg       0.50      0.47      0.49       595
weighted avg       1.00      0.95      0.97       595


[[0.         0.        ]
 [0.05042017 0.94957983]]

Fold 4:

Epoch 1/5
 1/75 [..............................] - ETA: 9s - loss: 0.5874 - accuracy: 0.7500

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy:  0.9663865546218487

              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00         0
         1.0       0.00      0.00      0.00         0
         3.0       1.00      0.97      0.98       595

    accuracy                           0.97       595
   macro avg       0.33      0.32      0.33       595
weighted avg       1.00      0.97      0.98       595


[[0.         0.         0.        ]
 [0.         0.         0.        ]
 [0.01176471 0.02184874 0.96638655]]

Fold 5:

Epoch 1/5
 1/75 [..............................] - ETA: 9s - loss: 0.4707 - accuracy: 0.7812

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy:  0.9966386554621849

              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00         0
         3.0       1.00      1.00      1.00       595

    accuracy                           1.00       595
   macro avg       0.50      0.50      0.50       595
weighted avg       1.00      1.00      1.00       595


[[0.         0.        ]
 [0.00336134 0.99663866]]



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [70]:
np.average(accs)

0.7862043877953866

In [72]:
sum(confusion_matrices) / 5

ValueError: operands could not be broadcast together with shapes (4,4) (3,3) 