In [314]:
#!pip install tensorflow_text==2.5.0.
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from scipy.stats import mode
import re
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow_hub as hub
import tensorflow_text as text
from sklearn.model_selection import KFold

In [318]:
twitter = pd.read_csv("labeled_tweet_table.csv")
twitter = twitter[twitter["Race"].notna() & twitter["Tweet"].notna()]
twitter = twitter[twitter["Race"] != 5]
twitter["Race"] -= 1
twitter["Race"].value_counts()

3.0    242886
0.0     28719
1.0     17797
2.0      9885
Name: Race, dtype: int64

In [319]:
regexMap={r"<[\w'/'\s]*>": "",r"[\'\"\-]+": "",r"@[\w]+":"",r"http\S+" : ""}
def preprocess(datainput):
    t=datainput
    for regx in regexMap.keys():
        try:
            t = re.sub(regx, regexMap[regx], t)
        except TypeError:
            continue
    return t
twitter["Tweet"] = twitter["Tweet"].apply(preprocess)
twitter

Unnamed: 0,Tweet,Name,Screen Name,Description,Lang,img_path,Race,Under 21
0,"YKAR, a futuristic sans serif font by #Freeb...",Chad Boyce,djsnipa1,"Multimedia Developer, Graphic Designer, DJ, an...",,profile pics/60147.jpeg,3.0,0
1,Who can I contact about the very rude and poo...,Chad Boyce,djsnipa1,"Multimedia Developer, Graphic Designer, DJ, an...",,profile pics/60147.jpeg,3.0,0
2,I’d like to win!,Chad Boyce,djsnipa1,"Multimedia Developer, Graphic Designer, DJ, an...",,profile pics/60147.jpeg,3.0,0
3,,Chad Boyce,djsnipa1,"Multimedia Developer, Graphic Designer, DJ, an...",,profile pics/60147.jpeg,3.0,0
4,Now Im heading to B1000th Floor! #quickrogue,Chad Boyce,djsnipa1,"Multimedia Developer, Graphic Designer, DJ, an...",,profile pics/60147.jpeg,3.0,0
...,...,...,...,...,...,...,...,...
313398,Im raising money for Help Feed the Homeless. C...,Joseph Aragunde,thomasaragu,General Contractor plus construction worker go...,,profile pics/64221.jpeg,3.0,0
313399,Im raising money for Help Feed the Homeless. C...,Joseph Aragunde,thomasaragu,General Contractor plus construction worker go...,,profile pics/64221.jpeg,3.0,0
313400,Im raising money for Help Feed the Homeless. C...,Joseph Aragunde,thomasaragu,General Contractor plus construction worker go...,,profile pics/64221.jpeg,3.0,0
313401,Im raising money for Help Feed the Homeless. C...,Joseph Aragunde,thomasaragu,General Contractor plus construction worker go...,,profile pics/64221.jpeg,3.0,0


In [320]:
names = twitter["Name"].unique()
labels = []
tweet_dict = {}
for name in names:
    tweet_dict[name] = twitter[twitter["Name"] == name]["Tweet"].tolist()

In [332]:
label_dict = twitter.set_index("Name").to_dict()["Race"]

In [333]:
concatenated_tweets = []
labels = []
for name in names:
    concat = ' '.join([str(x) for x in tweet_dict[name]])
    concatenated_tweets.append(concat)
    labels.append(label_dict[name])

concat_df = pd.DataFrame({"User": np.array(names), "Tweet": concatenated_tweets, "Race": np.array(labels)})
len(names), len(concatenated_tweets), len(labels)

(3056, 3056, 3056)

In [334]:
concat_df["Race"].value_counts()

3.0    2475
0.0     298
1.0     181
2.0     102
Name: Race, dtype: int64

In [374]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)

x_train, x_test, y_train, y_test = train_test_split(concat_df["Tweet"], concat_df["Race"], test_size=.2)
tfidf.fit(x_train)
x_train = tfidf.transform(x_train)
x_test = tfidf.transform(x_test)

x_train.shape, x_test.shape

((2444, 5000), (612, 5000))

In [375]:
from sklearn.linear_model import LogisticRegression
log = LogisticRegression(class_weight = 'balanced', max_iter = 10 ** 6)
log.fit(x_train, y_train)

LogisticRegression(class_weight='balanced', max_iter=1000000)

In [376]:
y_pred = log.predict(x_test)
print(accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred))

0.7532679738562091
              precision    recall  f1-score   support

         0.0       0.46      0.50      0.48        68
         1.0       0.16      0.16      0.16        38
         2.0       0.17      0.21      0.19        19
         3.0       0.87      0.86      0.86       487

    accuracy                           0.75       612
   macro avg       0.42      0.43      0.42       612
weighted avg       0.76      0.75      0.76       612



In [377]:
print(confusion_matrix(y_pred, y_test, normalize='true'))

[[0.45945946 0.12162162 0.01351351 0.40540541]
 [0.05405405 0.16216216 0.08108108 0.7027027 ]
 [0.08695652 0.13043478 0.17391304 0.60869565]
 [0.06276151 0.041841   0.02301255 0.87238494]]


In [193]:
labeled_users = pd.read_csv("labeled_users.csv")
labeled_users["race"].value_counts()

4.0    3184
1.0     374
2.0     241
5.0     153
3.0     140
Name: race, dtype: int64

In [184]:
x = concat_df
y = concat_df['Race']

In [185]:
x.drop('Race',inplace=True,axis=1)
x

Unnamed: 0,User,Tweet
0,Chad Boyce,"YKAR, a futuristic sans serif font by #Freeb..."
1,Tomato 😷,"In other words, it’s good news about the vacci..."
2,parker,❤️ 🙏 bonk #FFXIV400kSweepstakes Nice Ed McB...
3,Kevin Jones,How about pizza dipped in water 🤦🏻‍♂️Day 21 o...
4,Joe Duhamel,\n1) Hire better programmers\n2) Your websit...
...,...,...
3051,Shortie,. on #Periscope: Im back someone called hungry...
3052,Ben,
3053,Meagan Lovely,Womens Guide to Burn Fat and Build Muscle: The...
3054,Megan Schuitema,Even though school is cancelled and grades do...


In [188]:
#!pip install imblearn
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

ros = RandomOverSampler(random_state=42)

x_ros, y_ros = ros.fit_resample(x, y)

print('Original dataset shape', Counter(y))
print('Resample dataset shape', Counter(y_ros))

Original dataset shape Counter({3.0: 2475, 0.0: 298, 1.0: 181, 2.0: 102})
Resample dataset shape Counter({3.0: 2475, 2.0: 2475, 0.0: 2475, 1.0: 2475})


In [189]:
df = x_ros
df['Race'] = y_ros
df

Unnamed: 0,User,Tweet,Race
0,Chad Boyce,"YKAR, a futuristic sans serif font by #Freeb...",3.0
1,Tomato 😷,"In other words, it’s good news about the vacci...",3.0
2,parker,❤️ 🙏 bonk #FFXIV400kSweepstakes Nice Ed McB...,3.0
3,Kevin Jones,How about pizza dipped in water 🤦🏻‍♂️Day 21 o...,3.0
4,Joe Duhamel,\n1) Hire better programmers\n2) Your websit...,3.0
...,...,...,...
9895,Abhishek Chaliha,The four cylinder WagonR makes a comeback in t...,2.0
9896,Devendra,I think you mean... Bruh I deleted all that...,2.0
9897,díẹ̀kọláàánútímorígba,Ọmọ this album that Adekunle Gold is cooking i...,2.0
9898,jun 💌,*grapples the baki* DOES ANYONE HAVE THAT ONE ...,2.0


In [84]:
def constructModel():
    bert_preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
    bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4")
    text_input = keras.layers.Input(shape=(), dtype=tf.string)
    preprocessed_text = bert_preprocess(text_input)
    outputs = bert_encoder(preprocessed_text)

    dense = keras.layers.Dense(4, activation='softmax')(outputs["pooled_output"])

    model = keras.Model(inputs=[text_input], outputs=[dense])
    model.compile(optimizer='adam', loss = 'sparse_categorical_crossentropy', metrics = ['accuracy'])
    model.summary()
    return model

model = constructModel()

Model: "model_10"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_12 (InputLayer)           [(None,)]            0                                            
__________________________________________________________________________________________________
keras_layer_22 (KerasLayer)     {'input_word_ids': ( 0           input_12[0][0]                   
__________________________________________________________________________________________________
keras_layer_23 (KerasLayer)     {'default': (None, 7 109482241   keras_layer_22[0][0]             
                                                                 keras_layer_22[0][1]             
                                                                 keras_layer_22[0][2]             
___________________________________________________________________________________________

In [191]:
'''
accs = []
reports = []
confusion_matrices = []

kf = KFold(n_splits = 5)
i = 1
for train, test in kf.split(concat_df):
    
    print("Fold %d:" % i)
    print()
    i += 1
    train_df = df.iloc[train]
    test_df = df.iloc[test]
    
    x_train, y_train = train_df["Tweet"], train_df["Race"]
    x_test, y_test = test_df["Tweet"], test_df["Race"]
    
    model.fit(x_train, y_train, epochs = 5)
    y_pred = np.argmax(model.predict(x_test), axis=1)
    acc = accuracy_score(y_test, y_pred)
    accs.append(acc)
    print("Accuracy: ", acc)
    print()
    report = classification_report(y_test, y_pred)
    print(report)
    print()
    reports.append(report)
    cm = confusion_matrix(y_test, y_pred, normalize='true')
    print(cm)
    print()
    confusion_matrices.append(cm)
'''

Fold 1:

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy:  0.8316993464052288

              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00        52
         1.0       0.00      0.00      0.00        36
         2.0       0.00      0.00      0.00        15
         3.0       0.83      1.00      0.91       509

    accuracy                           0.83       612
   macro avg       0.21      0.25      0.23       612
weighted avg       0.69      0.83      0.76       612


[[0. 0. 0. 1.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]]

Fold 2:

Epoch 1/5


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 2/5
Epoch 3/5

KeyboardInterrupt: 

In [38]:
np.average(accs)

0.8092203930126333

In [39]:
sum(confusion_matrices) / 5

array([[2.85714286e-03, 0.00000000e+00, 0.00000000e+00, 9.97142857e-01],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00],
       [7.93650794e-04, 0.00000000e+00, 3.96825397e-04, 9.98809524e-01]])