In [1]:
import pandas as pd
import random as rnd
import numpy as np
from gensim.models import Word2Vec
import tensorflow as tf
from tensorflow import keras
from sklearn.feature_extraction.text import TfidfVectorizer



In [56]:
# Read in the dataframe of labeled tweets

tweet_df = pd.read_json('Twibot-20/tweet.json')

In [59]:
def melt_tweets(df):
    """
    "Melts" a Twibot dataframe with lists in the tweet field into indexed rows for each tweet
    Skips rows that have no tweets.
    """
    out_df = pd.DataFrame()
    for row in df.index:
        if df.loc[row, 'tweet']:
            df_dict = {}
            df_dict['tweet'] = df.loc[row, 'tweet']
            df_dict['tweet_index'] = range(len(df.loc[row, 'tweet']))
            df_dict['ID'] = [ df.loc[row, 'ID'] for _ in range(len(df.loc[row, 'tweet'])) ]
            df_dict['label'] = [ df.loc[row, 'label'] for _ in range(len(df.loc[row, 'tweet'])) ]
            row_df = pd.DataFrame(df_dict)
            out_df = pd.concat([out_df,row_df])
        
            
    return out_df.reset_index()

In [83]:
def sample_tweets(df, sample_size=10, random_state=None):
    """
    Creates a random sample of the Twibot dataframe with one tweet per line
    """
    
    out_df = df.sample(sample_size, replace=True, random_state=random_state).dropna()
    
    while len(out_df.index) < sample_size:
        extra_df = df.sample(sample_size - len(out_df.index), replace=True, random_state=random_state).dropna()
        out_df = pd.concat([out_df, extra_df])
        
    
    out_df['tweet'] = out_df['tweet'].apply(lambda x: rnd.choice(x))
    
    return out_df.reset_index(drop=True)

In [91]:
n = 100000

sample = sample_tweets(tweet_df, sample_size = n)

In [92]:
vectorizer = TfidfVectorizer(stop_words = 'english', min_df=10, ngram_range=(1,4))

X = vectorizer.fit_transform(sample.tweet)

feat_size = X.shape[1]

In [93]:
feat_size

14873

In [94]:
X = X.toarray()

In [95]:
y = np.array(sample.label)
y.shape

(100000,)

In [96]:
model = tf.keras.Sequential([
    tf.keras.layers.Dense(20, activation='relu'),
    tf.keras.layers.Dense(2, activation='softmax')
])

model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

model.fit(X, y, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x1831f78de50>

In [100]:
test_df = sample_tweets(tweet_df, sample_size = 1000)
X = vectorizer.transform(test_df.tweet).toarray()
y = test_df.label

In [101]:
model.evaluate(X, y, verbose=2)

32/32 - 0s - loss: 0.6560 - accuracy: 0.6460


[0.6559779047966003, 0.6460000276565552]

In [97]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_4 (Dense)              (32, 20)                  297480    
_________________________________________________________________
dense_5 (Dense)              (32, 2)                   42        
Total params: 297,522
Trainable params: 297,522
Non-trainable params: 0
_________________________________________________________________
