In [1]:
!pip install tweet-preprocessor
!pip install swifter
!pip install wordcloud

Collecting tweet-preprocessor
  Downloading tweet_preprocessor-0.6.0-py3-none-any.whl (27 kB)
Installing collected packages: tweet-preprocessor
Successfully installed tweet-preprocessor-0.6.0
Collecting swifter
  Downloading swifter-1.1.2.tar.gz (633 kB)
[K     |████████████████████████████████| 633 kB 5.1 MB/s 
Collecting psutil>=5.6.6
  Downloading psutil-5.9.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (280 kB)
[K     |████████████████████████████████| 280 kB 63.7 MB/s 
Collecting partd>=0.3.10
  Downloading partd-1.2.0-py3-none-any.whl (19 kB)
Collecting fsspec>=0.6.0
  Downloading fsspec-2022.3.0-py3-none-any.whl (136 kB)
[K     |████████████████████████████████| 136 kB 64.9 MB/s 
Collecting locket
  Downloading locket-0.2.1-py2.py3-none-any.whl (4.1 kB)
Building wheels for collected packages: swifter
  Building wheel for swifter (setup.py) ... [?25l[?25hdone
  Created wheel for swifter: filename=swifter-1.1.2-py3-none

Collecting urlexpander
  Downloading urlexpander-0.0.37.tar.gz (11 kB)
Collecting tldextract
  Downloading tldextract-3.2.0-py3-none-any.whl (87 kB)
[K     |████████████████████████████████| 87 kB 3.8 MB/s 
Collecting unshortenit
  Downloading unshortenit-0.4.0.tar.gz (8.9 kB)
Collecting requests-file>=1.4
  Downloading requests_file-1.5.1-py2.py3-none-any.whl (3.7 kB)
Building wheels for collected packages: urlexpander, unshortenit
  Building wheel for urlexpander (setup.py) ... [?25l[?25hdone
  Created wheel for urlexpander: filename=urlexpander-0.0.37-py3-none-any.whl size=11116 sha256=30c328090010b992f61f4bc72f75d5a374422840a3481c2255c1075767da4348
  Stored in directory: /root/.cache/pip/wheels/36/f2/aa/9319bc326946db050e310f27f18f63327cf083a8a80aff78c2
  Building wheel for unshortenit (setup.py) ... [?25l[?25hdone
  Created wheel for unshortenit: filename=unshortenit-0.4.0-py3-none-any.whl size=12418 sha256=4af4d6b89a1297d2e8ee37c4d077e72b394977241a1f0cf77817b51364adbba9
  St

In [2]:
import ast
import html
import json

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import preprocessor as p
import swifter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

In [3]:
def year_to_party(year):
  """Map a year to a binary flag: 0 for years before 2017, 1 otherwise.

  `year` may be a number or a numeric string such as "2016.0"; it is
  converted with int(float(year)) before the comparison.
  """
  return 0 if int(float(year)) < 2017 else 1

def decode_full_text(text):
  """Turn a stringified bytes literal into readable text.

  Args:
      text: a string holding a Python bytes literal, e.g. "b'Tom &amp; Jerry'".

  Returns:
      The UTF-8 decoded string with HTML entities unescaped ("Tom & Jerry").

  Raises:
      ValueError / SyntaxError: if `text` is not a plain Python literal.
  """
  # SECURITY: this value comes straight from a data file. ast.literal_eval only
  # accepts literals, so unlike eval() it cannot execute code embedded in the data.
  raw_bytes = ast.literal_eval(text)
  return html.unescape(raw_bytes.decode("utf-8"))

def parse_mention(text):
  """Return the distinct accounts mentioned in `text`, without the "@".

  Mentions are de-duplicated in order of first appearance, so the result is
  the same on every run (a set-based de-dup yields a run-dependent order).
  Mentions that are empty after removing "@" are dropped.

  Returns:
      A list of handle strings, or ["Nobody"] when no mention is found.
  """
  found = p.parse(text).mentions or []
  handles = (item.match.replace("@", "") for item in found)
  # dict.fromkeys keeps the first occurrence of each handle and preserves order.
  unique_handles = list(dict.fromkeys(handle for handle in handles if handle != ""))
  return unique_handles if unique_handles else ["Nobody"]

def parse_url(text):
  """Return the distinct URLs found in `text`.

  URLs are de-duplicated in order of first appearance, so the result is the
  same on every run (a set-based de-dup yields a run-dependent order).

  Returns:
      A list of URL strings, or ["no urls"] when the text contains none.
  """
  found = p.parse(text).urls or []
  # dict.fromkeys keeps the first occurrence of each URL and preserves order.
  unique_urls = list(dict.fromkeys(item.match for item in found if item.match != ""))
  return unique_urls if unique_urls else ["no urls"]

def expand_url(urls, url_map=None):
    """Look up the resolved destination of each shortened URL.

    Args:
        urls: iterable of URL strings; the placeholder "no urls" is skipped.
        url_map: optional dict mapping original URL -> (resolved_url, resolved_domain).
            When omitted, the module-level `url_dict` is used.

    Returns:
        The mapping entries for the URLs that are present in the mapping, in
        input order. URLs missing from the mapping are dropped.
    """
    lookup = url_dict if url_map is None else url_map
    return [lookup[url] for url in urls if url != "no urls" and url in lookup]

def extract_info_from_urls(urls, url_map=None):
    """Summarise the destinations of a tweet's URLs as one space-separated string.

    For a resolved URL containing "twitter", the 4th "/"-separated piece is
    used (the account name in "https://twitter.com/<account>/..."); if the URL
    has no such piece, or for any other URL, the resolved domain is used.

    Args:
        urls: list of URLs as produced by parse_url.
        url_map: optional URL mapping forwarded to expand_url.

    Returns:
        The joined string, or "no_infos" when nothing could be resolved.
    """
    res = []
    for entry in expand_url(urls, url_map):
        if "twitter" in entry[0]:
            try:
                res.append(entry[0].split("/")[3])
            except IndexError:
                # URL too short to carry an account segment: fall back to the domain.
                res.append(entry[1])
        else:
            res.append(entry[1])
    final_result = " ".join(res)
    if final_result == "":
        return "no_infos"
    else:
        return final_result


def preprocess_data(df, url_dict):
  """Derive the model features from a raw tweet frame.

  Adds the columns favorite_count_log, retweet_count_log, year_party,
  hash_tags, mentions, urls, url_infos and cleaned_text, and replaces
  full_text with its decoded form.

  Args:
      df: frame with favorite_count, retweet_count, year, hashtags and
          full_text columns. It is not modified; a copy is returned.
      url_dict: dict mapping shortened URL -> (resolved_url, resolved_domain),
          used to build the url_infos column.

  Returns:
      A new DataFrame with the extra columns.
  """
  # Work on a copy so the caller's frame stays intact and the function can be re-run
  # on the same raw input (decoding full_text twice would fail).
  df = df.copy()

  # log transformation for continuous features
  df["favorite_count_log"] = np.log(df["favorite_count"] + 1)
  df["retweet_count_log"] = np.log(df["retweet_count"] + 1)

  # convert year into the party that the president in that year was a member of
  df["year"] = df["year"].fillna("2016.0")
  df["year_party"] = df["year"].swifter.apply(year_to_party)

  # remove duplicate hashtags, keeping first-seen order so the result is deterministic
  df["hash_tags"] = df["hashtags"].swifter.apply(lambda x: list(dict.fromkeys(x.split())))

  # convert full text into normal string format
  df["full_text"] = df["full_text"].swifter.apply(decode_full_text)

  # p.set_options() is process-wide state in tweet-preprocessor. The cleaning options
  # set further down do not include MENTION, so without this reset a second call of
  # this function (e.g. on the test set) would run p.parse with mentions disabled.
  # NOTE(review): relies on p.parse honouring set_options - confirm against the library.
  p.set_options(p.OPT.URL, p.OPT.MENTION)

  # extract mentions from the tweet
  df["mentions"] = df["full_text"].swifter.apply(parse_mention)

  # extract URLs and describe where they lead, using the mapping passed by the caller
  df["urls"] = df["full_text"].swifter.apply(parse_url)
  df["url_infos"] = df["urls"].swifter.apply(lambda urls: extract_info_from_urls(urls, url_dict))

  p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.NUMBER, p.OPT.SMILEY, p.OPT.ESCAPE_CHAR, p.OPT.RESERVED)

  # remove url, emoji, number from the full text
  df["cleaned_text"] = df["full_text"].swifter.apply(lambda x: p.clean(x))
  return df

In [4]:
def get_final_data(df):
  """Split a preprocessed frame into the separate inputs used by the models.

  Returns a 5-tuple:
      cleaned tweet texts (list of str),
      hashtags joined by spaces (list of str),
      mentions joined by spaces (list of str),
      URL info strings (list of str),
      array of the favorite_count_log, retweet_count_log and year_party columns.
  """
  numeric_columns = ["favorite_count_log", "retweet_count_log", "year_party"]
  joined_tags = list(map(" ".join, df["hash_tags"].tolist()))
  joined_mentions = list(map(" ".join, df["mentions"].tolist()))
  return (
      df["cleaned_text"].tolist(),
      joined_tags,
      joined_mentions,
      df["url_infos"].tolist(),
      df[numeric_columns].values,
  )

In [5]:
# We resolved the destination of every shortened URL ahead of time and saved the result as a JSON-lines file.
# We used the urlexpander Python package to do this. It took us roughly 10 hours.
# The all_urls.json file has 653314 distinct URLs.
# The original JSON file can be downloaded here: https://drive.google.com/file/d/1bQKXmamSbCbDSqh3rZn1JMfLrOa2UsK3/view?usp=sharing
# Don't try to expand the URLs yourself.
# e.g. {"original_url": "https://t.co/lfdYotZsuh", "resolved_domain": "twitter.com", "resolved_url": "https://twitter.com/SenToomey/status/1086320463004028934/photo/1"}

# Maps original (shortened) URL -> (resolved_url, resolved_domain).
url_dict = {}
# Read as UTF-8 explicitly so the result does not depend on the platform default encoding.
with open("./all_urls.json", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline) instead of failing in json.loads.
            continue
        url = json.loads(line)
        url_dict[url["original_url"]] = (url["resolved_url"], url["resolved_domain"])

In [6]:
# Load the labelled training tweets, then derive the model features from them.
train_data_path = "./congressional_tweet_training_data.csv"
train_data = pd.read_csv(train_data_path)
# The raw frame is replaced by the preprocessed one under the same name.
train_data = preprocess_data(train_data, url_dict)

Pandas Apply:   0%|          | 0/592803 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/592803 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/592803 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/592803 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/592803 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/592803 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/592803 [00:00<?, ?it/s]

In [7]:
# Hold out 10% of the labelled tweets for validation; random_state fixes the split.
train_df, val_df = train_test_split(train_data, test_size=0.1, random_state=42)
# Unpack each split into text / hashtag / mention / URL-info lists plus the numeric feature array.
train_sentences, train_hashtags, train_mentions, train_url_infos, train_num_feats = get_final_data(train_df)
val_sentences, val_hashtags, val_mentions, val_url_infos, val_num_feats = get_final_data(val_df)

## TF-IDF + logistic model

In [8]:
# One document per tweet for the TF-IDF model: cleaned text, hashtags, mentions and
# URL info joined by single spaces.
combine_train_sentences = [
    " ".join(parts)
    for parts in zip(train_sentences, train_hashtags, train_mentions, train_url_infos)
]
combine_val_sentences = [
    " ".join(parts)
    for parts in zip(val_sentences, val_hashtags, val_mentions, val_url_infos)
]

In [9]:
# Fit the label encoding on the training labels, then apply the same mapping to validation.
label_encoder = LabelEncoder()
train_labels_encoded = label_encoder.fit_transform(train_df.party_id.to_numpy())
val_labels_encoded = label_encoder.transform(val_df.party_id.to_numpy())

In [10]:
# Baseline: TF-IDF features (min_df=2) fed into a logistic regression classifier.
model_0 = Pipeline([
  ("tf-idf", TfidfVectorizer(min_df=2)),
  ("clf", LogisticRegression(max_iter=1000))
])
# Train on the combined text documents built from the training split.
model_0.fit(X=combine_train_sentences, y=train_labels_encoded)

Pipeline(steps=[('tf-idf', TfidfVectorizer(min_df=2)),
                ('clf', LogisticRegression(max_iter=1000))])

In [11]:
# Score the baseline on the held-out validation documents.
model_0.score(X=combine_val_sentences, y=val_labels_encoded)

0.9213913395523018

## Word Embedding + Deep Learning model

In [12]:
import tensorflow as tf 
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping

In [13]:
# Whitespace-token counts per training example, one list per text input.
sent_lens = [len(tokens) for tokens in map(str.split, train_sentences)]
hashtag_lens = [len(tokens) for tokens in map(str.split, train_hashtags)]
mention_lens = [len(tokens) for tokens in map(str.split, train_mentions)]
urlinfos_lens = [len(tokens) for tokens in map(str.split, train_url_infos)]

In [14]:
# Mean token count of each text input over the training split.
avg_sent_len = np.mean(sent_lens)
avg_tag_len = np.mean(hashtag_lens)
avg_mention_len = np.mean(mention_lens)
avg_info_len = np.mean(urlinfos_lens)
avg_sent_len, avg_tag_len, avg_mention_len, avg_info_len

(24.102914219094995, 1.4753618407488351, 1.1971558811070584, 1.05560408005668)

### create text vectorizer layer

In [15]:
from tensorflow.keras.layers import TextVectorization

In [16]:
# Vocabulary cap shared by the text, hashtag and mention vectorizers.
max_tokens = 50000
# Sequence length of each input: the 99th percentile of its training-set token counts.
output_seq_len = int(np.percentile(sent_lens, 99))
output_tag_len = int(np.percentile(hashtag_lens, 99))
output_ment_len = int(np.percentile(mention_lens, 99))
output_info_len = int(np.percentile(urlinfos_lens, 99))

In [17]:
# One TextVectorization layer per text input. Only the tweet text keeps the default
# standardization; hashtags, mentions and URL infos are passed with standardize=None.
text_vectorizer = TextVectorization(max_tokens=max_tokens, output_sequence_length=output_seq_len)
tag_vectorizer = TextVectorization(max_tokens=max_tokens, output_sequence_length=output_tag_len, standardize=None)
ment_vectorizer = TextVectorization(max_tokens=max_tokens, output_sequence_length=output_ment_len, standardize=None)
# The URL-info vocabulary uses its own, smaller cap.
info_vectorizer = TextVectorization(max_tokens=30000, output_sequence_length=output_info_len, standardize=None)

In [18]:
# Build each vocabulary from the training split only.
text_vectorizer.adapt(train_sentences)
tag_vectorizer.adapt(train_hashtags)
ment_vectorizer.adapt(train_mentions)
info_vectorizer.adapt(train_url_infos)

In [27]:
# Vocabularies learned by adapt(); their lengths are used to size the embedding layers.
tweet_text_vocab = text_vectorizer.get_vocabulary()
tweet_tag_vocab = tag_vectorizer.get_vocabulary()
tweet_ment_vocab = ment_vectorizer.get_vocabulary()
tweet_info_vocab = info_vectorizer.get_vocabulary()

### Create dataset

In [28]:
# Each dataset element is ((text, hashtags, mentions, url_infos, num_feats), label).
train_dataset = tf.data.Dataset.from_tensor_slices(((train_sentences, train_hashtags, train_mentions, train_url_infos, train_num_feats), train_labels_encoded))
val_dataset = tf.data.Dataset.from_tensor_slices(((val_sentences, val_hashtags, val_mentions, val_url_infos, val_num_feats), val_labels_encoded))

In [29]:
# Batch (512 examples) and prefetch both datasets.
# NOTE(review): the names are rebound to their own batched versions, so re-running this
# cell without re-running the previous one batches already-batched data. No .shuffle()
# is applied to the training dataset.
train_dataset = train_dataset.batch(512).prefetch(tf.data.AUTOTUNE)
val_dataset = val_dataset.batch(512).prefetch(tf.data.AUTOTUNE)

### Create Model using Bi-LSTM and CNN

In [30]:
class TextCNN(tf.keras.Model):
    """Tweet classifier: Bi-LSTM + Conv1D over the text, summed embeddings for side inputs.

    `call` takes a 5-tuple ``(text, hashtags, mentions, url_infos, num_feats)``.
    The four string inputs are vectorized and embedded inside the model; the
    numeric features go through a small dense layer. All branches are
    concatenated and fed to a dense classifier with a single sigmoid output.

    Args:
        token_embed, tag_embed, ment_embed, info_embed: embedding layers for the
            tweet text, hashtags, mentions and URL infos.
        text_vectorizer, tag_vectorizer, ment_vectorizer, info_vectorizer:
            vectorization layers matching the four embeddings.
        kernel_sizes: iterable of Conv1D kernel sizes; one conv branch per size.
        dropout_rate: dropout rate used inside the classifier head.
        dilation_rate: dilation rate of every Conv1D layer.
    """

    def __init__(self,
                 token_embed,
                 tag_embed,
                 ment_embed,
                 info_embed,
                 text_vectorizer,
                 tag_vectorizer,
                 ment_vectorizer,
                 info_vectorizer,
                 kernel_sizes=(2,),
                 dropout_rate=0.4,
                 dilation_rate=1
                 ):
        super(TextCNN, self).__init__()
        self.token_embed = token_embed
        self.ment_embed = ment_embed
        self.tag_embed = tag_embed
        self.info_embed = info_embed
        self.text_vectorizer = text_vectorizer
        self.tag_vectorizer = tag_vectorizer
        self.ment_vectorizer = ment_vectorizer
        self.info_vectorizer = info_vectorizer
        # Copy into a fresh list: the default is immutable and a caller's list is not shared.
        self.kernel_sizes = list(kernel_sizes)
        self.convs = []
        self.max_poolings = []
        for kernel_size in self.kernel_sizes:
            self.convs.append(layers.Conv1D(16, kernel_size, activation='relu', dilation_rate=dilation_rate))
            self.max_poolings.append(layers.GlobalMaxPooling1D())
        self.bi_lstm = layers.Bidirectional(layers.LSTM(16, return_sequences=True, return_state=False))
        self.num_feat_layer = layers.Dense(16, activation="relu")
        self.classifier = tf.keras.Sequential(
                    [
                      layers.Dropout(dropout_rate),
                      layers.Dense(64, activation="relu"),
                      layers.Dropout(dropout_rate),
                      layers.Dense(1, activation="sigmoid")
                    ]
                  )
        # Weight-free layers are created once here instead of on every forward pass.
        self.conv_dropouts = [layers.Dropout(0.2) for _ in self.kernel_sizes]
        self.tag_dropout = layers.Dropout(0.2)
        self.ment_dropout = layers.Dropout(0.2)
        self.info_dropout = layers.Dropout(0.2)
        self.concat = layers.Concatenate()

    def call(self, inputs, training=None):
        """Compute the sigmoid output for one batch.

        Args:
            inputs: tuple (text, hashtags, mentions, url_infos, num_feats).
            training: forwarded to the dropout layers and the classifier head;
                None lets Keras decide from the calling context.
        """
        text_inputs, tag_inputs, ment_inputs, info_inputs, num_feat = inputs

        # Text branch: vectorize -> embed -> Bi-LSTM (sequence output).
        text_vectors = self.text_vectorizer(text_inputs)
        token_embeddings = self.token_embed(text_vectors)
        token_embeddings_bilstm = self.bi_lstm(token_embeddings)

        tag_embeddings = self.tag_embed(self.tag_vectorizer(tag_inputs))
        ment_embeddings = self.ment_embed(self.ment_vectorizer(ment_inputs))
        info_embeddings = self.info_embed(self.info_vectorizer(info_inputs))

        # One dropout -> conv -> global-max-pool branch per kernel size.
        convs = []
        for dropout, conv, pooling in zip(self.conv_dropouts, self.convs, self.max_poolings):
            c = dropout(token_embeddings_bilstm, training=training)
            c = conv(c)
            c = pooling(c)
            convs.append(c)

        # Side inputs: sum the (dropped-out) embeddings over the sequence axis.
        # NOTE(review): the sum runs over every position, padded ones included - confirm
        # that this is intended given the embeddings are built with mask_zero=True.
        tag_embedding = tf.reduce_sum(self.tag_dropout(tag_embeddings, training=training), axis=1)
        ment_embedding = tf.reduce_sum(self.ment_dropout(ment_embeddings, training=training), axis=1)
        info_embedding = tf.reduce_sum(self.info_dropout(info_embeddings, training=training), axis=1)

        num_embedding = self.num_feat_layer(num_feat)
        x = self.concat(convs + [tag_embedding, ment_embedding, info_embedding, num_embedding])
        output = self.classifier(x, training=training)
        return output

### Train the model

In [31]:
# 16-dimensional embedding layers, each sized to the length of its adapted vocabulary.
token_embed = layers.Embedding(input_dim=len(tweet_text_vocab), output_dim=16, mask_zero=True)
tag_embed = layers.Embedding(input_dim=len(tweet_tag_vocab), output_dim=16, mask_zero=True)
ment_embed = layers.Embedding(input_dim=len(tweet_ment_vocab), output_dim=16, mask_zero=True)
info_embed = layers.Embedding(input_dim=len(tweet_info_vocab), output_dim=16, mask_zero=True)

In [32]:
# Assemble the model from the embedding and vectorization layers, with a single conv branch (kernel size 2).
model = TextCNN(token_embed, tag_embed, ment_embed, info_embed, text_vectorizer, tag_vectorizer, ment_vectorizer, info_vectorizer, kernel_sizes=[2], dilation_rate=1)

In [33]:
# Learning rate starts at 1e-3 and follows an exponential decay schedule
# (decay_rate 0.9, decay_steps 100).
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=1e-3,
    decay_steps=100,
    decay_rate=0.9
)
model.compile(loss="binary_crossentropy", optimizer=tf.keras.optimizers.Adam(learning_rate=lr_schedule), metrics=["accuracy"])
# Stop when validation accuracy has not improved for 3 epochs and restore the best weights.
early_stopping = EarlyStopping(monitor='val_accuracy', patience=3, restore_best_weights=True)
# Train for at most 20 epochs, one full pass over each dataset per epoch.
history_model_textcnn = model.fit(train_dataset, steps_per_epoch=len(train_dataset),
                              epochs=20, 
                              validation_data=val_dataset, 
                              validation_steps=len(val_dataset),
                              callbacks=[early_stopping]
                              )

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20


In [None]:
# Save the trained weights next to the notebook.
# NOTE(review): this cell has no execution count, so it may not have been run in the recorded session.
model.save_weights("./text_cnn_model_final")

### Test data prediction

In [34]:
# Load the unlabelled test tweets and apply the same preprocessing as for training.
test_data_path = "./congressional_tweet_test_data.csv"
test_data = pd.read_csv(test_data_path)
test_data = preprocess_data(test_data, url_dict)

Pandas Apply:   0%|          | 0/265000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/265000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/265000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/265000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/265000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/265000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/265000 [00:00<?, ?it/s]

In [35]:
# Unpack the test frame into the same five inputs used for training and validation.
test_sentences, test_hashtags, test_mentions, test_url_infos, test_num_feats = get_final_data(test_data)

In [36]:
# No labels here: each element is the 5-tuple of model inputs, in batches of 1024.
test_dataset = tf.data.Dataset.from_tensor_slices((test_sentences, test_hashtags, test_mentions, test_url_infos, test_num_feats)).batch(1024).prefetch(tf.data.AUTOTUNE)

In [37]:
# Run the model batch by batch with training=False and collect the outputs.
test_probas = []
for data in test_dataset:
  y_proba = model(data, training=False)
  test_probas.append(y_proba)
# Stack the per-batch outputs into one array along the example axis.
test_probas = np.concatenate(test_probas, axis=0)

In [38]:
# Round the predicted probabilities to hard 0/1 labels.
final_predict_label = np.round(test_probas)

In [39]:
# Map the encoded predictions back to the original party_id values.
final_classes = label_encoder.inverse_transform(final_predict_label.astype(int).ravel())

In [40]:
# Use item assignment: `test_data.party = ...` only updates a column that already
# exists and otherwise just sets a Python attribute, so the "party" column selected
# for the submission would be missing.
test_data["party"] = final_classes

In [41]:
# Write the submission file containing only the Id and party columns, without the index.
test_data[["Id", "party"]].to_csv("./test_submission.csv", index=False)