# Pipeline for keyword extraction from Tweet text data

The Jupyter Notebook below contains a Python pipeline for extracting keywords from Twitter text data using TF-IDF, with the KeyBERT algorithm as a point of comparison. This notebook can be easily adapted for basic keyword extraction tasks in future work.

Also bundled with this work is an R script for extracting keywords from Tweet text, using the RAKE algorithm. This too can be easily adapted for future work, with only minor modifications.

## Keyword generation from scraped Twitter text data

## Imports

In [2]:
# general
import re
import pandas as pd
import json

# keywords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.corpus import stopwords 
from keybert import KeyBERT

## Definitions

In [3]:
def pre_process(text):
    """Normalise a piece of text before keyword extraction.

    The text is lower-cased, markup tags are removed, and every run of
    digits and non-word characters is collapsed into a single space.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned, lower-case string. Runs of removed characters at the
        start or end of the text are left as a single space.
    """
    text = text.lower()
    # remove tags, whether entity-escaped ("&lt;b&gt;") or literal ("<b>").
    # They are replaced by a plain space: substituting "&lt;&gt;" would leave
    # the junk tokens "lt" and "gt" behind once punctuation is stripped below.
    text = re.sub(r"&lt;/?.*?&gt;|</?.*?>", " ", text)
    # remove special characters and digits
    text = re.sub(r"(\d|\W)+", " ", text)
    return text

def sort_coo(coo_matrix):
    """Rank the stored entries of a COO-style matrix by value.

    Args:
        coo_matrix: Object exposing parallel ``col`` and ``data`` sequences
            (column index and value of each stored entry).

    Returns:
        A list of ``(col, value)`` pairs ordered by descending value, with
        ties broken by descending column index.
    """
    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    # Sort on (value, column) so that equal values fall back to the column.
    pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return pairs

def extract_topN_from_vector(feature_names, sorted_items, topN = 10):
    """Return the ``topN`` best-scoring distinct features.

    Args:
        feature_names: Sequence mapping a feature index to its name.
        sorted_items: Iterable of ``(feature_index, score)`` pairs, expected
            to be ordered from highest to lowest score.
        topN: Maximum number of distinct features to return.

    Returns:
        A dict mapping feature name to its score rounded to 3 decimals, in
        the order the features are first encountered in ``sorted_items``.
    """
    results = {}
    for idx, score in sorted_items:
        if len(results) >= topN:
            break
        name = feature_names[idx]
        # The same feature can occur several times (once per document when a
        # whole corpus is ranked). Keep its first, i.e. highest, score rather
        # than letting later, lower scores overwrite it, and keep scanning so
        # that duplicates do not reduce the number of keywords returned.
        if name not in results:
            results[name] = round(score, 3)
    return results

def extract_all_from_vector(feature_names, sorted_items):
    """Return every feature found in ``sorted_items`` with its best score.

    Args:
        feature_names: Sequence mapping a feature index to its name.
        sorted_items: Iterable of ``(feature_index, score)`` pairs, expected
            to be ordered from highest to lowest score.

    Returns:
        A dict mapping feature name to its score rounded to 3 decimals, in
        the order the features are first encountered in ``sorted_items``.
    """
    results = {}
    for idx, score in sorted_items:
        # setdefault keeps the first (highest) score seen for a feature;
        # plain assignment would leave each feature with its lowest score.
        results.setdefault(feature_names[idx], round(score, 3))
    return results

## Data load and preprocessing

In [5]:
# Read the tweets from a line-delimited JSON file (one record per line);
# change the path as appropriate.
df_idf = pd.read_json("../data/tweet_text.json", lines=True)

# Replace the 'text' column with its cleaned version, then collect the
# cleaned strings into a plain list for the vectoriser.
df_idf['text'] = df_idf['text'].apply(pre_process)
docs = df_idf['text'].tolist()

## TF-IDF calculation

Note: custom stop words should be used in place of the default 'english' set used here.

In [6]:
# Count term occurrences, dropping the built-in English stop words and any
# term whose document frequency exceeds 85% of the documents.
cv = CountVectorizer(max_df=0.85, stop_words='english')
word_count_vector = cv.fit_transform(docs)

# Learn smoothed IDF weights from the count matrix.
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer.fit(word_count_vector)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# and fall back to the old method only on releases that predate it (< 1.0).
try:
    feature_names = cv.get_feature_names_out().tolist()
except AttributeError:
    feature_names = cv.get_feature_names()

# word_count_vector already holds the counts for docs, so there is no need
# to vectorise the corpus a second time.
tf_idf_vector = tfidf_transformer.transform(word_count_vector)

## Keyword extraction

In [7]:
# Convert the tf-idf matrix to COO form and rank its (column, score)
# entries from highest to lowest score.
sorted_items = sort_coo(tf_idf_vector.tocoo())

# Build the keyword -> score dict from the top of that ranking.
keywords = extract_topN_from_vector(feature_names, sorted_items, topN=10)

In [None]:
# Same ranking as above, but without a top-N cut-off: every ranked feature
# is mapped to a rounded score.
all_keywords = extract_all_from_vector(
    feature_names=feature_names,
    sorted_items=sorted_items,
)

## Write out data for later use

In [None]:
# Serialise the keyword -> score mapping to a JSON string before opening the
# output file, so a serialisation error cannot leave a truncated file behind.
json_object = json.dumps(all_keywords)

# NOTE(review): this path starts with "data/", whereas the other paths in
# this notebook start with "../data/" - confirm the intended output location.
with open("data/all_keywords.json", "w") as outfile:
    outfile.write(json_object)

# Save the pre-processed tweets as CSV without the index column
# (presumably the input for the accompanying R script - confirm).
df_idf.to_csv("../data/text_for_R.csv", index=False)

## Show the first N (in this case 10) keywords generated

In [None]:
# Print a header followed by one "keyword score" line per entry, in the
# order the entries were inserted into the dict.
print("\n===Top 10 Keywords===")
for keyword, score in keywords.items():
    print(keyword, score)

## Compare with KeyBERT generated keywords

In [None]:
# KeyBERT: build a model with the library's default settings.
kw_model = KeyBERT()
# NOTE(review): this rebinds `keywords`, replacing the tf-idf dict built
# earlier in the notebook; re-running the "Top 10" print cell after this one
# will therefore no longer see that dict. The result is also not used below.
keywords = kw_model.extract_keywords(docs)

# Second extraction, this time asking for 2- to 5-word keyphrases with
# stop-word filtering switched off; the result is printed, not stored.
print(kw_model.extract_keywords(docs, keyphrase_ngram_range=(2, 5), stop_words=None))