In [2]:
import glob
import json
import nltk
from nltk.tokenize import TweetTokenizer
import pandas as pd
import re
from string import punctuation

## Loading Data

In [3]:
# Load tweets as JSON: gather the tweets of every file in the data directory
# into one dict of the form {'tweets': [tweet, tweet, ...]}.
allTweets_json = {'tweets': []}
# sorted(): glob yields paths in arbitrary order; sorting keeps the resulting
# tweet (and later DataFrame row) order reproducible between runs.
for file in sorted(glob.glob('Your/Data/Path/*')):
    # The pickle written by the final "Save DataFrame" cell lands in this same
    # directory; skip it so a re-run does not try to parse it as JSON.
    if file.endswith('.pkl'):
        continue
    # Decode explicitly as UTF-8 rather than the platform default encoding,
    # which would break on emoji / non-ASCII tweet text on some systems.
    with open(file, encoding='utf-8') as data_file:
        json_data = json.load(data_file)
    # Each file keeps its tweets under the 'tweet_data' key.
    allTweets_json['tweets'].extend(json_data['tweet_data'])
allTweets_json

{'tweets': [{'in_reply_to_user_id': '1021209821695991810',
   'entities': {'urls': [{'start': 30,
      'end': 53,
      'url': 'https://t.co/Z4HWMRlwMU',
      'expanded_url': 'https://twitter.com/haejin28_/status/1079957943343144960/photo/1',
      'display_url': 'pic.twitter.com/Z4HWMRlwMU'}]},
   'public_metrics': {'retweet_count': 0,
    'reply_count': 0,
    'like_count': 1,
    'quote_count': 0},
   'lang': 'en',
   'id': '1079957943343144960',
   'reply_settings': 'everyone',
   'referenced_tweets': [{'type': 'replied_to', 'id': '1079957322275799040'}],
   'conversation_id': '1079710990441181185',
   'text': '@Panchin083 Hahahah i wuv you https://t.co/Z4HWMRlwMU',
   'created_at': '2019-01-01T04:30:12.000Z',
   'author_id': '989088763094732805'},
  {'in_reply_to_user_id': '772925896130961408',
   'public_metrics': {'retweet_count': 1,
    'reply_count': 1,
    'like_count': 1,
    'quote_count': 0},
   'lang': 'en',
   'id': '1079957943334912000',
   'reply_settings': 'everyone

In [4]:
# Flatten the nested tweet dicts into one table; nested keys become
# dotted column names (e.g. 'entities.urls').
data = [pd.json_normalize(allTweets_json['tweets'])]
df = pd.concat(data, ignore_index=True, axis=0)

# created_at arrives as ISO-8601 text; store it as real datetimes instead
df['created_at'] = pd.to_datetime(df['created_at'])
df

Unnamed: 0,in_reply_to_user_id,lang,id,reply_settings,referenced_tweets,conversation_id,text,created_at,author_id,entities.urls,...,entities.mentions,entities.annotations,geo.place_id,entities.hashtags,geo.coordinates.type,geo.coordinates.coordinates,entities.cashtags,withheld.copyright,withheld.country_codes,withheld.scope
0,1021209821695991810,en,1079957943343144960,everyone,"[{'type': 'replied_to', 'id': '107995732227579...",1079710990441181185,@Panchin083 Hahahah i wuv you https://t.co/Z4H...,2019-01-01 04:30:12+00:00,989088763094732805,"[{'start': 30, 'end': 53, 'url': 'https://t.co...",...,,,,,,,,,,
1,772925896130961408,en,1079957943334912000,everyone,"[{'type': 'replied_to', 'id': '107995300863559...",1079953008635592704,@ViciousRalph Why you doubt yourself? Figure o...,2019-01-01 04:30:12+00:00,954046885467246592,,...,,,,,,,,,,
2,,en,1079957943334899712,everyone,"[{'type': 'retweeted', 'id': '1079957890784481...",1079957943334899712,RT @tcm: Still struggling to find the perfect ...,2019-01-01 04:30:12+00:00,28752714,,...,"[{'start': 3, 'end': 7, 'username': 'tcm', 'id...",,,,,,,,,
3,,en,1079957943330705409,everyone,"[{'type': 'retweeted', 'id': '1079280554560765...",1079957943330705409,RT @emilyxsnapp: fuck https://t.co/oBcfD6HPgj,2019-01-01 04:30:12+00:00,231199325,"[{'start': 22, 'end': 45, 'url': 'https://t.co...",...,,,,,,,,,,
4,,en,1079957943330627584,everyone,"[{'type': 'retweeted', 'id': '1079957919792353...",1079957943330627584,RT @entwinedglobal: We wish you all a prospero...,2019-01-01 04:30:12+00:00,809111804035104768,,...,"[{'start': 3, 'end': 18, 'username': 'entwined...","[{'start': 71, 'end': 73, 'probability': 0.917...",,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
919524,,en,1437119038228926466,everyone,"[{'type': 'retweeted', 'id': '1437118272453885...",1437119038228926466,RT @EliseiNicole: #UK Manchester Parklife Fest...,2021-09-12 18:20:53+00:00,141679950,"[{'start': 80, 'end': 103, 'url': 'https://t.c...",...,"[{'start': 3, 'end': 16, 'username': 'EliseiNi...","[{'start': 22, 'end': 49, 'probability': 0.330...",,"[{'start': 18, 'end': 21, 'tag': 'UK'}]",,,,,,
919525,,en,1437119038228865025,everyone,,1437119038228865025,Yaqoob (ع) cried for 40 years even though Yous...,2021-09-12 18:20:53+00:00,2199814184,"[{'start': 194, 'end': 217, 'url': 'https://t....",...,,"[{'start': 42, 'end': 47, 'probability': 0.848...",,"[{'start': 170, 'end': 193, 'tag': 'Leadership...",,,,,,
919526,1352321120368799852,en,1437119038224744449,everyone,"[{'type': 'replied_to', 'id': '143711889764425...",1437118897644257280,@sopeoIogist outro tear is yours,2021-09-12 18:20:53+00:00,1372615442993414144,,...,"[{'start': 0, 'end': 12, 'username': 'sopeoIog...",,,,,,,,,
919527,,en,1437119038220652551,everyone,,1437119038220652551,Turn on Air-Con! The current temperature is 26...,2021-09-12 18:20:53+00:00,1397006132460232705,,...,,,,,,,,,,


## Tweet DataFrame Modification (NLP)

In [5]:
# Tokenizer shared by every tweet: lowercases tokens, drops @handles,
# and leaves elongated words (e.g. "soooo") untouched.
TWEET_TOKENIZER = TweetTokenizer(
    preserve_case=False,
    strip_handles=True,
    reduce_len=False,
)

# Tokens to discard: NLTK's English stopwords, a few extra symbols,
# and every single ASCII punctuation character.
STOPWORDS = {
    *nltk.corpus.stopwords.words("english"),
    "...", '…', '•', '’', "com",
    *punctuation,
}

# Functions
# Compiled once instead of on every call. Written as a raw string so `\w`
# reaches the regex engine verbatim instead of being read as an (invalid)
# string escape, which newer Python versions warn about.
# Shape: optional http(s):// scheme, one or more dotted word groups, then any
# number of path/query pieces introduced by / ? = or &.
_URL_PATTERN = re.compile(r'(https?://)?(\w*[.]\w+)+([/?=&]+\w+)*')


# --- Replace URLs in `string` with text of `replacement`
def replace_urls(string, replacement=None):
    """Return `string` with every URL-like substring replaced.

    Args:
        string: Text to scan.
        replacement: Text substituted for each match. Defaults to
            "<-URL->" when None. It is passed to `re.sub` as the
            replacement template, so backslash escapes in it are processed.

    Returns:
        A new string with all matches substituted.

    Note:
        The pattern is loose: any dotted run of word characters matches,
        so non-URL text such as "26.5" is replaced as well.
    """
    if replacement is None:
        replacement = "<-URL->"
    return _URL_PATTERN.sub(replacement, string)

# --- Tokenize text and remove punctuation/stopwords etc
def tokenizer(tweet_text, custom_words=None):
    """Turn one tweet's text into a list of cleaned, lowercase tokens.

    URLs are first swapped for the `replace_urls` placeholder, the text is
    lowercased (so the placeholder shows up as "<-url->") and split with
    TWEET_TOKENIZER. A token is kept only if it passes every check below,
    applied in this order:
      1. it is not found in `string.punctuation`;
      2. `str.islower()` is true for it (at least one cased character);
      3. it is not in STOPWORDS;
      4. it is at least 3 characters long;
      5. it is not in `custom_words` (only checked when that is non-empty).

    Args:
        tweet_text: Raw tweet text.
        custom_words: Optional container of extra tokens to drop.

    Returns:
        List of surviving tokens in their original order.
    """
    lowered = replace_urls(tweet_text).lower()
    # An empty tuple makes the final membership check a no-op when no
    # custom words were supplied.
    excluded = custom_words if custom_words else ()

    kept = []
    for word in TWEET_TOKENIZER.tokenize(lowered):
        if word in punctuation or not word.islower():
            continue
        if word in STOPWORDS or len(word) < 3:
            continue
        if word in excluded:
            continue
        kept.append(word)
    return kept

# Main
# --- Add a `text_processed` column holding each tweet's token list
df = df.assign(text_processed=df['text'].apply(tokenizer))
df

Unnamed: 0,in_reply_to_user_id,lang,id,reply_settings,referenced_tweets,conversation_id,text,created_at,author_id,entities.urls,...,entities.annotations,geo.place_id,entities.hashtags,geo.coordinates.type,geo.coordinates.coordinates,entities.cashtags,withheld.copyright,withheld.country_codes,withheld.scope,text_processed
0,1021209821695991810,en,1079957943343144960,everyone,"[{'type': 'replied_to', 'id': '107995732227579...",1079710990441181185,@Panchin083 Hahahah i wuv you https://t.co/Z4H...,2019-01-01 04:30:12+00:00,989088763094732805,"[{'start': 30, 'end': 53, 'url': 'https://t.co...",...,,,,,,,,,,"[hahahah, wuv, <-url->]"
1,772925896130961408,en,1079957943334912000,everyone,"[{'type': 'replied_to', 'id': '107995300863559...",1079953008635592704,@ViciousRalph Why you doubt yourself? Figure o...,2019-01-01 04:30:12+00:00,954046885467246592,,...,,,,,,,,,,"[doubt, figure, need, work, better, keep, head..."
2,,en,1079957943334899712,everyone,"[{'type': 'retweeted', 'id': '1079957890784481...",1079957943334899712,RT @tcm: Still struggling to find the perfect ...,2019-01-01 04:30:12+00:00,28752714,,...,,,,,,,,,,"[still, struggling, find, perfect, cocktail, r..."
3,,en,1079957943330705409,everyone,"[{'type': 'retweeted', 'id': '1079280554560765...",1079957943330705409,RT @emilyxsnapp: fuck https://t.co/oBcfD6HPgj,2019-01-01 04:30:12+00:00,231199325,"[{'start': 22, 'end': 45, 'url': 'https://t.co...",...,,,,,,,,,,"[fuck, <-url->]"
4,,en,1079957943330627584,everyone,"[{'type': 'retweeted', 'id': '1079957919792353...",1079957943330627584,RT @entwinedglobal: We wish you all a prospero...,2019-01-01 04:30:12+00:00,809111804035104768,,...,"[{'start': 71, 'end': 73, 'probability': 0.917...",,,,,,,,,"[wish, prosperous, new, year, full, god's, ble..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
919524,,en,1437119038228926466,everyone,"[{'type': 'retweeted', 'id': '1437118272453885...",1437119038228926466,RT @EliseiNicole: #UK Manchester Parklife Fest...,2021-09-12 18:20:53+00:00,141679950,"[{'start': 80, 'end': 103, 'url': 'https://t.c...",...,"[{'start': 22, 'end': 49, 'probability': 0.330...",,"[{'start': 18, 'end': 21, 'tag': 'UK'}]",,,,,,,"[#uk, manchester, parklife, festival, <-url->,..."
919525,,en,1437119038228865025,everyone,,1437119038228865025,Yaqoob (ع) cried for 40 years even though Yous...,2021-09-12 18:20:53+00:00,2199814184,"[{'start': 194, 'end': 217, 'url': 'https://t....",...,"[{'start': 42, 'end': 47, 'probability': 0.848...",,"[{'start': 170, 'end': 193, 'tag': 'Leadership...",,,,,,,"[yaqoob, cried, years, even, though, yousuf, a..."
919526,1352321120368799852,en,1437119038224744449,everyone,"[{'type': 'replied_to', 'id': '143711889764425...",1437118897644257280,@sopeoIogist outro tear is yours,2021-09-12 18:20:53+00:00,1372615442993414144,,...,,,,,,,,,,"[outro, tear]"
919527,,en,1437119038220652551,everyone,,1437119038220652551,Turn on Air-Con! The current temperature is 26...,2021-09-12 18:20:53+00:00,1397006132460232705,,...,,,,,,,,,,"[turn, air-con, current, temperature, <-url->,..."


## Save DataFrame

In [6]:
# Persist the processed DataFrame (including `text_processed`) as a pickle.
# NOTE: pickle files should only ever be loaded back from trusted sources.
df.to_pickle('Your/Data/Path/sample_dataframe.pkl')