In [6]:
import json, os, glob
import pandas as pd
import feather
from collections import Counter
from pathlib import Path

# This notebook contains the processes that save each individual JSON file to its own .ftr file.
# It also contains the processes that check whether a tweet contains...
# ...or does not contain pronouns, and save the matching tweets to their respective separate files.

In [1]:
json_dir = '../jsons'
# Glob pattern that matches every JSON file inside json_dir
json_pattern = os.path.join(json_dir, '*.json')

# List of the paths of all JSON files found in json_dir
file_list = glob.glob(json_pattern)

# Display-only option; it does not depend on the file, so set it once here
pd.options.display.max_colwidth = 500

# Convert every JSON file (one JSON object per line) into a Feather file
# with the same name, written next to the input.
for file in file_list:
    print(file)
    rows = []

    # Path to save the data to: swap only the trailing extension.
    # (str.replace would rewrite every '.json' occurrence in the path.)
    save_dir = os.path.splitext(file)[0] + '.ftr'

    # JSON text is UTF-8; be explicit instead of relying on the platform's
    # default encoding, which is not UTF-8 on every system.
    with open(file, 'r', encoding='utf-8') as f:
        # NOTE: the previous version carried a `count` variable that was reset
        # to 0 on every line, so its `count > 10000` cut-off could never
        # trigger. Every line of the file is processed, as before.
        for line in f:
            # Blank lines carry no record and would make json.loads raise
            if not line.strip():
                continue
            data = json.loads(line)

            # Skip retweets, recognised by the "RT" prefix of the text.
            # NOTE(review): this also drops original tweets whose text merely
            # starts with "RT" - confirm that this is acceptable.
            if data["text"][:2] == "RT":
                continue

            # username
            user_id = data["user"]["screen_name"]
            # location
            user_location = data["user"]["location"]
            # Tweet ID
            tweet_id = data["id_str"]
            # Hashtags
            hashtags = data["entities"]["hashtags"]
            # tweet text: use the full text when the record provides one
            if "extended_tweet" in data:
                text = data["extended_tweet"]["full_text"]
            else:
                text = data["text"]
            rows.append((user_id, user_location, tweet_id, hashtags, text))

    # The input file is closed at this point; build the table and write it out
    datas = pd.DataFrame(rows, columns=["user_id", "user_location", "tweet_id", "hashtags", "text"])
    datas.to_feather(save_dir)
    print("DONE")

../jsons\2020-03-25.json
DONE
../jsons\2020-03-26.json
DONE
../jsons\2020-03-27.json
DONE
../jsons\2020-03-28.json
DONE
../jsons\2020-03-29.json
DONE
../jsons\2020-03-30.json
DONE
../jsons\2020-03-31.json
DONE
../jsons\2020-04-01.json
DONE


The following cells create separate files of tweets: one set for tweets that contain pronouns and one set for tweets that do not.

In [3]:
import re

In [1]:
# Checks for pronouns
def whole_word_checker(pronouns):
    """Build a case-insensitive whole-word searcher for ``pronouns``.

    ``pronouns`` is inserted unescaped into a regular expression, wrapped in
    a capturing group between word-boundary anchors, so any regex syntax it
    contains (for example ``|``) keeps its special meaning.

    Returns the bound ``search`` method of the compiled pattern: call it with
    a string to get a match object, or ``None`` when nothing matches.
    """
    template = r'\b({0})\b'
    compiled = re.compile(template.format(pronouns), re.IGNORECASE)
    return compiled.search

In [4]:
def pronoun_checker(text):
    """Report whether ``text`` contains at least one word from the vocabulary.

    Each vocabulary entry is looked up as a whole word, case-insensitively,
    in the lower-cased ``text`` via ``whole_word_checker``.

    Parameters:
        text: the string to inspect (must provide ``.lower()``).

    Returns:
        True if any vocabulary entry is found, otherwise False.
    """
    # The contracted forms are already covered by their base pronoun, because
    # an apostrophe counts as a word boundary; the list is kept as it was.
    vocab = ["i", "me", "you", "we", "they", "my", "us", "you’re", "I'm", "we're", "they're", "you've", "they've", "you'll", "i'll", "we'll", "they'll"]
    lowered = text.lower()

    # any() stops at the first hit instead of scanning for every entry and
    # counting the misses afterwards.
    return any(whole_word_checker(word)(lowered) is not None for word in vocab)

# The following process finds all tweets and checks if they contain pronouns.
# If the user wants pronouns, change the 'pronouns' variable at the top of the cell to True.
# If the user does not want pronouns, change the 'pronouns' variable at the top of the cell to False

In [8]:
# Set to True to keep only the tweets that contain a pronoun,
# or to False to keep only the tweets that contain none.
pronouns = False

# Suffix (including the extension) that replaces '.json' in each output name
if pronouns:
    additional_dir = '-with-pronouns.ftr'
else:
    additional_dir = '-without-pronouns.ftr'
print(additional_dir)
json_dir = '../jsons'
# Glob pattern that matches every JSON file inside json_dir
json_pattern = os.path.join(json_dir, '*.json')

# List of the paths of all JSON files found in json_dir
file_list = glob.glob(json_pattern)

# Display-only option; it does not depend on the file, so set it once here
pd.options.display.max_colwidth = 500

# For every JSON file (one JSON object per line), write a Feather file that
# holds only the tweets whose pronoun check equals the `pronouns` setting.
for file in file_list:
    print(file)
    rows = []

    # Path to save the data to: swap only the trailing extension.
    # (str.replace would rewrite every '.json' occurrence in the path.)
    save_dir = os.path.splitext(file)[0] + additional_dir

    # JSON text is UTF-8; be explicit instead of relying on the platform's
    # default encoding, which is not UTF-8 on every system.
    with open(file, 'r', encoding='utf-8') as f:
        # NOTE: the previous version carried a `count` variable that was reset
        # to 0 on every line, so its `count > 10000` cut-off could never
        # trigger. Every line of the file is processed, as before.
        for line in f:
            # Blank lines carry no record and would make json.loads raise
            if not line.strip():
                continue
            data = json.loads(line)

            # Skip retweets, recognised by the "RT" prefix of the text.
            # NOTE(review): this also drops original tweets whose text merely
            # starts with "RT" - confirm that this is acceptable.
            if data["text"][:2] == "RT":
                continue

            # username
            user_id = data["user"]["screen_name"]
            # location
            user_location = data["user"]["location"]
            # Tweet ID
            tweet_id = data["id_str"]
            # Hashtags
            hashtags = data["entities"]["hashtags"]
            # tweet text: use the full text when the record provides one
            if "extended_tweet" in data:
                text = data["extended_tweet"]["full_text"]
            else:
                text = data["text"]

            # Keep the tweet only when its pronoun status is the requested one
            if pronoun_checker(text) == pronouns:
                rows.append((user_id, user_location, tweet_id, hashtags, text))

    # The input file is closed at this point; build the table and write it out
    datas = pd.DataFrame(rows, columns=["user_id", "user_location", "tweet_id", "hashtags", "text"])
    datas.to_feather(save_dir)
    print("DONE", save_dir)

-without-pronouns.ftr
../jsons\2020-03-25.json
DONE ../jsons\2020-03-25-without-pronouns.ftr
../jsons\2020-03-26.json
DONE ../jsons\2020-03-26-without-pronouns.ftr
../jsons\2020-03-27.json
DONE ../jsons\2020-03-27-without-pronouns.ftr
../jsons\2020-03-28.json
DONE ../jsons\2020-03-28-without-pronouns.ftr
../jsons\2020-03-29.json
DONE ../jsons\2020-03-29-without-pronouns.ftr
../jsons\2020-03-30.json
DONE ../jsons\2020-03-30-without-pronouns.ftr
../jsons\2020-03-31.json
DONE ../jsons\2020-03-31-without-pronouns.ftr
../jsons\2020-04-01.json
DONE ../jsons\2020-04-01-without-pronouns.ftr
