In [1]:
import datetime
import os
import statistics

import numpy as np
from tqdm import tqdm

def convert(date_time):
    """Parse a timestamp string into a naive ``datetime.datetime``.

    The expected layout is ``'%a %b %d %H:%M:%S %Y'``, optionally with a
    literal ``+0000`` offset somewhere in the string (for example
    ``"Wed Oct 10 20:19:24 +0000 2018"``).

    Args:
        date_time: Timestamp text to parse.

    Returns:
        A ``datetime.datetime`` without timezone information.

    Raises:
        ValueError: If the text, once ``+0000`` is removed, does not match
            the expected layout (this includes any other UTC offset).
    """
    # Removing the offset leaves two consecutive spaces before the year;
    # strptime treats whitespace in the pattern as "one or more", so the
    # pattern below still matches.
    without_offset = date_time.replace('+0000', "")
    return datetime.datetime.strptime(without_offset, '%a %b %d %H:%M:%S %Y')


# Free-text profile fields that are encoded only as "filled in" / "not filled in".
_PROFILE_TEXT_FIELDS = ("location", "description", "url")


def _profile_presence(value):
  """Return 1 when a text field carries a value, 0 when it is ``None`` or ``""``."""
  # A JSON ``null`` is loaded as None; comparing against "" alone would
  # wrongly report such a missing field as present.
  return 0 if value is None or value == "" else 1


def _profile_flag(value):
  """Return 1 when ``value`` equals ``True``, otherwise 0."""
  # ``== True`` rather than plain truthiness is deliberate: any other truthy
  # value (e.g. a non-empty string) keeps mapping to 0.
  return 1 if value == True else 0  # noqa: E712


def profile_vectorization(data):
  """Convert raw profile records into numeric feature vectors.

  Each vector has 18 entries, in this order:
    0-2   presence flags for ``location``, ``description``, ``url``
    3     ``protected`` flag
    4-6   ``followers_count``, ``friends_count``, ``listed_count``
    7-12  year, month, day, hour, minute, second of ``created_at``
    13    ``favourites_count``
    14    ``geo_enabled`` flag
    15    ``verified`` flag
    16    ``statuses_count``
    17    ``profile_use_background_image`` flag

  Args:
    data: Mapping from a profile key to a dict holding the fields listed
      above. Any ``".json"`` substring is removed from the key.

  Returns:
    Dict mapping the cleaned key to a list of ``np.float64`` values.

  Raises:
    KeyError: If a record lacks one of the required fields.
    ValueError: If ``created_at`` cannot be parsed by ``convert``.
  """
  embd_vectors = {}
  print("Profile Vectorization")

  for file_name, features in tqdm(data.items()):
    prof_name = file_name.replace(".json", "")

    # Text fields contribute only a 0/1 "is it filled in" signal.
    property_embedding = [
        _profile_presence(features[name]) for name in _PROFILE_TEXT_FIELDS
    ]
    property_embedding.append(_profile_flag(features["protected"]))
    property_embedding += [
        features["followers_count"],
        features["friends_count"],
        features["listed_count"],
    ]

    # The account creation time is split into its calendar components.
    date = convert(features["created_at"])
    property_embedding += [
        date.year,
        date.month,
        date.day,
        date.hour,
        date.minute,
        date.second,
    ]

    property_embedding.append(features["favourites_count"])
    property_embedding.append(_profile_flag(features["geo_enabled"]))
    property_embedding.append(_profile_flag(features["verified"]))
    property_embedding.append(features["statuses_count"])
    property_embedding.append(
        _profile_flag(features["profile_use_background_image"])
    )

    embd_vectors[prof_name] = [np.float64(x) for x in property_embedding]

  return embd_vectors

In [2]:
# Entity lists whose length is used as a feature, in output order.
_TWEET_ENTITY_COUNTS = ("hashtags", "user_mentions", "urls", "symbols")

# Number of entries produced per tweet by _tweet_feature_row:
# 6 timestamp parts + truncated + 2 counters + 3 flags + 4 entity counts + poll.
_TWEET_VECTOR_SIZE = 17


def _tweet_flag(value):
  """Return 1 when ``value`` equals ``True``, otherwise 0."""
  # ``== True`` rather than plain truthiness is deliberate: any other truthy
  # value (e.g. a non-empty string) keeps mapping to 0.
  return 1 if value == True else 0  # noqa: E712


def _tweet_feature_row(tweet):
  """Build the 17-entry numeric feature list for a single tweet dict."""
  moment = convert(tweet["created_at"])
  row = [
      moment.year,
      moment.month,
      moment.day,
      moment.hour,
      moment.minute,
      moment.second,
  ]

  row.append(_tweet_flag(tweet["truncated"]))
  row.append(tweet["retweet_count"])
  row.append(tweet["favorite_count"])
  row.extend(
      _tweet_flag(tweet[name])
      for name in ("favorited", "retweeted", "is_quote_status")
  )

  # An entity list that is absent counts as zero items.
  entities = tweet["entities"]
  row.extend(
      len(entities[name]) if name in entities else 0
      for name in _TWEET_ENTITY_COUNTS
  )
  row.append(1 if "poll" in entities and len(entities["poll"]) > 0 else 0)
  return row


def tweet_vectorization(data):
  """Average per-tweet numeric features into one vector per user.

  Every tweet is turned into 17 numbers, in this order:
    0-5    year, month, day, hour, minute, second of ``created_at``
    6      ``truncated`` flag
    7-8    ``retweet_count``, ``favorite_count``
    9-11   ``favorited``, ``retweeted``, ``is_quote_status`` flags
    12-15  number of hashtags, user mentions, urls, symbols in ``entities``
    16     1 if ``entities`` holds a non-empty ``poll``, else 0

  The user's vector is the column-wise ``statistics.mean`` over all of their
  tweets. A user whose tweet list is empty or ``None`` receives a vector of
  17 zeros so that every user ends up with a vector of the same length.

  Args:
    data: Mapping from a user key to a list of tweet dicts (or ``None``).
      Any ``".json"`` substring is removed from the key.

  Returns:
    Dict mapping the cleaned key to a list of 17 ``np.float64`` values.

  Raises:
    KeyError: If a tweet lacks one of the required fields.
    ValueError: If a tweet's ``created_at`` cannot be parsed by ``convert``.
  """
  embed_vectors = {}

  print("Tweet Vectorization")
  for file_name, tweets in tqdm(data.items()):
    prof_name = file_name.replace(".json", "")

    rows = [_tweet_feature_row(tweet) for tweet in (tweets or [])]

    if rows:
      # zip(*rows) transposes the per-tweet rows into per-feature columns.
      averaged = [statistics.mean(column) for column in zip(*rows)]
    else:
      # No tweets: fall back to zeros instead of an empty, ragged vector.
      averaged = [0] * _TWEET_VECTOR_SIZE

    embed_vectors[prof_name] = [np.float64(x) for x in averaged]
  return embed_vectors

In [6]:
import json

def data_vectorization(profs, tweets, output_dir="Non textual features"):
  """Vectorize profiles and tweets and save each result as a JSON file.

  Writes ``profiles.json`` and ``tweets.json`` (UTF-8, 4-space indent) into
  ``output_dir``. The directory is created when it does not exist yet.

  Args:
    profs: Profile mapping passed to ``profile_vectorization``.
    tweets: Tweet mapping passed to ``tweet_vectorization``.
    output_dir: Directory receiving the two JSON files.

  Returns:
    Tuple ``(profile_vectors, tweet_vectors)`` with the two mappings that
    were written to disk.
  """
  # Create the target directory up front so a missing folder is not
  # discovered only after the vectorization work has been done.
  if output_dir:
    os.makedirs(output_dir, exist_ok=True)

  prof = profile_vectorization(profs)
  # Serialize before opening the file so a serialization error cannot leave
  # a truncated file behind.
  json_obj = json.dumps(prof, indent=4)
  # os.path.join picks the separator of the running platform; a hard-coded
  # backslash is only a directory separator on Windows.
  with open(os.path.join(output_dir, "profiles.json"), 'w', encoding="utf-8") as f:
    f.write(json_obj)

  tweet = tweet_vectorization(tweets)
  json_obj = json.dumps(tweet, indent=4)
  with open(os.path.join(output_dir, "tweets.json"), 'w', encoding="utf-8") as f:
    f.write(json_obj)

  return prof, tweet


In [9]:

# Input files. os.path.join picks the separator of the running platform; a
# hard-coded backslash is only a directory separator on Windows.
tweet_path = os.path.join("Labeled Data", "user_tweet_500.json")
profile_path = os.path.join("Labeled Data", "user_profile.json")

# The `with` blocks close the files on exit, so no explicit close() is needed.
with open(profile_path, "r", encoding="utf8") as f:
  user_profile = json.load(f)

with open(tweet_path, "r", encoding="utf8") as f:
  user_tweet = json.load(f)

In [None]:
# Run both vectorization passes and write their JSON outputs to disk.
output = data_vectorization(profs=user_profile, tweets=user_tweet)