In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import logging
import os
import re
import json
## Customized modules
from ContentParser import ContentParser
## Single ContentParser instance reused by the cells below to load retweets and derive hashtag features
content_parser = ContentParser()

# Hashtags Generation

In [5]:
## Where the retweets are read from and where the generated artefacts are written
path_of_data = "../Retweets/"
save_path = "../Hashtags_Covariates/"
whether_save = True
## Snapshot dates used to fit the hashtag clustering (presumably MM-DD; confirm against the data files)
dates_to_train = [
    "04-13", "04-22", "04-30", "05-08", "05-16",
    "05-25", "06-02", "06-10", "06-14", "06-22",
]

## Read the retweets for every training date
retweets_dict = content_parser.read_retweets_for_dates(path_of_data, dates_to_train)

In [6]:
## tweets_hashtags holds the hashtags extracted for each tweet in the loaded retweets
tweets_hashtags = content_parser.extract_hashtags_for_tweets(retweets_dict)
## hashtags_dic is a dictionary of hashtags and their frequencies;
## the next cell ranks and filters hashtags by these counts
hashtags_dic = content_parser.generate_hashtags_set(retweets_dict)

In [7]:
## Rank every (hashtag, frequency) pair from most to least frequent
sorted_hashtags = sorted(hashtags_dic.items(), key=lambda pair: pair[1], reverse=True)
## Keep only the pairs whose frequency is strictly greater than 60; the ranking order is preserved
sorted_hashtags_more_than_60 = list(filter(lambda pair: pair[1] > 60, sorted_hashtags))
## Hashtag names alone, in the same ranked order
hashtags_list = [name for name, _frequency in sorted_hashtags_more_than_60]
## Number of hashtags that passed the frequency filter
len(sorted_hashtags_more_than_60)

878

In [7]:
## Generate the matrix for the retained hashtags (frequency more than 60),
## where each row is a hashtag from hashtags_list and each column is a tweet
matrix = content_parser.generate_matrix(tweets_hashtags, hashtags_list)
## Printed shape reads as (number of hashtags, number of tweets)
print(matrix.shape)

(878, 159822)


In [8]:
## Train the clustering model
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
## Hierarchical clustering of the matrix rows (one row per hashtag) using complete linkage
## over pairwise Yule dissimilarity; Z is the resulting linkage matrix consumed by fcluster below
## NOTE(review): Yule is a boolean-vector metric — confirm generate_matrix returns 0/1 entries
Z = linkage(matrix, method='complete', metric='yule')

In [11]:
## Set the distance and generate clusters
## With criterion='distance', observations end up in the same flat cluster only if their
## cophenetic distance is no greater than this threshold
distance_threshold = 1
## results holds one flat-cluster id per clustered row (i.e. per hashtag)
results = fcluster(Z, t=distance_threshold, criterion='distance')
## Combine the cluster ids with the hashtag names into groups (a 'group1' entry is read in the next cell)
groups = content_parser.generate_topics_clusters(results, hashtags_list)

In [26]:
## groups example
## (not the last expression of the cell, so Jupyter does not echo it; it still fails fast
## if clustering produced no 'group1' entry)
groups['group1']
## Create the output directory, including any missing parents; no error if it already exists
os.makedirs(save_path, exist_ok=True)
## Save the groups clustering result of hashtags
## (os.path.join keeps the path valid whether or not save_path ends with a separator)
with open(os.path.join(save_path, 'groups_clustering.json'), 'w') as f:
    # The with-block closes the file on exit, so no explicit close() is needed
    json.dump(groups, f)
## Save a full list of hashtags
with open(os.path.join(save_path, 'hashtags_list.json'), 'w') as f:
    json.dump(hashtags_list, f)

## Generate Hashtag Covariates

In [14]:
## Full set of dates, kept for reference; swap it in to process every snapshot instead of one
# dates_to_include = ["04-22", "04-26","05-08", "05-12", "05-16", "05-21", "05-25", "05-29","06-02", "06-06", "06-10", "06-14", "06-18", "06-22"]
dates_to_include =  ["04-30"]
path_of_data = "../Retweets/"
whether_save = True
save_path = "../Hashtags_Covariates/"

## Load Generated groups hashtags
## (os.path.join keeps the paths valid whether or not save_path ends with a separator;
## the with-blocks close the files on exit, so no explicit close() is needed)
with open(os.path.join(save_path, 'groups_clustering.json'), "r") as f:
    groups = json.load(f)
with open(os.path.join(save_path, 'hashtags_list.json'), "r") as f:
    hashtags_list = json.load(f)
## Load data
retweets_dict = content_parser.read_retweets_for_dates(path_of_data, dates_to_include)

In [13]:
## Generate Benchmark Vector
benchmark = content_parser.generate_vector_groups(groups, hashtags_list)
## The root logger drops INFO records by default, so the per-date progress message below
## would never be shown; enable INFO output (a no-op if logging is already configured)
logging.basicConfig(level=logging.INFO)
## Column labels shared by every per-date table; identical for all dates, so built once
group_names = list(benchmark.keys())
## Loop through data
for date in dates_to_include:
    dic_of_users = content_parser.extract_hashtags_for_tweets_and_users(retweets_dict[date])
    result = content_parser.generate_users_results(dic_of_users, hashtags_list, benchmark)
    result_matrix = content_parser.generate_matrix_for_users_and_groups(result, group_names)
    ## One row per key of result and one column per benchmark group;
    ## the row labels are then moved into a regular 'user' column
    result_df = (
        pd.DataFrame(result_matrix, index=result.keys(), columns=group_names)
        .reset_index()
        .rename(columns={'index': 'user'})
    )
    if whether_save:
        ## os.path.join keeps the path valid whether or not save_path ends with a separator
        result_df.to_csv(os.path.join(save_path, f'{date}_hashtags_covariates.csv'), index=False)
    logging.info(f'{date} hashtags preprocessing is done')