In [1]:
import torch
import torch.nn.functional as F
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import networkx as nx
import plotly.graph_objects as go
import os

from sklearn.model_selection import StratifiedKFold
from torch_geometric.data import Data
from torch.utils.data import DataLoader
from torch_geometric.nn.models import Node2Vec
from torch_geometric.nn import GCNConv, GATConv, SAGEConv, GINConv
from torch.optim import Adam
from torch.nn import NLLLoss
from collections import defaultdict
from dateutil import parser

import ast
# import preprocessing
# import utils

torch.manual_seed(0)
np.random.seed(0)

profiles = pd.read_csv("../data/profiles.csv", usecols=["profile_username", "profile_followed_by", "profile_follow", 
                                                        "medias_nb", "comments_nb", "comments_commenters_nb", 
                                                        "comments_self_nb", "category_1"])
comments = pd.read_csv("../data/comments.csv")

In [2]:
profiles["is_tracked"] = 1

In [3]:
threshold = 200

In [4]:
final_df = comments[["media_author", "commenter"]]\
                .groupby("commenter", as_index=False)\
                .agg("count")\
                .sort_values("commenter")
final_df.columns = ["commenter", "comments_nb"]
final_df = final_df[(final_df.comments_nb >= threshold) | final_df.commenter.isin(profiles.profile_username)]

remaining_tracked_users = profiles[~profiles.profile_username.isin(final_df.commenter)][["profile_username"]]
remaining_tracked_users["comments_nb"] = 0
remaining_tracked_users.columns = ["commenter", "comments_nb"]

In [5]:
final_df = pd.concat([final_df, remaining_tracked_users])

In [6]:
print("The number of commenters: {}".format(len(final_df)))
comments = comments[comments.commenter.isin(final_df.commenter)]
print("The current number of interactions: {}".format(len(comments)))

The number of commenters: 1779
The current number of interactions: 594762


In [7]:
tmp = comments[["category_1", "commenter"]]\
            .groupby("commenter", as_index=False)\
            .agg({"category_1": lambda col: col.nunique()})\
            .sort_values("commenter")

to_categories_nb = {commenter: categories_nb for commenter, categories_nb in tmp.values}
final_df["categories_nb"] = final_df.commenter.apply(lambda commenter: to_categories_nb.get(commenter, 0))

del tmp, to_categories_nb

In [8]:
tmp = comments[["commenter", "comment_tags"]]
tmp["comment_tags"] = tmp.comment_tags.apply(lambda x: len(ast.literal_eval(x)))
tmp = tmp.groupby("commenter", as_index=False)\
            .agg("mean")\
            .sort_values("commenter")

to_avg_tags_nb = {commenter: avg_tags_nb for commenter, avg_tags_nb in tmp.values}
final_df["avg_tags_nb"] = final_df.commenter.apply(lambda commenter: to_avg_tags_nb.get(commenter, 0))

del tmp, to_avg_tags_nb



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [9]:
tmp = comments[["commenter", "comment_mentioned_usernames"]]
tmp["comment_mentioned_usernames"] = tmp.comment_mentioned_usernames.apply(lambda x: len(ast.literal_eval(x)))
tmp = tmp.groupby("commenter", as_index=False)\
            .agg("mean")\
            .sort_values("commenter")

to_avg_cited_users = {commenter: avg_cited_users for commenter, avg_cited_users in tmp.values}
final_df["avg_cited_users"] = final_df.commenter.apply(lambda commenter: to_avg_cited_users.get(commenter, 0))

del tmp, to_avg_cited_users



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [10]:
tmp = comments[["commenter", "comment_text"]]
tmp["comment_text"] = tmp.comment_text.apply(lambda x: len(x))
tmp = tmp.groupby("commenter", as_index=False)\
            .agg("mean")\
            .sort_values("commenter")

to_avg_msg_len = {commenter: avg_msg_len for commenter, avg_msg_len in tmp.values}
final_df["avg_msg_len"] = final_df.commenter.apply(lambda commenter: to_avg_msg_len.get(commenter, 0))

del tmp, to_avg_msg_len



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [11]:
def update_dict(to_weekday, commenter, day, freq):
    if to_weekday[commenter][1] < freq:
        to_weekday[commenter] = (day, freq)

In [12]:
tmp = comments[["commenter", "comment_created_time_str"]]
tmp["day"] = tmp.comment_created_time_str.apply(lambda item: parser.parse(item).weekday())
tmp = tmp.groupby(["commenter", "day"], as_index=False).agg("count")

to_weekday = {}
for commenter, day, freq in tmp.values:
    if not commenter in to_weekday:
        to_weekday[commenter] = (0, 0)
    
    update_dict(to_weekday, commenter, day, freq)
    
final_df["weekday"] = final_df.commenter.apply(lambda commenter: to_weekday.get(commenter, [0])[0])

del tmp, to_weekday



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [13]:
final_df

Unnamed: 0,commenter,comments_nb,categories_nb,avg_tags_nb,avg_cited_users,avg_msg_len,weekday
687,0591_aleinad,259,1,0.000000,0.814672,196.810811,0
1747,102.7245,218,2,0.000000,0.059633,36.885321,6
2210,11luisella,397,1,0.020151,0.020151,46.188917,5
2445,12nycole,222,2,1.648649,1.500000,203.639640,5
2512,1340cc.engine,319,2,0.000000,0.000000,72.768025,2
...,...,...,...,...,...,...,...
253,deboraserracchiani,0,0,0.000000,0.000000,0.000000,0
259,pdroma,0,0,0.000000,0.000000,0.000000,0
260,associazionerousseau,0,0,0.000000,0.000000,0.000000,0
261,mauriziogasparri_official,0,0,0.000000,0.000000,0.000000,0


In [14]:
tmp = comments[["commenter", "comment_created_time_str"]]
tmp["hour"] = tmp.comment_created_time_str.apply(lambda item: parser.parse(item).hour)
tmp = tmp.groupby(["commenter", "hour"], as_index=False).agg("count")

to_hour = {}
for commenter, hour, freq in tmp.values:
    if not commenter in to_hour:
        to_hour[commenter] = (0, 0)
    
    update_dict(to_hour, commenter, hour, freq)
    
final_df["hour"] = final_df.commenter.apply(lambda commenter: to_hour.get(commenter, [0])[0])

del tmp, to_hour



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [15]:
final_df["is_tracked"] = 0

In [16]:
final_df.head(5)

Unnamed: 0,commenter,comments_nb,categories_nb,avg_tags_nb,avg_cited_users,avg_msg_len,weekday,hour,is_tracked
687,0591_aleinad,259,1,0.0,0.814672,196.810811,0,16,0
1747,102.7245,218,2,0.0,0.059633,36.885321,6,12,0
2210,11luisella,397,1,0.020151,0.020151,46.188917,5,12,0
2445,12nycole,222,2,1.648649,1.5,203.63964,5,12,0
2512,1340cc.engine,319,2,0.0,0.0,72.768025,2,9,0


In [17]:
cols = set(final_df.columns).union(profiles.columns)
for col in cols:
    if not col in final_df.columns and col != "profile_username":
        final_df[col] = 0
    if not col in profiles.columns and col != "commenter":
        profiles[col] = 0

n_features = len(set(final_df.columns).union(profiles.columns))

In [18]:
final_df.rename(columns={'commenter':'profile_username'}, inplace=True)
final_df = final_df.reindex(sorted(final_df.columns), axis=1)
profiles = profiles.reindex(sorted(profiles.columns), axis=1)

cols = profiles.columns

In [19]:
final_df.head(1)

Unnamed: 0,avg_cited_users,avg_msg_len,avg_tags_nb,categories_nb,category_1,comments_commenters_nb,comments_nb,comments_self_nb,hour,is_tracked,medias_nb,profile_follow,profile_followed_by,profile_username,weekday
687,0.814672,196.810811,0.0,1,0,0,259,0,16,0,0,0,0,0591_aleinad,0


In [20]:
user_dict = {values[-2]: values for values in profiles.values}

In [21]:
remaining_users = list(set(profiles.profile_username) - set(final_df.profile_username))

profiles = profiles[profiles.profile_username.isin(remaining_users)].values
final_df = final_df.values

In [22]:
def merge_row(row1, row2):
    for i in range(row1.size):
        if type(row2[i]) is str:
            row1[i] = row2[i]
        elif type(row1[i]) is str:
            continue
        elif row1[i] == 0:
            row1[i] += row2[i]

In [23]:
for row in final_df:
    name = row[-2]
    if name in user_dict:
        merge_row(row, user_dict[name])

list(final_df).append(profiles)

In [24]:
final_df = pd.DataFrame(final_df, columns=cols)

In [25]:
final_df.head(1)

Unnamed: 0,avg_cited_users,avg_msg_len,avg_tags_nb,categories_nb,category_1,comments_commenters_nb,comments_nb,comments_self_nb,hour,is_tracked,medias_nb,profile_follow,profile_followed_by,profile_username,weekday
0,0.814672,196.811,0,1,0,0,259,0,16,0,0,0,0,0591_aleinad,0


In [26]:
final_df.to_csv("../data/new_profiles_{}t.csv".format(threshold))

In [27]:
users = final_df.profile_username.values
comments = pd.read_csv("../data/comments.csv", usecols=["media_short_code", "media_author", "commenter"])
comments = comments[(comments.commenter.isin(users)) & (comments.media_author.isin(users))]

In [31]:
comments.head(1)

Unnamed: 0,media_short_code,media_author,commenter
682,Bvs4Nh_HT1L,chefaticalavitadabomber,marchi93


In [28]:
comments.to_csv("../data/new_comments_{}t.csv".format(threshold))