In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import feather
import pickle
from scipy import stats

In [4]:
# Load the label table from a Feather file in the working directory.
label_members = feather.read_dataframe("./label_members.feather")

In [5]:
# Preview the first rows of the label table.
label_members.head()

Unnamed: 0,label_id,label_name,count,G0,G1,G2
0,0,Games,859519,0,0,0
1,1,Vehicle,678257,3,1,0
2,2,Video game,518981,0,0,0
3,3,Concert,494707,2,2,1
4,4,Car,371391,3,1,0


In [6]:
# Top 10 labels in each cluster: for every value of G0, keep the label_ids of
# the 10 rows with the highest `count`.
pd.set_option('display.max_colwidth', 40)
label_association = label_members.groupby('G0').apply(
    # Plain column selection + head() replaces the removed `.ix` indexer.
    lambda tbl: tbl.sort_values('count', ascending=False)['label_id'].head(10).unique()
)
print(label_association)
# Number of labels kept per cluster (fewer than 10 for small clusters).
print([len(x) for x in label_association])

G0
0      [0, 2, 19, 33, 34, 35, 43, 51, 55, 96]
1     [36, 53, 59, 67, 69, 74, 84, 104, 10...
2        [3, 5, 7, 9, 13, 14, 16, 20, 24, 25]
3      [1, 4, 11, 17, 27, 40, 41, 57, 70, 94]
4     [6, 15, 28, 45, 56, 61, 75, 78, 85, 89]
5     [8, 21, 54, 60, 65, 73, 77, 76, 79, 81]
6     [10, 18, 22, 42, 48, 71, 80, 83, 86,...
7         [12, 26, 32, 52, 58, 109, 179, 184]
8     [23, 29, 37, 47, 72, 101, 131, 141, ...
9     [50, 82, 92, 102, 129, 139, 149, 151...
10          [62, 64, 103, 119, 143, 152, 161]
dtype: object
[10, 10, 10, 10, 10, 10, 10, 8, 10, 10, 7]


In [8]:
# Load the pickled video sample. The context manager closes the file handle
# once loading finishes (the bare open() call left it open).
# NOTE(review): unpickling can execute arbitrary code; only load a trusted file.
with open("./grouped_sample_larger.pkl", "rb") as sample_file:
    grouped_sample = pickle.load(sample_file)

In [9]:
# Preview the first rows of the video sample.
grouped_sample.head()

Unnamed: 0,video_id,duration,title,channel_id,channel_title,published_at,views,likes,dislikes,comments,...,1,2,3,4,5,6,7,8,9,10
0,--DwgB78t-c,00:02:36,Fruto Dulce - Geraldine Rojas & Ezequiel Paludi 3,UCdzU3DSGzyWzN2118yd9X9g,Abdullah AYDIN,2012-01-19 18:20:02,1671,10,1,0,...,0,1,0,0,0,0,0,0,0,0
1,--NZRkXBV7k,00:04:53,Carlos Santana presents Cindy Blackman in Mexico City,UC0UnhAG47DRyVZGVcbhAXhQ,Saul TP,2015-03-30 04:04:40,9994,41,3,2,...,0,1,0,0,0,0,0,0,0,0
2,--hoQ2sGG4M,00:04:35,"Taoist monk Tian Xin Shan in Brazil,Wudang Sanfeng T'ai Chi exercises",UCXjtAvK5P3wXBGh0vbGylzg,TheTVtaoista,2009-08-07 06:36:33,3407,8,1,2,...,0,0,0,0,0,0,0,0,0,0
3,--sBoaqBlzA,00:08:19,"MAKEUP FOR GUYS: Actors, TV Hosts, On Camera Experts",UCeKHMeUlcLNPLCLUfZUQI2w,Tiffany Hendra,2011-08-04 00:47:49,15816,120,2,8,...,0,1,0,1,0,0,0,0,0,0
4,--7h1S4neDM,00:03:19,Pet Rescue Saga Level 539 2 Stars No Boosts,UCNWPDyaWf2eAHnofFLSnEMg,All Gamers,2014-04-29 15:44:44,2167,1,0,0,...,0,0,0,0,0,1,0,0,0,0


### overall stats

In [10]:
# Display names for the clusters, indexed by cluster number; the indicator
# column for cluster n in grouped_sample is named str(n).
group_names = ["Gaming", "Misc. Hobbies", "Music & Entertainment", "Vehicles", "Animation & Fashion",
               "Sports", "Outdoors & Pets", "Food & Cooking", "Gadgets", "Aircraft", "Transportation"]

# Fraction of observations stats.trim_mean cuts from each end before averaging.
trim_fraction = 0.1

records = []
# Iterating over group_names keeps the loop in step with the name list
# instead of relying on a hard-coded cluster count.
for n, group_name in enumerate(group_names):
    # Boolean-mask selection with .loc replaces the removed `.ix` indexer.
    selected = grouped_sample.loc[grouped_sample[str(n)] == 1]
    records.append([
        group_name,
        stats.trim_mean(selected['views'], trim_fraction),
        # likes / dislikes / comments are averaged over non-negative values only.
        stats.trim_mean(selected.query('likes >= 0')['likes'], trim_fraction),
        stats.trim_mean(selected.query('dislikes >= 0')['dislikes'], trim_fraction),
        stats.trim_mean(selected.query('comments >= 0')['comments'], trim_fraction),
        # Dividing by a one-second timedelta expresses the duration in seconds.
        stats.trim_mean(selected['duration'] / np.timedelta64(1, 's'), trim_fraction),
    ])

# One row per cluster: trimmed means of the engagement metrics and duration.
rows = pd.DataFrame(records,
                    columns=['group', 'views', 'likes', 'dislikes', 'comments', 'seconds'])

In [11]:
# Display the per-cluster summary table.
rows

Unnamed: 0,group,views,likes,dislikes,comments,seconds
0,Gaming,8879.36579,53.12589,4.692813,16.508019,266.098315
1,Misc. Hobbies,7937.352481,24.257137,1.730209,5.356456,234.251362
2,Music & Entertainment,9422.091533,35.49102,2.182994,5.905084,240.546259
3,Vehicles,8006.065804,18.596162,1.889405,5.342205,239.049228
4,Animation & Fashion,12431.158557,59.937868,4.253274,12.065301,250.909246
5,Sports,7127.627711,26.339364,1.826721,5.806965,241.045274
6,Outdoors & Pets,8063.466156,21.737077,1.827521,4.914798,235.417117
7,Food & Cooking,12533.840344,55.487546,4.453612,9.34676,251.064214
8,Gadgets,11697.074938,41.11829,6.386471,15.520255,256.019525
9,Aircraft,6617.700793,17.505807,1.532396,5.651761,244.133683


In [12]:
# Write the per-cluster summary as a JSON array with one object per row.
# NOTE(review): presumably requires the ./stats directory to exist already — confirm.
rows.to_json("./stats/overall.json", orient='records')

### individual clusters

In [13]:
def _parse_label_ids(raw):
    """Turn a whitespace-separated string of label ids into a list of ints.

    Values that are not strings are returned unchanged, so this cell can be
    re-run after the 'labels' column has already been replaced by parsed
    lists. Splitting on any whitespace (rather than a single space) also
    avoids int('') errors on empty strings or repeated spaces.
    """
    if not isinstance(raw, str):
        return raw
    return [int(token) for token in raw.split()]


# Parsed label ids for every video in the sample.
labels_num = grouped_sample['labels'].apply(_parse_label_ids)

In [14]:
# Overwrite the 'labels' column in place with the parsed integer lists.
grouped_sample['labels'] = labels_num

In [15]:
# For each cluster, compute trimmed-mean statistics per top label and write
# them to ./stats/g_<cluster>.json.
# NOTE(review): only clusters 0-9 are processed although label_association
# holds 11 clusters (0-10) — confirm that skipping cluster 10 is intended.
for i in range(10):
    cluster_label_ids = label_association[i]

    # Videos flagged as belonging to cluster i (indicator column named str(i)).
    sub = grouped_sample.loc[grouped_sample[str(i)] == 1]

    # One boolean vector per video: which of the cluster's top labels it carries.
    associated = sub['labels'].apply(lambda s: np.in1d(cluster_label_ids, s))
    # Keep only the videos that carry at least one of those labels.
    selected_ids = associated.apply(any)
    # Non-inplace reset avoids mutating a slice of grouped_sample.
    sub_selected = sub.loc[selected_ids].reset_index(drop=True)

    # Label names for the cluster's top label ids. `.loc` replaces the removed
    # `.ix`; like the original lookup, this assumes label_members' row index
    # coincides with label_id.
    selected_labels = label_members.loc[cluster_label_ids, 'label_name'].copy()
    # Disambiguate repeated names: every repeat gets '_2'; names that still
    # collide after that additionally get '_3' (e.g. 'x', 'x_2', 'x_2_3').
    for suffix in ('_2', '_3'):
        repeated = selected_labels.duplicated()
        selected_labels[repeated] = selected_labels[repeated] + suffix

    # Boolean membership matrix (videos x labels), built in a single stack
    # instead of concatenating one Series per video.
    assoc_df = pd.DataFrame(np.vstack(associated[selected_ids].tolist()),
                            columns=list(selected_labels))

    # The first 10 columns (video_id ... comments) are selected by position,
    # matching the positional meaning the original `.ix[:, :10]` slice had.
    sub_full = pd.concat([sub_selected.iloc[:, :10], assoc_df], axis=1).fillna(0)

    group_names = list(selected_labels)
    records = []
    for group_name in group_names:
        # Rows where this label's membership flag is True.
        selected = sub_full.loc[sub_full[group_name]]
        records.append([
            group_name,
            stats.trim_mean(selected['views'], 0.1),
            # likes / dislikes / comments are averaged over non-negative values only.
            stats.trim_mean(selected.query('likes >= 0')['likes'], 0.1),
            stats.trim_mean(selected.query('dislikes >= 0')['dislikes'], 0.1),
            stats.trim_mean(selected.query('comments >= 0')['comments'], 0.1),
            # Dividing by a one-second timedelta expresses the duration in seconds.
            stats.trim_mean(selected['duration'] / np.timedelta64(1, 's'), 0.1),
        ])
    rows = pd.DataFrame(records,
                        columns=['group', 'views', 'likes', 'dislikes', 'comments', 'seconds'])
    rows.to_json("./stats/g_{0}.json".format(i), orient='records')