In [1]:
import json
import pandas as pd
import numpy as np
import yaml

In [2]:
def clean_data(json_file):
    with open(json_file) as data_file:    
        data = yaml.safe_load(data_file)
    num_items = len(data['items'])
    df_s = []
    for item in data['items']:
        df = pd.DataFrame(item)
        df = df.drop(['etag','kind'],1)
        df = df.transpose()
        df['video_id'] = df.iloc[0,0]
        df.reset_index(inplace=True, drop=True)
        df = df.drop(df.index[[0]])
        df_s.append(df)
    
    #more data
    df_s.extend(df_s)
    df_s.extend(df_s)
    df_s.extend(df_s)
    data = pd.concat(df_s)
    data.reset_index(inplace = True, drop = True)
    return data

data = clean_data('example_content.json')
print data.shape

(48, 11)


In [3]:
'''
Labels:
1 - joy
2 - surprise
3 - fear
4 - sadness
-1 - unknown
'''

labels = []
num_labels = 4
for i in range(data.shape[0]):
    if i < 6:
        labels.append(i % num_labels+1)
    else:
        labels.append(-1)
    
data['labels'] = labels
data.head(2)

Unnamed: 0,categoryId,channelId,channelTitle,description,liveBroadcastContent,localized,publishedAt,tags,thumbnails,title,video_id,labels
0,1,UCxijiMU6XtEJpw29tTIK8Zw,ThomasNoakes,Finger Cleaner made the top 5 finalists in the...,none,{u'description': u'Finger Cleaner made the top...,2013-11-25T04:14:29.000Z,"[Crash The Super Bowl, Doritos (Brand), 2014, ...",{u'default': {u'url': u'https://i.ytimg.com/vi...,"DORITOS ""FINGER CLEANER"" - 2014 CRASH THE SUPE...",ugo7Y2lRsxc,1
1,24,UCz6CZ_MxRK_8ke5zmSRCWYA,BestCodTrolls,Wish the Pats wouldve won :l\nM&Ms Super Bowl ...,none,{u'description': u'Wish the Pats wouldve won :...,2012-02-06T03:53:08.000Z,"[2012, super, bowl, giants, patriots, m&ms, co...",{u'default': {u'url': u'https://i.ytimg.com/vi...,"M&Ms Super Bowl Commercial 2012 ""I'm Sexy and ...",Pc7BnT5X1tw,2


In [4]:
import re 
def split_words(words):
    return re.sub( r"([A-Z])", r" \1", words)

data['channelTitle'] = data['channelTitle'].apply(split_words)

data['publishedAt'] = pd.to_datetime(data['publishedAt'])
data['publishedAt_month'] = data['publishedAt'].apply(lambda x: x.month)
data['publishedAt_year'] = data['publishedAt'].apply(lambda x: x.year)

def extract_str_from_list(column):
    return ' '.join(column)
    
data['tags'] = data['tags'].map(extract_str_from_list)

In [5]:
# 'localized' column includes title and description
data = data.drop(['channelId', 'liveBroadcastContent', 'localized', 'thumbnails', 'publishedAt'], 1)
data.head(2)

Unnamed: 0,categoryId,channelTitle,description,tags,title,video_id,labels,publishedAt_month,publishedAt_year
0,1,Thomas Noakes,Finger Cleaner made the top 5 finalists in the...,Crash The Super Bowl Doritos (Brand) 2014 Comm...,"DORITOS ""FINGER CLEANER"" - 2014 CRASH THE SUPE...",ugo7Y2lRsxc,1,11,2013
1,24,Best Cod Trolls,Wish the Pats wouldve won :l\nM&Ms Super Bowl ...,2012 super bowl giants patriots m&ms commercia...,"M&Ms Super Bowl Commercial 2012 ""I'm Sexy and ...",Pc7BnT5X1tw,2,2,2012


In [6]:
data['text'] =  data['channelTitle'] + data['description'] + data['tags'] + data['title']
data = data.drop(['channelTitle', 'description', 'tags', 'title'],1)

In [7]:
data = data[['labels', 'video_id', 'text', 'categoryId', 'publishedAt_month', 'publishedAt_year']]
data.head(2)

Unnamed: 0,labels,video_id,text,categoryId,publishedAt_month,publishedAt_year
0,1,ugo7Y2lRsxc,Thomas NoakesFinger Cleaner made the top 5 fi...,1,11,2013
1,2,Pc7BnT5X1tw,Best Cod TrollsWish the Pats wouldve won :l\n...,24,2,2012


In [8]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
vectorizer.fit(data.text)

# Transform to a counter
a = vectorizer.transform(data.text).toarray()

In [9]:
vectorizer.get_feature_names()[:6]

[u'1st', u'2012', u'2013', u'2014', u'2015', u'2016']

In [10]:
data = pd.concat([data, pd.DataFrame(a)],1)
data = data.drop(['text'], 1)
data.head(2)

Unnamed: 0,labels,video_id,categoryId,publishedAt_month,publishedAt_year,0,1,2,3,4,...,343,344,345,346,347,348,349,350,351,352
0,1,ugo7Y2lRsxc,1,11,2013,1,0,0,3,0,...,1,1,0,0,1,1,0,1,0,0
1,2,Pc7BnT5X1tw,24,2,2012,0,2,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [11]:
from sklearn import datasets
from sklearn.semi_supervised import LabelSpreading
from sklearn.cross_validation import train_test_split

#train, test = train_test_split(data, test_size=0.33, random_state=42)
train = data.ix[:3,:]
test = data.ix[6:,:]

label_prop_model = LabelSpreading()
label_prop_model.fit(train.ix[:,2:], train['labels'])

LabelSpreading(alpha=0.2, gamma=20, kernel='rbf', max_iter=30, n_neighbors=7,
        tol=0.001)

In [12]:
print label_prop_model.predict(train.ix[:,2:])
print '-------------------'
label_prop_model.predict_proba(train.ix[:,2:])

[1 2 3 4]
-------------------


array([[ 1.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.],
       [ 0.,  0.,  1.,  0.],
       [ 0.,  0.,  0.,  1.]])

In [13]:
print label_prop_model.predict(test.ix[:,2:])
print '-------------------'
label_prop_model.predict_proba(test.ix[:,2:])

[1 2 3 4 1 1 1 2 3 4 1 1 1 2 3 4 1 1 1 2 3 4 1 1 1 2 3 4 1 1 1 2 3 4 1 1 1
 2 3 4 1 1]
-------------------


  probabilities /= normalizer


array([[  1.,   0.,   0.,   0.],
       [  0.,   1.,   0.,   0.],
       [  0.,   0.,   1.,   0.],
       [  0.,   0.,   0.,   1.],
       [ nan,  nan,  nan,  nan],
       [ nan,  nan,  nan,  nan],
       [  1.,   0.,   0.,   0.],
       [  0.,   1.,   0.,   0.],
       [  0.,   0.,   1.,   0.],
       [  0.,   0.,   0.,   1.],
       [ nan,  nan,  nan,  nan],
       [ nan,  nan,  nan,  nan],
       [  1.,   0.,   0.,   0.],
       [  0.,   1.,   0.,   0.],
       [  0.,   0.,   1.,   0.],
       [  0.,   0.,   0.,   1.],
       [ nan,  nan,  nan,  nan],
       [ nan,  nan,  nan,  nan],
       [  1.,   0.,   0.,   0.],
       [  0.,   1.,   0.,   0.],
       [  0.,   0.,   1.,   0.],
       [  0.,   0.,   0.,   1.],
       [ nan,  nan,  nan,  nan],
       [ nan,  nan,  nan,  nan],
       [  1.,   0.,   0.,   0.],
       [  0.,   1.,   0.,   0.],
       [  0.,   0.,   1.,   0.],
       [  0.,   0.,   0.,   1.],
       [ nan,  nan,  nan,  nan],
       [ nan,  nan,  nan,  nan],
       [  