In [1]:
import pandas as pd
import numpy as np

In [2]:


# Load the full ratings table, then draw two smaller samples (5% and 1%) for
# quicker experimentation. The samples use fixed seeds so that re-running the
# notebook from a fresh kernel selects the same rows every time.
SAMPLE_SEED = 42  # arbitrary value; only its being fixed matters
full_ratings = pd.read_csv('./data/ratings.csv')
r_samp = full_ratings.sample(frac=.05, random_state=SAMPLE_SEED)
# A different seed keeps the 1% sample an independent draw from the 5% one.
r_samp_1 = full_ratings.sample(frac=.01, random_state=SAMPLE_SEED + 1)

In [3]:
# Preview the first rows of the 5% ratings sample to see its columns.
r_samp.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
21433726,139288,51255,2.0,1245779928
5483789,35545,135534,4.0,1492030357
10241600,66381,1276,3.5,1229440800
8433812,54939,3471,4.0,1299834194
13465667,87100,161956,2.5,1529861828


In [4]:
# Read the remaining CSV tables in one pass: user tags, genome tag names,
# movie metadata and genome relevance scores (same files, same order).
tags, g_tags, movies, g_scores = map(
    pd.read_csv,
    (
        './data/tags.csv',
        './data/genome-tags.csv',
        './data/movies.csv',
        './data/genome-scores.csv',
    ),
)

In [5]:
# Join the 1% ratings sample with the movies table on movieId so each rating
# row also carries the movie's title and genres.
m_samp = pd.merge(left=r_samp_1, right=movies, on='movieId')
# Show both shapes: matching row counts suggest the join neither dropped nor
# duplicated ratings. Then preview the un-merged sample.
for frame in (m_samp, r_samp_1):
    print(frame.shape)
r_samp_1.head(n=5)

(250001, 6)
(250001, 4)


Unnamed: 0,userId,movieId,rating,timestamp
20673562,134427,1597,4.0,919874491
18023910,116760,1047,1.0,851118321
21690921,141005,387,3.0,998681226
10692043,69434,2571,1.0,955909829
5704018,36917,4226,4.0,1354571926


In [7]:
# Random rows from the merged frame: ratings alongside movie titles and genres.
m_samp.sample(n=5)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
65813,158108,1090,3.0,1001122463,Platoon (1986),Drama|War
19384,15459,1407,3.5,1087335311,Scream (1996),Comedy|Horror|Mystery|Thriller
131493,76604,2628,0.5,1171337355,Star Wars: Episode I - The Phantom Menace (1999),Action|Adventure|Sci-Fi
69301,156397,1233,3.5,1124387378,"Boot, Das (Boat, The) (1981)",Action|Drama|War
186787,79381,7022,4.0,1372455365,Battle Royale (Batoru rowaiaru) (2000),Action|Drama|Horror|Thriller


In [8]:
# Random rows from the user tags frame: each row is one tag a user applied to
# a movie, together with a timestamp.
tags.sample(n=5)

Unnamed: 0,userId,movieId,tag,timestamp
125760,6550,80574,revenge,1527134999
485618,51383,97921,drama,1492207158
434242,42657,91658,rape,1522872897
550336,62199,7894,Sergio Leone,1239047736
1017464,149526,33794,ninja,1158375843


In [6]:
# Group the user tags by movie to study how many tags each movie received.
# Figures noted by the author for tags per movie:
#   max = 6200
#   mean = 25
#   std = 108
gp_tags = tags.groupby('movieId')
# How many tags remain for movieId 260 once duplicate tag texts are removed?
movie_260_tags = gp_tags.get_group(260).tag
movie_260_tags.drop_duplicates().count()


766

In [9]:
# The author notes up to 6180 tags on a single movie: movieId 260,
# Star Wars: A New Hope.
# Select it by movieId rather than by a hard-coded row position, which would
# silently return a different movie if movies.csv were reordered or filtered.
movies.loc[movies.movieId == 260].iloc[0]

movieId                                          260
title      Star Wars: Episode IV - A New Hope (1977)
genres                       Action|Adventure|Sci-Fi
Name: 257, dtype: object

In [10]:
# Random rows from the genome tags frame: each row pairs a tagId with its tag text.
# TODO (author's note): de-duplicate tags.
g_tags.sample(n=5)

Unnamed: 0,tagId,tag
917,918,silent
94,95,awful
770,771,passionate
33,34,africa
811,812,pretentious


In [11]:
# First look at the genome scores frame: each row holds a relevance value for
# a movieId/tagId pair. The comprehensive tag frame is built from this table.
g_scores.sample(n=5)

Unnamed: 0,movieId,tagId,relevance
5514453,5438,790,0.06275
3897976,3855,737,0.035
100215,90,952,0.03625
8807552,31104,129,0.09775
11621191,79029,536,0.13225


In [12]:
# The genome scores frame is large: 15,584,448 rows.
g_scores.shape

(15584448, 3)

In [7]:
# Attach the tag text to every genome score row by joining on tagId.
g_score_m = pd.merge(left=g_scores, right=g_tags, on='tagId')
g_score_m.sample(n=10)

Unnamed: 0,movieId,tagId,relevance,tag
14101486,56474,1021,0.013,terminal illness
1278773,27735,93,0.0085,awesome
2184623,1933,159,0.07525,boxing
3044766,5881,221,0.5225,clones
8562011,70708,620,0.21375,mad scientist
3215839,84137,233,0.005,comic book adaption
5071179,780,368,0.16425,existentialism
10643634,5968,771,0.1365,passionate
5241153,5439,380,0.037,fascism
9043476,31123,655,0.06575,mission from god


In [8]:
# Order the merged scores by movieId and renumber the rows, then group by movie.
# (Author's note: sorting was added in the hope of avoiding an odd error seen later.)
g_score_m = g_score_m.sort_values(by='movieId').reset_index(level=0, drop=True)
gs_gp = g_score_m.groupby(g_score_m.movieId)
# Pull the groups for movieId 1 and movieId 2 to use as small working samples.
samp, samp2 = (gs_gp.get_group(movie_id) for movie_id in (1, 2))
type(gs_gp)

pandas.core.groupby.groupby.DataFrameGroupBy

In [10]:
# Average each numeric column within every movie, then average those
# per-movie means across all movies.
per_movie_means = gs_gp.mean()
per_movie_means.mean()

tagId        564.500000
relevance      0.116368
dtype: float64

In [11]:
import sys
import datetime
import time

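# Helpers that flag which scored (genome) tags users actually applied to each movie
# and collect the timestamps of those user tags.
# The row-wise timestamp lookup takes a VERY long time to run: every row triggers a
# `get_group` lookup on the tags groupby. If anyone can make it faster, let me know.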

def tag_tstamp2(row, tag_groups=None):
    """Collect the timestamps at which users applied ``row.tag`` to ``row.movieId``.

    Parameters
    ----------
    row : row-like object
        Must expose ``movieId``, ``tag`` and ``user_tag`` attributes, e.g. a
        row handed over by ``DataFrame.apply(..., axis=1)``.
    tag_groups : groupby of the user tags frame keyed by movieId, optional
        Source of the user tags. Defaults to the notebook-level ``gp_tags``.

    Returns
    -------
    numpy.ndarray or float
        If ``row.user_tag`` is truthy: the set of matching timestamps wrapped
        by ``np.array`` (a 0-d object array holding the set, exactly what this
        function has always returned). Otherwise ``np.nan``.

    Raises
    ------
    KeyError
        If ``row.user_tag`` is truthy but the movie has no group in
        ``tag_groups``.
    """
    if not row.user_tag:
        return np.nan

    if tag_groups is None:
        tag_groups = gp_tags

    # Fetch the movie's user tags once. The lookup used to be done twice per
    # row (once for the timestamps, once for the mask), doubling the dominant
    # cost of this function.
    movie_tags = tag_groups.get_group(row.movieId)
    stamps = set(movie_tags.timestamp[movie_tags.tag == row.tag])

    # NOTE(review): np.array(<set>) does not build an array of timestamps; it
    # wraps the set in a 0-d object array. Kept as-is so existing outputs and
    # saved CSVs do not change.
    return np.array(stamps)
        
        

# Return a movie's highest-relevance scored tags and add the columns used by
# later steps to record whether users applied each tag and when.
def top_g(x, n_top=100, tag_groups=None):
    """Keep the ``n_top`` most relevant tag rows of one movie's score group.

    Parameters
    ----------
    x : pandas.DataFrame
        Rows for a single movie with at least ``movieId``, ``tag`` and
        ``relevance`` columns (one group of the movie-grouped score frame).
    n_top : int, optional
        Number of rows to keep after sorting by relevance (default 100).
    tag_groups : groupby of the user tags frame keyed by movieId, optional
        Defaults to the notebook-level ``gp_tags``.

    Returns
    -------
    pandas.DataFrame
        The top rows sorted by descending relevance, plus two columns:
        ``user_tag`` (True where the tag text is among the tags users applied
        to this movie) and ``ut_tstamp`` (all NaN, a placeholder to be filled
        in later).
    """
    if tag_groups is None:
        tag_groups = gp_tags

    try:
        applied = tag_groups.get_group(x.movieId.iloc[0]).tag
    except (KeyError, IndexError):
        # KeyError: no user ever tagged this movie. IndexError: x has no rows.
        # Only the lookup is guarded, so unrelated errors are no longer hidden
        # the way the previous bare `except:` hid them.
        applied = []

    y = x.assign(user_tag=x.tag.isin(applied), ut_tstamp=np.nan)

    return y.sort_values(by='relevance', ascending=False).iloc[0:n_top]
    
        

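# Scratch calls on sample groups, left commented out. The lines that build gs_gp2 with
# gs_gp.apply(top_g) appear to be what produced the gs_gp2 frame used by the cells below - TODO confirm.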


#s2 = top_g(gs_gp2.get_group(1))
#tag_tstamp(s2)
#s2.head(5)
#gp_tags.get_group(84).tag.head(5)
#gp_tags.get_group(56)
#gs_gp2 = gs_gp.apply(top_g)
#gs_gp2 = gs_gp2.reset_index(level = [0,1], drop = True)
#gs_gp2 = gs_gp2.groupby(gs_gp2.movieId)
#gs_gp2.head(1)

In [32]:
# Group the top-tag frame by movie, then preview the rows for movieId 1.
gs_gp3 = gs_gp2.groupby(gs_gp2.movieId)
s1 = gs_gp3.get_group(1)
s1.head(n=5)



Unnamed: 0,movieId,tagId,relevance,tag,user_tag,ut_tstamp
0,1,1036,0.99925,toys,True,
1,1,244,0.99875,computer animation,True,
2,1,786,0.99575,pixar animation,False,
3,1,589,0.98575,kids and family,True,
4,1,64,0.98425,animation,True,


In [50]:
# Keep the first 90,000 rows as a smaller working slice, then report the size
# of the full top-tag frame.
import time
s2 = gs_gp2.iloc[:90000]
gs_gp2.shape

(1381600, 6)

In [52]:
# Time the row-wise timestamp lookup over the whole top-tag frame.
t1 = time.time()

# Use bracket assignment for the column: attribute-style assignment
# (`df.col = ...`) only updates a column that already exists and otherwise
# just sets a plain Python attribute on the frame without creating a column.
gs_gp2['ut_tstamp'] = gs_gp2.apply(tag_tstamp2, axis=1)

# Elapsed wall-clock seconds.
print(time.time() - t1)



187.21835923194885


Unnamed: 0,movieId,tagId,relevance,tag,user_tag,ut_tstamp
400,5,901,0.96025,sequel,True,{1137102574}
401,5,451,0.954,good sequel,False,
402,5,383,0.94825,father daughter relationship,False,
403,5,902,0.941,sequels,False,
404,5,808,0.91275,pregnancy,True,"{1147409794, 1528513373}"
405,5,650,0.90625,midlife crisis,True,{1528513373}
406,5,374,0.86875,family,True,"{1147409796, 1137102574}"
407,5,230,0.83925,comedy,False,
408,5,768,0.759,parenthood,False,
409,5,388,0.753,feel-good,False,


In [54]:
# Persist the top-tag frame, including the collected timestamps, as CSV.
gs_gp2.to_csv(path_or_buf='updated_tags.csv')

In [79]:
# Author's note: the tags will have to be adjusted accordingly; an index error
# is still occurring.
# gs_gp3 = gs_gp2.groupby(gs_gp2.movieId)
# Mean and standard deviation of the per-movie count of non-null timestamp entries.
ts_counts = gs_gp3.ut_tstamp.count()
print(ts_counts.mean(), ts_counts.std())

5.841488129704691 6.034446055506499


In [None]:
# Inspect the column dtypes of the working slice s2.
s2.dtypes

In [None]:
# Rebuild a single data frame from a grouped frame by processing every group
# and concatenating the pieces. A useful snippet to have around.
def c_cat_df(grp_df):
    """Run ``tag_tstamp`` on each group of ``grp_df`` and stack the groups.

    Parameters
    ----------
    grp_df : pandas groupby object
        Must provide ``.groups`` and ``.get_group(key)``.

    Returns
    -------
    pandas.DataFrame
        All groups concatenated in the iteration order of ``grp_df.groups``;
        an empty frame when there are no groups.
    """
    pieces = []

    for key in grp_df.groups:
        df = grp_df.get_group(key)
        # NOTE(review): `tag_tstamp` is not defined in the visible part of this
        # notebook (only `tag_tstamp2` is, and it takes a row, not a frame).
        # Presumably it fills timestamps on `df` in place - confirm before running.
        tag_tstamp(df)
        pieces.append(df)

    # Concatenate once at the end. Growing a frame with pd.concat inside the
    # loop re-copies everything accumulated so far on every iteration, which
    # is what made the earlier version so slow.
    if not pieces:
        return pd.DataFrame()
    return pd.concat(pieces)

# Stitch every group of gs_gp2 back into one frame and preview the result.
tot = c_cat_df(grp_df=gs_gp2)

tot.head(n=5)


In [12]:
# Second, improved version of top_g: also fills in the user-tag timestamps.
def top_g2(x, n_top=25):
    """Keep one movie's ``n_top`` most relevant tags, with user-tag timestamps.

    Parameters
    ----------
    x : pandas.DataFrame
        Rows for a single movie with at least ``movieId``, ``tag`` and
        ``relevance`` columns (one group of the movie-grouped score frame).
    n_top : int, optional
        Number of rows to keep after sorting by relevance (default 25).

    Returns
    -------
    pandas.DataFrame
        The top rows sorted by descending relevance, plus ``user_tag`` (True
        where the tag text is among the tags users applied to this movie,
        looked up in the notebook-level ``gp_tags``) and ``ut_tstamp`` (the
        value ``tag_tstamp2`` returns for each row).
    """
    try:
        applied = gp_tags.get_group(x.movieId.iloc[0]).tag
    except (KeyError, IndexError):
        # KeyError: no user ever tagged this movie. IndexError: x has no rows.
        # Only the lookup is guarded, so unrelated errors are no longer hidden
        # the way the previous bare `except:` hid them.
        applied = []

    flagged = x.assign(user_tag=x.tag.isin(applied))

    # Rank and trim BEFORE the timestamp lookup. The lookup is the slow part
    # and used to run on every tag row of the movie, only for all rows outside
    # the top n_top to be thrown away. The kept rows and values are the same.
    top = flagged.sort_values(by='relevance', ascending=False).iloc[0:n_top]

    if top.empty:
        # A row-wise apply over zero rows does not yield a usable column.
        return top.assign(ut_tstamp=np.nan)

    return top.assign(ut_tstamp=top.apply(tag_tstamp2, axis=1))



In [13]:
# Build the per-movie top-tag frame with top_g2 and drop the two index levels
# left by the grouped apply.
gs_gp2 = gs_gp.apply(top_g2).reset_index(level=[0, 1], drop=True)
# Note: from here on gs_gp2 names a groupby object, no longer a DataFrame.
gs_gp2 = gs_gp2.groupby(gs_gp2.movieId)
# First five rows of every movie group.
gs_gp2.head(n=5)

Unnamed: 0,movieId,tagId,relevance,tag,user_tag,ut_tstamp
0,1,1036,0.99925,toys,True,"{1494862083, 1246932614, 1449840522, 126653429..."
1,1,244,0.99875,computer animation,True,"{1274197762, 1494862082, 1514919558, 152011546..."
2,1,786,0.99575,pixar animation,False,
3,1,589,0.98575,kids and family,True,{1367943150}
4,1,64,0.98425,animation,True,"{1470206081, 1329527940, 1147410824, 150497972..."
25,2,29,0.97600,adventure,True,"{1368842814, 1560892764, 1555875358, 1446853492}"
26,2,204,0.95900,children,True,{1249829577}
27,2,377,0.95175,fantasy,True,"{1154126755, 1328614307, 1325585318, 125038771..."
28,2,588,0.95000,kids,True,{1446853547}
29,2,951,0.93200,special effects,False,


In [15]:
# Spread (standard deviation) of the per-movie column means across movies.
top_tag_means = gs_gp2.mean()
top_tag_means.std()

tagId        74.115519
relevance     0.105364
user_tag      0.145055
dtype: float64

In [77]:
# Per-movie non-null counts for every column (result not displayed).
gs_gp2.count()
# Mean and standard deviation of the per-movie count of non-null timestamp entries.
tstamp_counts = gs_gp2.ut_tstamp.count()
print(tstamp_counts.mean(), tstamp_counts.std())

5.057397220613781 4.852642056251426


In [82]:
# Rebuild the top-tag frame with top_g2, drop the two index levels left by the
# grouped apply, regroup by movie and summarise the timestamp counts.
gs_gp4 = gs_gp.apply(top_g2).reset_index(level=[0, 1], drop=True)
gs_gp4 = gs_gp4.groupby(gs_gp4.movieId)
gp4_tstamp_counts = gs_gp4.ut_tstamp.count()
print(gp4_tstamp_counts.mean(), gp4_tstamp_counts.std())

4.125796178343949 3.6262740718281576


In [84]:
# Recompute the top_g2 frame, drop the two index levels left by the grouped
# apply, and save the flat result as CSV.
gs_gp4 = gs_gp.apply(top_g2).reset_index(level=[0, 1], drop=True)
gs_gp4.to_csv(path_or_buf='tag_upd_25.csv')

In [86]:
# Variant of the top-tag selection that keeps the 50 most relevant tags per
# movie and fills in the user-tag timestamps.
def top_g_50(x, n_top=50):
    """Keep one movie's ``n_top`` most relevant tags, with user-tag timestamps.

    Parameters
    ----------
    x : pandas.DataFrame
        Rows for a single movie with at least ``movieId``, ``tag`` and
        ``relevance`` columns (one group of the movie-grouped score frame).
    n_top : int, optional
        Number of rows to keep after sorting by relevance (default 50).

    Returns
    -------
    pandas.DataFrame
        The top rows sorted by descending relevance, plus ``user_tag`` (True
        where the tag text is among the tags users applied to this movie,
        looked up in the notebook-level ``gp_tags``) and ``ut_tstamp`` (the
        value ``tag_tstamp2`` returns for each row).
    """
    try:
        applied = gp_tags.get_group(x.movieId.iloc[0]).tag
    except (KeyError, IndexError):
        # KeyError: no user ever tagged this movie. IndexError: x has no rows.
        # Only the lookup is guarded, so unrelated errors are no longer hidden
        # the way the previous bare `except:` hid them.
        applied = []

    flagged = x.assign(user_tag=x.tag.isin(applied))

    # Rank and trim BEFORE the timestamp lookup. The lookup is the slow part
    # and used to run on every tag row of the movie, only for all rows outside
    # the top n_top to be thrown away. The kept rows and values are the same.
    top = flagged.sort_values(by='relevance', ascending=False).iloc[0:n_top]

    if top.empty:
        # A row-wise apply over zero rows does not yield a usable column.
        return top.assign(ut_tstamp=np.nan)

    return top.assign(ut_tstamp=top.apply(tag_tstamp2, axis=1))

# Build the top-50 frame, drop the two index levels left by the grouped apply,
# and save the flat result as CSV.
gs_gp_50 = gs_gp.apply(top_g_50).reset_index(level=[0, 1], drop=True)
gs_gp_50.to_csv(path_or_buf='tag_upd_50.csv')
