In [59]:
import numpy as np
import pandas as pd
import turicreate as tc

In [60]:
def drop_nan_cols(df, threshold=0.8):
    """Drop columns whose number of missing values exceeds a share of the rows.

    Columns named in ``important_meta_cols`` are always kept, no matter how
    many of their values are missing.

    Args:
        df: pandas DataFrame to clean. It is not modified.
        threshold: fraction of the row count; a column is dropped when its
            NaN count is strictly greater than ``len(df) * threshold``.

    Returns:
        A new DataFrame without the dropped columns.
    """
    important_meta_cols = {'also_buy', 'also_view', 'asin', 'brand', 'description',
                           'feature', 'main_cat', 'similar_item', 'title'}

    max_missing = len(df) * threshold
    # Count the NaNs of every column in one pass instead of one scan per column.
    missing_counts = df.isna().sum()
    drop_cols = [
        col
        for col, n_missing in missing_counts.items()
        if n_missing > max_missing and col not in important_meta_cols
    ]

    # The placeholder has to be filled with .format(); passing the threshold as
    # a second print() argument left a literal '{0}' in the message.
    print('\nremoving columns with more than {0} missing...'.format(threshold))
    print(drop_cols)
    return df.drop(columns=drop_cols)

def remove_long_titles(df, max_length=300):
    """Keep only the rows whose ``title`` is shorter than ``max_length``.

    The returned frame carries a helper column ``val_length`` holding the
    character length of each title. Rows whose title is missing have a NaN
    length, fail the comparison, and are therefore removed as well.

    Args:
        df: pandas DataFrame with a string column ``title``. It is not
            modified.
        max_length: exclusive upper bound on the title length (default 300).

    Returns:
        A new DataFrame with the short-title rows and the ``val_length``
        column.
    """
    old_len = len(df)
    # assign() builds a new frame, so the caller's DataFrame does not silently
    # gain a 'val_length' column.
    df = df.assign(val_length=df['title'].str.len())
    print('val length: ', df['val_length'] )
    df = df[df['val_length'] < max_length]
    print('===========================================================')
    print('Removed ' + str(old_len-len(df)) + ' rows')
    print('===========================================================')
    return df

def preprocess_sf(sf):
    """Clean one dataset and hand it back as an SFrame.

    The SFrame is converted to a pandas DataFrame, its size and per-column
    missing-value counts are printed, sparse columns are removed with
    ``drop_nan_cols`` and, when a ``title`` column is present, rows are
    filtered with ``remove_long_titles``.

    Args:
        sf: object exposing ``to_dataframe()`` (a turicreate SFrame).

    Returns:
        A ``tc.SFrame`` built from the cleaned DataFrame.
    """
    frame = sf.to_dataframe()
    print('\nds length: ', len(frame))

    missing_per_column = frame.isna().sum()
    print('\nisna:')
    print(missing_per_column)

    frame = drop_nan_cols(frame)
    # Title filtering only applies to datasets that still have a title column.
    if 'title' in frame:
        frame = remove_long_titles(frame)

    return tc.SFrame(frame)

def get_mutual_columns(meta_paths):
    """Load and preprocess every dataset and find the columns they all share.

    Args:
        meta_paths: iterable of paths to line-delimited JSON files.

    Returns:
        A tuple ``(sf_list, mutual_cols)``: the preprocessed SFrames in input
        order, and the list of column names present in every one of them,
        ordered as in the first dataset. Both are empty when ``meta_paths``
        is empty.
    """
    sf_list = []
    for path in meta_paths:
        sf = tc.SFrame.read_json(path, orient='lines')
        sf_list.append(preprocess_sf(sf))

    if not sf_list:
        return sf_list, []

    # Intersect over the loaded frames themselves. Keying the column lists by
    # a name sliced out of the path made different files collide (for example
    # './a.json' and './b.json' both produced an empty key), which silently
    # left datasets out of the intersection.
    first_cols = sf_list[0].column_names()
    shared = set(first_cols)
    for sf in sf_list[1:]:
        shared &= set(sf.column_names())

    # Walk the first dataset's columns so the result order is deterministic.
    mutual_cols = [col for col in first_cols if col in shared]
    return sf_list, mutual_cols

def remove_cols(sf, important_cols, mutual_cols):
    """Strip every column that is not both mutual and important.

    Args:
        sf: object exposing ``column_names()`` and ``remove_column(name)``
            (a turicreate SFrame).
        important_cols: names of the columns worth keeping.
        mutual_cols: names of the columns shared by all datasets.

    Returns:
        The frame obtained by removing, one at a time, each column missing
        from either list; ``sf`` itself when nothing has to be removed.
    """
    unwanted = [
        name
        for name in sf.column_names()
        if not (name in mutual_cols and name in important_cols)
    ]
    for name in unwanted:
        sf = sf.remove_column(name)
    return sf

In [69]:
# Example inputs: the review files to combine and the review fields to keep.
# For the metadata files, use instead:
# meta_paths = ['meta_Video_Games.json', 'meta_Software.json', 'meta_Movies_and_TV.json']
paths = [
    'ds_5/Video_Games_5.json',
    'ds_5/Software_5.json',
    'ds_5/Movies_and_TV_5.json',
]
important_cols = [
    'asin',
    'overall',
    'reviewText',
    'reviewerID',
    'summary',
    'unixReviewTime',
]

In [66]:
# Load and preprocess every dataset in `paths`, and collect the column names
# that all of them share.
sf_list, mutual_cols = get_mutual_columns(paths)

------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[dict]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------



ds length:  497577

isna:
asin                   0
image             493943
overall                0
reviewText           158
reviewTime             0
reviewerID             0
reviewerName          76
style             208340
summary              109
unixReviewTime         0
verified               0
vote              389784
dtype: int64

removing columns with more then {0} missing... 0.8
['image']


------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[dict]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------



ds length:  12805

isna:
asin                  0
image             12734
overall               0
reviewText            1
reviewTime            0
reviewerID            0
reviewerName          9
style              5644
summary               6
unixReviewTime        0
verified              0
vote               8903
dtype: int64

removing columns with more then {0} missing... 0.8
['image']


------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[dict]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------



ds length:  3410019

isna:
asin                    0
image             3403048
overall                 0
reviewText           1581
reviewTime              0
reviewerID              0
reviewerName           82
style              174207
summary               640
unixReviewTime          0
verified                0
vote              2768010
dtype: int64

removing columns with more then {0} missing... 0.8
['image', 'vote']


In [67]:
# Name the loaded frames; sf_list follows the order of `paths`.
games_5, sw_5, movies_5 = sf_list[0], sf_list[1], sf_list[2]

In [70]:
# Reduce each frame to the columns that are both shared and important.
games_5, sw_5, movies_5 = (
    remove_cols(frame, important_cols, mutual_cols)
    for frame in (games_5, sw_5, movies_5)
)

In [71]:
# Stack the three review sets into one frame: games, then software, then movies.
games_sw_movies = games_5.append(sw_5).append(movies_5)
# games_sw_movies.export_json('games_sw_movies.json')

In [72]:
# Display the combined frame as the cell output.
games_sw_movies

asin,overall,reviewText,reviewerID,summary,unixReviewTime
700026657,5.0,"This game is a bit hard to get the hang of, but ...",A1HP7NVNPFMA4N,but when you do it's great. ...,1445040000
700026657,4.0,I played it a while but it was alright. The s ...,A1JGAP0185YJI6,"But in spite of that it was fun, I liked it ...",1437955200
700026657,3.0,ok game.,A1YJWEXHQBWK2B,Three Stars,1424649600
700026657,2.0,"found the game a bit too complicated, not what I ...",A2204E1TH211HT,Two Stars,1424390400
700026657,5.0,"great game, I love it and have played it since its ...",A2RF5B5H74JLPE,love this game,1419465600
700026657,4.0,i liked a lot some time that i haven't play a ...,A11V6ZJ2FVQY1D,Anno 2070,1415836800
700026657,1.0,"I'm an avid gamer, but Anno 2070 is an INSUL ...",A1KXJ1ELZIU05C,Avoid This Game - Filled with Bugs ...,1406937600
700026657,5.0,I bought this game thinking it would be ...,A1WK5I4874S3O2,A very good game balance of skill with depth of ...,1393804800
700026657,5.0,I have played the old anno 1701 AND 1503. ...,AV969NA4CBP10,Anno 2070 more like anno 1701 ...,1392940800
700026657,4.0,"I liked it and had fun with it, played for a ...",A1EO9BFUHTGWKZ,Pretty fun,1372291200


In [None]:
# Count the reviewers that each pair of datasets has in common.
dataset_pairs = (
    (games_5, sw_5),
    (movies_5, sw_5),
    (games_5, movies_5),
)
for left, right in dataset_pairs:
    print(len(np.intersect1d(left['reviewerID'], right['reviewerID'])))

420
