In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
from collections import defaultdict, Counter
import itertools
import json
from datetime import datetime
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import plot_roc_curve, roc_curve, auc, roc_auc_score, accuracy_score, classification_report
from sklearn.metrics import precision_score,recall_score,f1_score
from sklearn.ensemble import RandomForestClassifier
import pickle
import os

In [2]:
# Directory for the intermediate artifacts this notebook produces (edge list,
# recommendation tables, pickled models). The other cells write to the relative path
# "stash/...", so the directory must be named exactly "stash"; the previous literal
# ended in a stray comma and therefore created a directory called "stash," instead.
cwd = os.getcwd()
stash_path = os.path.join(cwd, 'stash')

# Create the stash directory together with every sub-directory the notebook writes
# into. exist_ok=True keeps the cell safe to re-run.
for stash_subdir in ('network_recommendations',
                     'network_recommendations_with_rf_predictions',
                     'random_forest'):
    os.makedirs(os.path.join(stash_path, stash_subdir), exist_ok=True)

# PLAN
1. Hold out a 30% sample of the users who have more than 30 interactions, to EVALUATE the business trajectories network.
2. Build business trajectories network.
3. For test users, generate lists of recommendations with different max direct neighbors allowed (1, ..., 20).
4. Separately, fit 2 random forest classifiers on user and business features: one that includes the `business_latent_category` feature and one that does not.
5. For each set of business trajectories network recommendations, obtain the random forests' predictions to compare.

**STEP 1: Get test users**

In [3]:
# Load the review dump (one JSON object per line); it is the raw input from which the
# business trajectories network is generated.
reviews_df = pd.read_json("data/review.json", lines=True)

# Hold-out users for evaluating the network: count the reviews per user, keep the users
# with more than 30 of them, and draw a seeded 30% sample of their ids.
user_numinteractions = reviews_df['user_id'].value_counts()
users_above_30_interactions = user_numinteractions.loc[user_numinteractions > 30]
test_users_above_30_ratings = (
    users_above_30_interactions
    .sample(frac=0.3, random_state=1)
    .index
    .tolist()
)

**STEP 2: Build business trajectories network**

In [1]:
# For each user, build the chronologically ordered list of businesses they reviewed
# positively (more than 3 stars).
# The filtered frame is derived in one chain rather than mutated with `inplace=True`:
# dropping a column in place on a boolean-mask slice of `reviews_df` triggers pandas'
# SettingWithCopyWarning.
good_reviews_df = (
    reviews_df.loc[reviews_df['stars'] > 3]
    .drop(columns=['stars'])
    # stable sort, so reviews sharing a timestamp keep their original relative order
    .sort_values(by=['date'], kind='mergesort')
    .reset_index(drop=True)
)

# One list of business ids per user; groupby preserves the (date-sorted) row order
# inside each group, so every list is in visit order.
business_visits_series = good_reviews_df.groupby('user_id')['business_id'].agg(list)

# A trajectory needs at least two visits to contribute a transition.
business_visits_series = business_visits_series[business_visits_series.map(len) > 1]

# Held-out test users must not contribute to the network they are evaluated on.
business_visits_series_train = business_visits_series[
    ~business_visits_series.index.isin(test_users_above_30_ratings)
]

In [None]:
# Generate network

# Every pair of consecutive businesses in a training user's trajectory contributes one
# (source, target) transition. Zipping the list with its one-step shift yields exactly
# those pairs, without relying on an IndexError to detect the end of the list.
# `.items()` replaces `.iteritems()`, which was removed in pandas 2.0.
source = []
target = []
for user_id, business_list in business_visits_series_train.items():
    for current_business, next_business in zip(business_list, business_list[1:]):
        source.append(current_business)
        target.append(next_business)

raw_edges = pd.DataFrame(data={'source': source, 'target': target})

# Edge weight = number of times the transition was observed. `size()` counts the rows of
# each (source, target) group directly; the previous `agg(len).to_frame()` only worked
# when pandas happened to return a Series for a frame with no non-key columns.
weighted_edges = (
    raw_edges.groupby(['source', 'target'])
    .size()
    .reset_index(name='weight')
)

# Make sure the output directory exists before writing the edge list.
os.makedirs("stash", exist_ok=True)
weighted_edges.to_csv("stash/business_network_edgelist.csv")

# NOTE(review): without `create_using`, from_pandas_edgelist builds an undirected
# nx.Graph, so the A->B and B->A rows share one edge and the row processed last sets its
# weight. Confirm whether a directed graph (or summed weights) is intended.
G = nx.from_pandas_edgelist(weighted_edges, 'source', 'target', ['weight'])

# NOTE(review): for each distinct weight value this accumulates the weight itself
# (value * number of edges), not the number of edges having that weight. Confirm that
# this is the intended statistic.
weight_dict = defaultdict(int)
for s, t, w in G.edges(data=True):
    weight_dict[w['weight']] += w['weight']

**STEP 3: Generate business trajectories network recommendations for different values of `max direct neighbors`**

In [None]:
# generate network recommendations for different number of DIRECT neighbors allowed in list

def get_neighbors_from_node(node,graph) -> dict:
    """Map every neighbor of `node` in `graph` to the 'weight' of the connecting edge.

    `graph[node]` must be a mapping from neighbor to an attribute mapping that holds a
    'weight' entry.
    """
    neighbor_weights = {}
    for neighbor, edge_attributes in dict(graph[node]).items():
        neighbor_weights[neighbor] = edge_attributes['weight']
    return neighbor_weights
    

def get_all_neighbors_from_nodes(nodes,graph) -> dict:
    """Collect the neighbors of every node in `nodes` into one neighbor -> weight dict.

    Nodes are processed in the given order. When a neighbor is reachable from several of
    them, the weight reported by the node processed last is the one that is kept.
    """
    merged_neighbors = {}
    for node in nodes:
        merged_neighbors.update(get_neighbors_from_node(node, graph))
    return merged_neighbors


def top_20_businesses(fromNode,max_direct_neighbors,graph) -> list:
    """Return up to 20 distinct business recommendations for the seed business `fromNode`.

    The list starts with at most `max_direct_neighbors` direct neighbors of `fromNode`,
    highest edge weight first. The remaining slots are filled by fanning out one level at
    a time (2nd-degree neighbors, then 3rd-degree, ...), again taking the highest-weighted
    businesses of each level first. Weights are the ones reported by
    `get_all_neighbors_from_nodes` for the level being expanded.

    Behavior notes:
      * each pass of the loop expands the businesses discovered by the previous pass, so
        the search really moves outwards level by level;
      * `fromNode` itself and businesses reached on an earlier level are never
        recommended, so the result contains no duplicates;
      * the search stops once no new business is reachable, so the result can hold fewer
        than 20 entries.

    Args:
        fromNode: seed business; it must be a node of `graph`.
        max_direct_neighbors: upper bound on the number of direct neighbors to include.
        graph: weighted graph supporting `graph[node]` adjacency lookups with a 'weight'
            attribute on every edge.

    Returns:
        Business ids ordered by recommendation rank, best first.
    """
    list_size = 20

    def _best_first(weights):
        # sorted() is stable, so businesses with equal weight keep their dict order
        return sorted(weights, key=weights.get, reverse=True)

    # direct neighbors and their weights; a self-loop on the seed is not a recommendation
    direct_neighbors = get_all_neighbors_from_nodes(nodes=[fromNode], graph=graph)
    direct_neighbors.pop(fromNode, None)

    # take at most max_direct_neighbors direct neighbors (never more than the list holds)
    direct_quota = max(0, min(max_direct_neighbors, list_size))
    recommendations = _best_first(direct_neighbors)[:direct_quota]

    # Every business reached so far. Direct neighbors that were not selected stay
    # excluded, so later levels cannot exceed the direct-neighbor cap.
    visited = {fromNode}
    visited.update(direct_neighbors)

    # fan out level by level until the list is full or nothing new is reachable
    frontier = list(direct_neighbors)
    while len(recommendations) < list_size and frontier:
        level_neighbors = get_all_neighbors_from_nodes(nodes=frontier, graph=graph)
        unseen = {business: weight
                  for business, weight in level_neighbors.items()
                  if business not in visited}
        recommendations.extend(_best_first(unseen)[:list_size - len(recommendations)])
        visited.update(unseen)
        frontier = list(unseen)
    return recommendations


def generate_network_recommendations(max_direct_neighbors, business_visits_series, graph)-> pd.DataFrame:
    """Build the network's recommendations for every user and save them as CSV.

    For each user the first visited business that is a node of `graph` serves as the
    seed for `top_20_businesses`. Users without any visited business in the graph are
    skipped, so a seed is never carried over from the previous user.

    Args:
        max_direct_neighbors: cap on direct neighbors, forwarded to `top_20_businesses`.
        business_visits_series: Series indexed by user whose values are the ordered lists
            of visited businesses.
        graph: business network; must support `graph.nodes` membership tests.

    Returns:
        DataFrame with one row per (user, recommended business) and the columns 'user',
        'recommendations', 'network_rank' (1 = best) and 'business_net_pred' (always 1).
        The same frame is written below 'stash/network_recommendations/'.
    """
    columns = ['user', 'recommendations', 'network_rank', 'business_net_pred']
    rows = []

    denom = len(business_visits_series)
    print(f'number of users {denom}')
    # `.items()` replaces `.iteritems()`, which was removed in pandas 2.0
    for count, (user, businesses) in enumerate(business_visits_series.items()):
        if count % 1000 == 0:
            print(f'progress: {count/denom *100}%')
            print(f'users completed {count} out of {denom}')

        first_valid_business = next(
            (business for business in businesses if business in graph.nodes), None)
        if first_valid_business is None:
            # none of this user's businesses is part of the network: nothing to seed from
            continue

        recommendations = top_20_businesses(first_valid_business, max_direct_neighbors, graph)
        # the list may hold fewer than 20 entries, so ranks follow its actual length
        rows.extend((user, business, rank, 1)
                    for rank, business in enumerate(recommendations, start=1))

    # build the frame once from all rows instead of concatenating one frame per user
    all_user_network_recommendations = pd.DataFrame(rows, columns=columns)

    output_dir = 'stash/network_recommendations'
    os.makedirs(output_dir, exist_ok=True)
    all_user_network_recommendations.to_csv(f'{output_dir}/network_recommendations_{max_direct_neighbors}_max_direct_neighbors.csv')
    return all_user_network_recommendations

In [None]:
# Trajectories of the held-out test users only
business_visits_series_test = business_visits_series.loc[
    business_visits_series.index.isin(test_users_above_30_ratings)
]

# One recommendations frame per direct-neighbor cap (1 through 20), keyed by the cap
recommendations_dict = {}
for i in range(20):
    print(f'Generating recommendations for test users with constraint {i+1} max direct neighbors')
    recommendations_dict[i+1] = generate_network_recommendations(
        max_direct_neighbors=i+1,
        business_visits_series=business_visits_series_test,
        graph=G,
    )

**STEP 4: Build random forest classifiers**

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.manifold import TSNE
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [5]:
# read in training data
data = pd.read_csv("data/training_data.csv", index_col=0)

# get users who have rated over 30 businesses
groups = data.groupby('user_id')
group_df = groups.nunique()
# One `.loc` selection followed by a non-mutating rename: the previous chained indexing
# plus `rename(..., inplace=True)` modified a derived slice in place, which pandas
# reports as SettingWithCopyWarning.
group_df_filtered = (
    group_df.loc[group_df["business_id"] > 30, ["business_id", "stars"]]
    .rename(columns={"business_id": "num_businesses_rated"})
)

# training data with users who have rated over 30 businesses
# The inner join on user_id keeps only rows of users present in group_df_filtered; the
# helper column it brings along is dropped again together with the unused features.
data_process = (
    data.join(group_df_filtered.drop(columns=['stars']), on="user_id", how="inner")
    .drop(columns=['business_longitude', 'business_latitude', 'business_review_count',
                   'user_first_elite_year', 'user_last_elite_year',
                   'user_begin_yelping_year', 'user_begin_yelping_month',
                   'user_begin_yelping_day', 'num_businesses_rated'])
)

# held-out test users are excluded from the classifiers' training data
data_process = data_process[~data_process['user_id'].isin(test_users_above_30_ratings)]

# two feature sets: with and without the business_latent_category column
training_data_with_net_info = data_process.drop(columns=['business_id', 'user_id'])
training_data_no_net_info = data_process.drop(columns=['business_id', 'user_id', 'business_latent_category'])

# training data with business categories feature
y = training_data_with_net_info['stars'].values
X = training_data_with_net_info.drop(columns=['stars']).values
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.30,
                                                    random_state=101)
# training data without business categories feature
# (same test_size and random_state as the split above)
y_noNet = training_data_no_net_info['stars'].values
X_noNet = training_data_no_net_info.drop(columns=['stars']).values
X_noNet_train, X_noNet_test, y_noNet_train, y_noNet_test = train_test_split(X_noNet, y_noNet,
                                                                            test_size=0.30,
                                                                            random_state=101)

  mask |= (ar1 == a)


In [None]:
def _load_or_fit_forest(model_path, features, labels):
    """Return the classifier cached at `model_path`, or fit, cache and return a new one.

    Earlier, each model was fitted and then immediately replaced by `pickle.load` of the
    cache file. That discarded the fit and failed with FileNotFoundError on a fresh run,
    when the file did not exist yet. Here the forest is only trained when no cache
    exists.

    NOTE: delete the cache file to force retraining after the training data changes.
    """
    if os.path.isfile(model_path):
        # NOTE: unpickling can execute arbitrary code; only load caches that this
        # notebook wrote itself.
        with open(model_path, 'rb') as model_file:
            return pickle.load(model_file)

    # fixed seed so that a refit reproduces the same forest
    model = RandomForestClassifier(n_estimators=100, random_state=101)
    model.fit(features, labels)

    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    with open(model_path, 'wb') as model_file:
        pickle.dump(model, model_file)
    return model


# forest trained on the feature set that includes business_latent_category
file1 = 'stash/random_forest/rfc_net.pkl'
rfc_net = _load_or_fit_forest(file1, X_train, y_train)

# forest trained on the feature set without business_latent_category
file2 = 'stash/random_forest/rfc_noNet.pkl'
rfc_NoNet = _load_or_fit_forest(file2, X_noNet_train, y_noNet_train)

In [15]:
# Score rfc_net on its held-out split, then print the confusion matrix followed by the
# per-class precision/recall report.
rfc_net_pred = rfc_net.predict(X_test)
for summarize in (confusion_matrix, classification_report):
    print(summarize(y_test, rfc_net_pred))

[[ 97888  38465]
 [ 21577 240678]]
              precision    recall  f1-score   support

           0       0.82      0.72      0.77    136353
           1       0.86      0.92      0.89    262255

    accuracy                           0.85    398608
   macro avg       0.84      0.82      0.83    398608
weighted avg       0.85      0.85      0.85    398608



In [16]:
# Score rfc_NoNet on its held-out split, then print the confusion matrix followed by the
# per-class precision/recall report.
rfc_NoNet_pred = rfc_NoNet.predict(X_noNet_test)
for summarize in (confusion_matrix, classification_report):
    print(summarize(y_noNet_test, rfc_NoNet_pred))

[[ 96616  39737]
 [ 22732 239523]]
              precision    recall  f1-score   support

           0       0.81      0.71      0.76    136353
           1       0.86      0.91      0.88    262255

    accuracy                           0.84    398608
   macro avg       0.83      0.81      0.82    398608
weighted avg       0.84      0.84      0.84    398608



**STEP 5: Get the random forests' predictions for each set of the business trajectories network's recommendations.**

In [None]:
# get random forest predictions for the set of recommendations from the business trajectories network


# Column order used when selecting business and user features from the training data.
# Each list starts with the id column, followed by that entity's feature columns.
business_cols = [
    'business_id',
    'business_stars',
    'business_state',
    'business_latent_category',
    'business_days_open_weekly',
    'average_open_time',
    'average_close_time',
    'business_city',
]

user_cols = [
    'user_id',
    'user_average_stars',
    'user_compliment_cool',
    'user_compliment_cute',
    'user_compliment_funny',
    'user_compliment_hot',
    'user_compliment_list',
    'user_compliment_more',
    'user_compliment_note',
    'user_compliment_photos',
    'user_compliment_plain',
    'user_compliment_profile',
    'user_compliment_writer',
    'user_cool',
    'user_fans',
    'user_funny',
    'user_review_count',
    'user_useful',
    'user_num_elite_years',
    'user_friends_count',
]

In [None]:
# Rows of the training table that belong to the held-out test users
data_process_net_test = data.loc[data['user_id'].isin(test_users_above_30_ratings)]

# Distinct user rows and distinct business rows of the full training table.
# NOTE(review): duplicates are dropped across all selected columns, so an id whose
# feature values differ between rows keeps several rows. Confirm that every id ends up
# with exactly one row here.
user_training_data = data.loc[:, user_cols].drop_duplicates()
business_training_data = data.loc[:, business_cols].drop_duplicates()

In [None]:
# for each set of recommendations (max_direct_neighbors), get random forest predictions for each
# user business (recommendation) pair

# get all businesses each user liked; this will be useful for finding network recommendation matches
user_visited_businesses_df = good_reviews_df[['user_id','business_id']].groupby('user_id').agg(set)
user_visited_businesses_df = user_visited_businesses_df.reset_index()
# user -> set of liked businesses, built in one pass instead of row by row with iterrows()
user_visited_businesses_map = dict(zip(user_visited_businesses_df['user_id'],
                                       user_visited_businesses_df['business_id']))

# Loop-invariant pieces, computed once instead of on each of the 20 iterations:
# feature columns in the order the forests were trained on (label column removed) ...
training_data_with_net_info_cols = [col for col in training_data_with_net_info.columns if col != 'stars']
training_data_no_net_info_cols = [col for col in training_data_no_net_info.columns if col != 'stars']
# ... the metadata lookup tables indexed by their id ...
business_lookup = business_training_data.set_index('business_id')
user_lookup = user_training_data.set_index('user_id')
# ... and the output directory, which must exist before to_csv() is called.
output_dir = 'stash/network_recommendations_with_rf_predictions'
os.makedirs(output_dir, exist_ok=True)

for i in range(20):
    # for each user business pair in recommendations, get corresponding user business metadata
    print(f'{i+1} max direct neighbors')
    network_recommendations_data = (
        recommendations_dict[i+1]
        .join(business_lookup, on='recommendations')
        .join(user_lookup, on='user')
        .dropna()                    # pairs without business or user metadata cannot be scored
        .reset_index(drop=True)
    )
    if network_recommendations_data.empty:
        # the classifiers reject an empty feature matrix; nothing to score for this setting
        print(f'no recommendations with metadata for {i+1} max direct neighbors; skipping')
        continue

    # get appropriate subset of columns needed for random forest predictions
    net_test_with_lat_cat = network_recommendations_data[training_data_with_net_info_cols]
    net_test_no_lat_cat = network_recommendations_data[training_data_no_net_info_cols]

    # random forest predictions for all user business pairs in business trajectories' network recommendations
    rfc_net_recommend_pred = rfc_net.predict(net_test_with_lat_cat.values)
    rfc_net_recommend_pred_proba = rfc_net.predict_proba(net_test_with_lat_cat.values)
    rfc_NoNet_recommend_pred = rfc_NoNet.predict(net_test_no_lat_cat.values)
    rfc_NoNet_recommend_pred_proba = rfc_NoNet.predict_proba(net_test_no_lat_cat.values)

    # Attach the predictions as new columns. The prediction arrays are in the same row
    # order as network_recommendations_data, so a positional assign is equivalent to the
    # former chain of four index merges.
    recommendations = (
        network_recommendations_data[['user','recommendations','network_rank','business_net_pred']]
        .assign(rf_net_pred=rfc_net_recommend_pred,
                rf_net_prob_like=rfc_net_recommend_pred_proba[:,1],
                rf_NoNet_pred=rfc_NoNet_recommend_pred,
                rf_NoNet_prob_like=rfc_NoNet_recommend_pred_proba[:,1])
    )

    # append true_like column to recommendations: 1 when the user positively reviewed the
    # recommended business, else 0 (users without any liked business count as 0)
    recommendations['true_like'] = [
        1 if business in user_visited_businesses_map.get(user, ()) else 0
        for user, business in zip(recommendations['user'], recommendations['recommendations'])
    ]
    recommendations.to_csv(f'{output_dir}/network_recommendations_with_rf_predictions_{i+1}_max_direct_neighbors.csv')
