## Importing Packages 

### Network Analytics

In the following code, we create an undirected graph representing relationships between users based on the video games they've reviewed. Each vertex in the graph corresponds to a unique user, and an edge is added between two vertices if they have reviewed the same game. This graph allows us to analyze relationships between users and potentially discover interesting patterns, such as clusters of users who review similar games, highly connected users, or correlations between graph properties and other features of the dataframe.

The graph is used to find, for each user who has only one review, the most similar other user. The most similar user must have reviewed at least 2 different games.
The function <b>find_most_similar_user_by_connections</b> takes a user ID and a graph as inputs, and returns the ID of the user in the graph that is most similar to the input user, or None when no neighbor qualifies. The function collects the input user's neighbors (users who reviewed at least one game in common with the input user), keeps only the candidates that have the same y value as the input user and that are linked to other users through a minimum of two different games, and then picks the candidate that shares the most neighbors with the input user. If several candidates are tied, the one encountered first is returned.

In [1]:
import ast
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import igraph as ig
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import pickle
import csv
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import concurrent.futures

## Reading DataFrame

In [2]:
# Compact dtypes keep the in-memory DataFrame small.
data_types = dict(
    review_id='int32',
    y='bool',
    app_id='int32',
    user_id='int32',
    reviews='int16',
)
# Load only the columns that have a declared dtype.
final_df = pd.read_csv(
    "../data/final_df.csv",
    usecols=list(data_types),
    dtype=data_types,
)
final_df.head()

Unnamed: 0,review_id,y,app_id,user_id,reviews
0,1,False,304390,1098,1
1,5,True,306130,17622,4
2,6,True,238960,33969,1
3,7,False,730,24431,2
4,8,True,255710,125959,3


In [3]:
n_total_samples = 100000

# Share of negative reviews (y == False) among all reviews, computed from the
# labels themselves rather than from integer lookups into value_counts().
# NOTE: despite its name this is negatives / total (not negatives / positives),
# so that multiplying it by the sample size preserves the class balance of the
# full dataset.
neg_pos_ratio = final_df['y'].eq(False).mean()

# Number of False and True rows to draw; together they add up to n_total_samples.
n_false_samples = int(n_total_samples * neg_pos_ratio)
n_true_samples = n_total_samples - n_false_samples

# Draw each class separately with a fixed seed so the sample is reproducible.
false_samples = final_df[final_df['y'] == False].sample(n_false_samples, random_state=70)
true_samples = final_df[final_df['y'] == True].sample(n_true_samples, random_state=70)

final_sample_df = pd.concat([false_samples, true_samples])

# Shuffle so the two classes are interleaved; from here on final_df is the sample.
final_df = final_sample_df.sample(frac=1, random_state=70)

In [4]:
# Row count of the sampled DataFrame.
final_df.shape[0]

100000

In [5]:
# Non-distributed version
def add_edges_optimized(graph, common_user_df, final_df, vertex_ids=None):
    """Add one edge per user pair in ``common_user_df`` and annotate the edges.

    Args:
        graph: Graph-like object. ``add_edges`` is called once with all
            vertex-index pairs, then attributes are assigned through
            ``graph.es[...]``, i.e. to the graph's whole edge sequence
            (presumably the graph has no edges beforehand).
        common_user_df: DataFrame with the columns ``user_id_x``,
            ``user_id_y``, ``app_id``, ``y_x`` and ``review_id_x``. Rows are
            grouped by ``(user_id_x, user_id_y)``; each group becomes one edge.
        final_df: Unused; kept so existing calls keep working.
        vertex_ids: Optional mapping from user id to vertex index. When
            omitted, the module-level ``user_id_to_vertex_id`` is used.

    Edge attributes written:
        common_game_ids: list of every ``app_id`` in the pair's group.
        y, user_id, review_id: the ``y_x`` / ``user_id_x`` / ``review_id_x``
            value of the group's first row, i.e. the row that also supplies
            ``common_game_ids[0]``.

    Raises:
        KeyError: if a user id of ``common_user_df`` is not in the mapping.
    """
    if vertex_ids is None:
        vertex_ids = user_id_to_vertex_id

    source_columns = ('y_x', 'user_id_x', 'review_id_x')
    edges = []
    edge_common_games = []
    edge_columns = {col: [] for col in source_columns}

    grouped_common_user_df = common_user_df.groupby(['user_id_x', 'user_id_y'])

    for (user_id_source, user_id_target), group in grouped_common_user_df:
        edges.append((vertex_ids[user_id_source], vertex_ids[user_id_target]))
        edge_common_games.append(group['app_id'].tolist())

        # Store exactly one value per edge. Extending with every row of the
        # group would make these lists longer than the edge list as soon as a
        # pair shares more than one game, shifting the attributes of all
        # following edges.
        for col in source_columns:
            edge_columns[col].append(group[col].tolist()[0])

    graph.add_edges(edges)
    graph.es["common_game_ids"] = edge_common_games
    for col, values in edge_columns.items():
        graph.es[col[:-2]] = values  # Remove the '_x' suffix for the edge attribute names

In [12]:
# Distinct user ids; the position of an id in this array is its vertex index.
unique_user_ids = final_df['user_id'].unique()

# One row (as a dict) per user; for users with several rows the last row wins.
user_id_to_user_info = dict(zip(final_df['user_id'], final_df.to_dict('records')))

# user id -> vertex index
user_id_to_vertex_id = dict(zip(unique_user_ids, range(len(unique_user_ids))))

In [7]:
# Undirected graph with one (still unconnected) vertex per distinct user.
G = ig.Graph(n=len(unique_user_ids), directed=False)

In [8]:
# Attach each user's record to its vertex, in vertex-index order.
G.vs["user_info"] = list(map(user_id_to_user_info.__getitem__, unique_user_ids))

In [9]:
review_columns = ['y', 'user_id', 'review_id']

# Promote selected fields of every vertex's user_info record to vertex attributes.
vertex_infos = G.vs["user_info"]
for col in review_columns:
    G.vs[col] = [info[col] for info in vertex_infos]

app_user_pairs = final_df[['app_id', 'user_id', 'y', 'review_id']]

# Self-join on app_id: every resulting row pairs two reviews of the same game.
common_user_df = pd.merge(app_user_pairs, app_user_pairs, on='app_id')

# Keep each unordered user pair once and drop rows pairing a user with itself.
is_ordered_pair = common_user_df['user_id_x'] < common_user_df['user_id_y']
common_user_df = common_user_df[is_ordered_pair]

In [10]:
# Connect every pair of users that reviewed a common game.
add_edges_optimized(graph=G, common_user_df=common_user_df, final_df=final_df)

In [None]:
# Persist the graph so later sessions can skip the expensive edge construction.
graph_path = "../data/user_graph.pickle"
with open(graph_path, "wb") as graph_file:
    pickle.dump(G, graph_file)

In [6]:
# Restore the graph written by the dump cell.
# NOTE: unpickling can execute arbitrary code, so only load files produced by
# this notebook.
graph_path = "../data/user_graph.pickle"
with open(graph_path, "rb") as graph_file:
    G = pickle.load(graph_file)

In [7]:
# Copy the 'reviews' column onto the graph vertices (useful when the loaded
# graph does not carry a 'reviews' vertex attribute yet).
# user_id -> 'reviews' value; for users with several rows the last row wins.
reviews_by_user = final_df.set_index('user_id')['reviews']
user_reviews = reviews_by_user.to_dict()

# Vertices whose user_id does not occur in final_df are left untouched.
for v in G.vs:
    user_id = v["user_id"]
    if user_id not in user_reviews:
        continue
    v["reviews"] = user_reviews[user_id]

In [8]:
def find_most_similar_user_by_connections(user_id, graph):
    """Return the neighbour of ``user_id`` that shares the most neighbours with it.

    A neighbour is a candidate only if its ``y`` attribute equals the query
    user's ``y`` and its ``reviews`` attribute is greater than 1. The
    candidate must share more than one neighbour with the query user to be
    returned; on ties the candidate met first in ``graph.neighbors`` order wins.

    NOTE: a later cell defines a function with the same name; whichever
    definition is executed last is the one in effect.

    Args:
        user_id: Id of the query user; translated to a vertex index through
            the module-level ``user_id_to_vertex_id`` mapping.
        graph: Graph whose vertices carry ``y``, ``reviews`` and ``user_id``.

    Returns:
        The ``user_id`` attribute of the best candidate, or ``None`` when no
        neighbour qualifies.

    Raises:
        KeyError: if ``user_id`` is not in ``user_id_to_vertex_id``.
    """
    user_vertex_id = user_id_to_vertex_id[user_id]
    user_y = graph.vs[user_vertex_id]['y']

    # The query user's neighbourhood does not change inside the loop, so fetch
    # it once and build the set once.
    neighbor_ids = graph.neighbors(user_vertex_id)
    user_neighbors = set(neighbor_ids)

    max_connections = 1
    most_connected_neighbor = None

    for neighbor_id in neighbor_ids:
        neighbor_vertex = graph.vs[neighbor_id]
        if neighbor_vertex['y'] != user_y:
            continue
        if neighbor_vertex['reviews'] <= 1:
            continue

        connections = len(user_neighbors.intersection(graph.neighbors(neighbor_id)))

        if connections > max_connections:
            max_connections = connections
            most_connected_neighbor = neighbor_vertex

    # Compare against None explicitly instead of relying on the truthiness of
    # the vertex object.
    if most_connected_neighbor is None:
        return None
    return most_connected_neighbor["user_id"]

In [28]:
def find_most_similar_user_by_connections(user_id, graph, min_games=2):
    """Return the neighbour of ``user_id`` that shares the most neighbours with it.

    A neighbour is a candidate only if its ``y`` attribute equals the query
    user's ``y`` and the edges incident to it carry at least ``min_games``
    distinct ids in their ``common_game_ids`` attribute. Among the candidates
    the one with the largest number of neighbours in common with the query
    user is chosen; on ties the candidate met first while iterating wins.

    Args:
        user_id: Id of the query user; translated to a vertex index through
            the module-level ``user_id_to_vertex_id`` mapping.
        graph: Graph whose vertices carry ``y`` and ``user_id`` and whose
            edges carry ``common_game_ids`` (a list of game ids per edge).
        min_games: Minimum number of distinct games a candidate must be
            linked through. Defaults to 2.

    Returns:
        The ``user_id`` attribute of the best candidate, or ``None`` when no
        neighbour qualifies.

    Raises:
        KeyError: if ``user_id`` is not in ``user_id_to_vertex_id``.
    """
    user_vertex_id = user_id_to_vertex_id[user_id]
    user_y = graph.vs[user_vertex_id]['y']

    # -1 so that a qualifying candidate is accepted even with zero shared
    # neighbours.
    max_connections = -1
    most_connected_neighbor = None

    user_neighbors = set(graph.neighbors(user_vertex_id))

    for neighbor_id in user_neighbors:
        neighbor_vertex = graph.vs[neighbor_id]
        if neighbor_vertex['y'] != user_y:
            continue

        # Collect every game id on every incident edge. Looking only at the
        # first id of each edge would under-count neighbours whose edges carry
        # several shared games.
        neighbor_edges = graph.es[graph.incident(neighbor_id)]
        unique_app_ids = {
            app_id
            for edge in neighbor_edges
            for app_id in edge["common_game_ids"]
        }
        if len(unique_app_ids) < min_games:
            continue

        neighbor_neighbors = set(graph.neighbors(neighbor_id))
        connections = len(user_neighbors.intersection(neighbor_neighbors))

        if connections > max_connections:
            max_connections = connections
            most_connected_neighbor = neighbor_vertex

    # Compare against None explicitly instead of relying on the truthiness of
    # the vertex object.
    if most_connected_neighbor is None:
        return None
    return most_connected_neighbor["user_id"]

In [18]:
# Number of rows per user_id.
value_counts = final_df['user_id'].value_counts()

# True for the user ids that occur exactly once.
mask = value_counts == 1

# user_id column restricted to the rows of those single-review users.
single_review_ids = value_counts[mask].index
one_review_users = final_df.loc[final_df['user_id'].isin(single_review_ids), 'user_id']

In [None]:
# single-review user id -> id of its most similar user (None when there is none)
dict_most_similar_user = {}
count = 0

for count, user in enumerate(one_review_users, start=1):
    dict_most_similar_user[user] = find_most_similar_user_by_connections(user, G)
    # Progress indicator: print the running total every 1000 users.
    if count % 1000 == 0:
        print(count)

In [None]:
# Write the mapping as two comma-separated columns, header line first.
with open('../data/similar_users.csv', 'w') as f:
    f.write("%s, %s\n" % ("user_id", "similar_user"))
    f.writelines("%s, %s\n" % pair for pair in dict_most_similar_user.items())

In [None]:
# Render the whole user graph inline.
graph_plot = ig.plot(G)
display(graph_plot)