# Data Pre-Processing

In [52]:
# Imports

import pandas as pd
import numpy as np
import networkx as nx
from sklearn.metrics import pairwise_distances

import warnings
warnings.filterwarnings('ignore')

### Step 1. Make sure source and target have no shared ID's:

In [53]:
# Read the CSV file into a DataFrame
df = pd.read_csv('wang-tripadvisor/raw.csv', delimiter = ";")

# Update the 'Source' and 'Target' columns by adding 'A' and 'B' to the beginning of each value
df['Source'] = 'a' + df['Source'].astype(str)
df['Target'] = 'b' + df['Target'].astype(str)

# Rename source column to include #
df.rename(columns = {'Source': '#Source'})

df.head(10)

Unnamed: 0,Source,Target,Rating,Timestamp
0,a1,b1,4,1230332400
1,a2,b1,5,1230073200
2,a3,b1,5,1229122800
3,a4,b1,5,1228777200
4,a5,b1,4,1228345200
5,a6,b1,5,1228345200
6,a7,b1,5,1228345200
7,a8,b1,4,1228172400
8,a9,b1,4,1228172400
9,a10,b1,4,1227135600


### Step 2. Handle parallel edges:

We can now load the data into a graph:

In [54]:
# We create two versions of the graph to check the amount of parallel edges

# No parallel edges
B = nx.Graph()

# Parallel edges
B_multi = nx.MultiGraph()

edges = [(row['Source'], row['Target'], {'Rating': row['Rating'], 'Timestamp': row['Timestamp']}) for _, row in df.iterrows()]
B.add_edges_from(edges)
B_multi.add_edges_from(edges)

print('Graph without parallel edges:', B)
print('Graph with parallel edges:', B_multi)
print('Difference in amount of edges:', B_multi.number_of_edges() - B.number_of_edges())

Graph without parallel edges: Graph with 147075 nodes and 175655 edges
Graph with parallel edges: MultiGraph with 147075 nodes and 175765 edges
Difference in amount of edges: 110


It seems we have 110 parallel edges.

We need to remove these. However, it would be careless to simply choose one at random since edges carry weights corresponding to the rating the user gave the hotel. It makes sense to take the newest edge using the timestamp attribute, since this represents the users most recent visit to the hotel.

We will once again return to the pandas dataframe to solve this issue.

In [55]:
# Sort DataFrame by 'Timestamp' descending
df.sort_values(by = 'Timestamp', ascending = False, inplace = True)

# Drop duplicates, keeping only the first occurrence (which by sorting is the newest)
df.drop_duplicates(subset = ['Source', 'Target'], keep = 'first', inplace = True)

# Reset the index of the modified DataFrame
df.reset_index(drop = True, inplace = True)

# Check that amount of DataFrame rows is equal to amount of edges in G
B.number_of_edges() == df.shape[0]

True

In [56]:
# Re-add # before source column name
df.rename(columns = {'Source': '#Source'}, inplace = True)
df.head()

Unnamed: 0,#Source,Target,Rating,Timestamp
0,a107950,b1202,4,1231714800
1,a11220,b114,4,1231714800
2,a107159,b1192,4,1231628400
3,a42532,b398,1,1231628400
4,a58447,b603,1,1231628400


In [57]:
# Save the modified DataFrame to a new CSV file
# Un-comment this to re-create file
df.to_csv('data_transformed.csv', index = False, sep = ';')