# Reddit Hyperlinks Dataset Exploration

This notebook explores the Reddit Hyperlinks dataset from SNAP.

## Dataset Structure

The dataset contains hyperlink information between subreddits. The data is in a tab-separated format with the following columns:

*   **SOURCE_SUBREDDIT**: The subreddit where the link originates.
*   **TARGET_SUBREDDIT**: The subreddit where the link ends.
*   **POST_ID**: The post in the source subreddit that starts the link.
*   **TIMESTAMP**: The time of the post.
*   **POST_LABEL**: A label indicating if the source post is explicitly negative towards the target post. (-1 for negative, 1 for neutral/positive).
*   **POST_PROPERTIES**: A vector of text properties of the source post, including sentiment scores and linguistic features.

In [11]:
import pandas as pd
import requests
import io
import os

# Define local data path
data_dir = 'data'
file_path = os.path.join(data_dir, 'soc-redditHyperlinks-body.tsv')

# Create data directory if it doesn't exist
if not os.path.exists(data_dir):
    os.makedirs(data_dir)

# Check if the dataset already exists locally
if os.path.exists(file_path):
    print(f"Loading data from local file: {file_path}")
    # The file was saved with a tab separator, but also with an index.
    # We load it and then drop the extra column.
    df = pd.read_csv(file_path, sep='	')
    if 'Unnamed: 0' in df.columns:
        df = df.drop(columns=['Unnamed: 0']).drop(index=0)
else:
    print("Downloading data from the web...")
    # Download the dataset
    url = "https://snap.stanford.edu/data/soc-redditHyperlinks-body.tsv"
    response = requests.get(url)

    # Read the TSV content into a pandas DataFrame
    # We need to tell pandas there is no header
    df = pd.read_csv(io.StringIO(response.text), sep='	', header=None)

    # Rename columns for easier access
    df.columns = ['SOURCE_SUBREDDIT', 'TARGET_SUBREDDIT', 'POST_ID', 'TIMESTAMP', 'POST_LABEL', 'POST_PROPERTIES']

    # Save the DataFrame to a local file for future use
    df.to_csv(file_path, sep='	', index=True)
    print(f"Data saved to {file_path}")

# If the columns were loaded from the saved file, they might have been read as a single string.
# Let's ensure the columns are set correctly.
df.columns = ['SOURCE_SUBREDDIT', 'TARGET_SUBREDDIT', 'POST_ID', 'TIMESTAMP', 'POST_LABEL', 'POST_PROPERTIES']

df.info()

df.head()

Loading data from local file: data/soc-redditHyperlinks-body.tsv


  df = pd.read_csv(file_path, sep='	')


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 286561 entries, 1 to 286561
Data columns (total 6 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   SOURCE_SUBREDDIT  286561 non-null  object
 1   TARGET_SUBREDDIT  286561 non-null  object
 2   POST_ID           286561 non-null  object
 3   TIMESTAMP         286561 non-null  object
 4   POST_LABEL        286561 non-null  object
 5   POST_PROPERTIES   286561 non-null  object
dtypes: object(6)
memory usage: 13.1+ MB


Unnamed: 0,SOURCE_SUBREDDIT,TARGET_SUBREDDIT,POST_ID,TIMESTAMP,POST_LABEL,POST_PROPERTIES
1,leagueoflegends,teamredditteams,1u4nrps,2013-12-31 16:39:58,1,"345.0,298.0,0.75652173913,0.0173913043478,0.08..."
2,theredlion,soccer,1u4qkd,2013-12-31 18:18:37,-1,"101.0,98.0,0.742574257426,0.019801980198,0.049..."
3,inlandempire,bikela,1u4qlzs,2014-01-01 14:54:35,1,"85.0,85.0,0.752941176471,0.0235294117647,0.082..."
4,nfl,cfb,1u4sjvs,2013-12-31 17:37:55,1,"1124.0,949.0,0.772241992883,0.0017793594306,0...."
5,playmygame,gamedev,1u4w5ss,2014-01-01 02:51:13,1,"715.0,622.0,0.777622377622,0.00699300699301,0...."


In [16]:
len(df[df["POST_LABEL"] == -1]) / len(df) * 100

4.010315430222536

In [10]:

# Define local data path for title data
file_path_title = os.path.join(data_dir, 'soc-redditHyperlinks-title.tsv')

# Check if the title dataset already exists locally
if os.path.exists(file_path_title):
    print(f"Loading title data from local file: {file_path_title}")
    # The file was saved with a tab separator, but also with an index.
    # We load it and then drop the extra column.
    df_title = pd.read_csv(file_path_title, sep='	')
    if 'Unnamed: 0' in df_title.columns:
        df_title = df_title.drop(columns=['Unnamed: 0'])
else:
    print("Downloading title data from the web...")
    # Download the dataset
    url_title = "https://snap.stanford.edu/data/soc-redditHyperlinks-title.tsv"
    response_title = requests.get(url_title)

    # Read the TSV content into a pandas DataFrame
    # We need to tell pandas there is no header
    df_title = pd.read_csv(io.StringIO(response_title.text), sep='	', header=None)

    # Rename columns for easier access
    df_title.columns = ['SOURCE_SUBREDDIT', 'TARGET_SUBREDDIT', 'POST_ID', 'TIMESTAMP', 'POST_LABEL', 'POST_PROPERTIES']

    # Save the DataFrame to a local file for future use
    df_title.to_csv(file_path_title, sep='	', index=True)
    print(f"Data saved to {file_path_title}")

# If the columns were loaded from the saved file, they might have been read as a single string.
# Let's ensure the columns are set correctly.
df_title.columns = ['SOURCE_SUBREDDIT', 'TARGET_SUBREDDIT', 'POST_ID', 'TIMESTAMP', 'POST_LABEL', 'POST_PROPERTIES']

df_title.info()

df_title.head()

Loading title data from local file: data/soc-redditHyperlinks-title.tsv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 571927 entries, 0 to 571926
Data columns (total 6 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   SOURCE_SUBREDDIT  571927 non-null  object
 1   TARGET_SUBREDDIT  571927 non-null  object
 2   POST_ID           571927 non-null  object
 3   TIMESTAMP         571927 non-null  object
 4   POST_LABEL        571927 non-null  int64 
 5   POST_PROPERTIES   571927 non-null  object
dtypes: int64(1), object(5)
memory usage: 26.2+ MB


Unnamed: 0,SOURCE_SUBREDDIT,TARGET_SUBREDDIT,POST_ID,TIMESTAMP,POST_LABEL,POST_PROPERTIES
0,rddtgaming,rddtrust,1u4pzzs,2013-12-31 16:39:18,1,"25.0,23.0,0.76,0.0,0.44,0.12,0.12,4.0,4.0,0.0,..."
1,xboxone,battlefield_4,1u4tmfs,2013-12-31 17:59:11,1,"100.0,88.0,0.78,0.02,0.08,0.13,0.07,16.0,16.0,..."
2,ps4,battlefield_4,1u4tmos,2013-12-31 17:59:40,1,"100.0,88.0,0.78,0.02,0.08,0.13,0.07,16.0,16.0,..."
3,fitnesscirclejerk,leangains,1u50xfs,2013-12-31 19:01:56,1,"49.0,43.0,0.775510204082,0.0,0.265306122449,0...."
4,fitnesscirclejerk,lifeprotips,1u51nps,2013-12-31 21:02:28,1,"14.0,14.0,0.785714285714,0.0,0.428571428571,0...."


In [15]:
len(df_title[df_title["POST_LABEL"] == -1]) / len(df_title) * 100

10.690175494424988

In [3]:

# Concatenate the body and title dataframes
df = pd.concat([df, df_title], ignore_index=True)


In [4]:
# Display the first few rows of the DataFrame
df.head()

Unnamed: 0,SOURCE_SUBREDDIT,TARGET_SUBREDDIT,POST_ID,TIMESTAMP,POST_LABEL,POST_PROPERTIES
0,leagueoflegends,teamredditteams,1u4nrps,2013-12-31 16:39:58,1,"345.0,298.0,0.75652173913,0.0173913043478,0.08..."
1,theredlion,soccer,1u4qkd,2013-12-31 18:18:37,-1,"101.0,98.0,0.742574257426,0.019801980198,0.049..."
2,inlandempire,bikela,1u4qlzs,2014-01-01 14:54:35,1,"85.0,85.0,0.752941176471,0.0235294117647,0.082..."
3,nfl,cfb,1u4sjvs,2013-12-31 17:37:55,1,"1124.0,949.0,0.772241992883,0.0017793594306,0...."
4,playmygame,gamedev,1u4w5ss,2014-01-01 02:51:13,1,"715.0,622.0,0.777622377622,0.00699300699301,0...."


In [5]:
# Get information about the DataFrame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 858488 entries, 0 to 858487
Data columns (total 6 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   SOURCE_SUBREDDIT  858488 non-null  object
 1   TARGET_SUBREDDIT  858488 non-null  object
 2   POST_ID           858488 non-null  object
 3   TIMESTAMP         858488 non-null  object
 4   POST_LABEL        858488 non-null  object
 5   POST_PROPERTIES   858488 non-null  object
dtypes: object(6)
memory usage: 39.3+ MB


In [6]:
# Get summary statistics of the 'POST_LABEL' column
df['POST_LABEL'].describe()

count     858488
unique         4
top            1
freq      654785
Name: POST_LABEL, dtype: int64

## Reddit Subreddit Embeddings

This dataset contains embeddings for subreddits. The embeddings are 300-dimensional vectors.

In [7]:

# Define local data path for subreddit embeddings
embeddings_file_path = os.path.join(data_dir, 'web-redditEmbeddings-subreddits.csv')

# Check if the dataset already exists locally
if os.path.exists(embeddings_file_path):
    print(f"Loading subreddit embeddings from local file: {embeddings_file_path}")
    # Load the CSV without a header
    df_embed_raw = pd.read_csv(embeddings_file_path, header=None)
else:
    print("Downloading subreddit embeddings from the web...")
    # Download the dataset
    url_embed = "https://snap.stanford.edu/data/web-redditEmbeddings-subreddits.csv"
    response_embed = requests.get(url_embed)

    # Read the CSV content into a pandas DataFrame without a header
    df_embed_raw = pd.read_csv(io.StringIO(response_embed.text), header=None)

    # Save the DataFrame to a local file for future use
    df_embed_raw.to_csv(embeddings_file_path, index=False, header=False)
    print(f"Subreddit embeddings saved to {embeddings_file_path}")

# The first column is the subreddit name, the rest are the embedding
# Create a new DataFrame with the correct structure
df_embed = pd.DataFrame()
df_embed['SUBREDDIT'] = df_embed_raw[0]
df_embed['SUBREDDIT_EMBEDDING'] = df_embed_raw.iloc[:, 1:].values.tolist()


# Display the first few rows of the embeddings DataFrame
df_embed.head()


Loading subreddit embeddings from local file: data/web-redditEmbeddings-subreddits.csv


Unnamed: 0,SUBREDDIT,SUBREDDIT_EMBEDDING
0,spiders,"[0.158972, 0.285813, 0.226329, -0.183338, -0.1..."
1,askreddit,"[-0.499114, 0.323983, -0.424809, -0.222705, -0..."
2,globaloffensivetrade,"[-0.023145, -1.199374, 1.661484, -1.025296, 1...."
3,fireteams,"[2.492506, -2.529917, -0.448484, -3.543441, -0..."
4,funny,"[-0.81937, -0.865261, 0.301753, 0.018787, 0.20..."


## Merging Datasets

Now, let's merge the hyperlink dataset with the subreddit embeddings. We will merge based on the source and target subreddits. This will allow us to analyze the properties of the subreddits involved in the hyperlinks.

First, we need to rename the column in the embeddings dataframe to be able to merge. Then we will perform two merges:
1.  Add embeddings for the `SOURCE_SUBREDDIT`.
2.  Add embeddings for the `TARGET_SUBREDDIT`.

In [8]:

# Merge with source subreddit
df_merged = pd.merge(
    df,
    df_embed,
    left_on='SOURCE_SUBREDDIT',
    right_on='SUBREDDIT',
    how='left'
)

# Rename the embedding columns for clarity
df_merged.rename(columns={'SUBREDDIT_EMBEDDING': 'SOURCE_SUBREDDIT_EMBEDDING'}, inplace=True)
df_merged.drop(columns=['SUBREDDIT'], inplace=True)


# Merge with target subreddit
df_merged = pd.merge(
    df_merged,
    df_embed,
    left_on='TARGET_SUBREDDIT',
    right_on='SUBREDDIT',
    how='left'
)

# Rename the embedding columns for clarity
df_merged.rename(columns={'SUBREDDIT_EMBEDDING': 'TARGET_SUBREDDIT_EMBEDDING'}, inplace=True)
df_merged.drop(columns=['SUBREDDIT'], inplace=True)


# Display the first few rows of the merged DataFrame
df_merged.head()


Unnamed: 0,SOURCE_SUBREDDIT,TARGET_SUBREDDIT,POST_ID,TIMESTAMP,POST_LABEL,POST_PROPERTIES,SOURCE_SUBREDDIT_EMBEDDING,TARGET_SUBREDDIT_EMBEDDING
0,leagueoflegends,teamredditteams,1u4nrps,2013-12-31 16:39:58,1,"345.0,298.0,0.75652173913,0.0173913043478,0.08...","[-2.785298, -0.166391, 1.592624, -1.269829, 2....","[-1.402933, -1.115654, 0.921651, -0.711803, 1...."
1,theredlion,soccer,1u4qkd,2013-12-31 18:18:37,-1,"101.0,98.0,0.742574257426,0.019801980198,0.049...","[-0.239977, -0.246483, 0.059679, -0.019758, -0...","[-2.798847, -1.243099, 0.513319, 0.162192, 1.3..."
2,inlandempire,bikela,1u4qlzs,2014-01-01 14:54:35,1,"85.0,85.0,0.752941176471,0.0235294117647,0.082...","[0.070123, -0.001389, 0.143414, -0.466989, -0....","[0.15803, 0.035603, 0.197221, -0.423409, 0.192..."
3,nfl,cfb,1u4sjvs,2013-12-31 17:37:55,1,"1124.0,949.0,0.772241992883,0.0017793594306,0....","[-0.413744, 0.135593, -0.375919, -2.155808, 0....","[-0.54229, 1.074772, 0.236502, -1.676222, -0.0..."
4,playmygame,gamedev,1u4w5ss,2014-01-01 02:51:13,1,"715.0,622.0,0.777622377622,0.00699300699301,0....","[0.006368, -0.135252, 0.458533, -0.080084, -0....","[-0.033949, 0.150966, 1.039643, -0.114099, -0...."
