<a href="https://colab.research.google.com/github/JacobH140/century-of-college-football/blob/main/college_football_make_custom_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Fetch the 1924-2024 games CSV from the project's GitHub repository and save it
# in the working directory under the file name that make_custom_dataset reads.
!wget -O all_cfb_games_1924_2024.csv https://github.com/JacobH140/century-of-college-football/raw/main/all_cfb_games_1924_2024.csv


--2024-11-21 23:01:43--  https://github.com/JacobH140/century-of-college-football/raw/main/all_cfb_games_1924_2024.csv
Resolving github.com (github.com)... 140.82.112.4
Connecting to github.com (github.com)|140.82.112.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/JacobH140/century-of-college-football/main/all_cfb_games_1924_2024.csv [following]
--2024-11-21 23:01:44--  https://raw.githubusercontent.com/JacobH140/century-of-college-football/main/all_cfb_games_1924_2024.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3485791 (3.3M) [text/plain]
Saving to: ‘all_cfb_games_1924_2024.csv’


2024-11-21 23:01:44 (31.6 MB/s) - ‘all_cfb_games_1924_2024.csv’ saved [3485791/3485791]



# Specify these

In [2]:
##### EDIT THESE #####
# Inclusive range of years to keep, plus any years inside that range to drop.
start_year, end_year = 2009, 2024
exclude_years = (2020,)

# Whether to keep only teams continuously present in the data: present every year,
# or present every month. Set these to True e.g. when the downstream method is a
# dynamic community detection algorithm that assumes a constant node set V.
evergreen_teams_yearly = True
evergreen_teams_monthly = True
# The monthly restriction is only valid on top of the yearly one.
assert not (evergreen_teams_monthly and not evergreen_teams_yearly), "Enabling evergreen_teams_monthly requires enabling evergreen_teams_yearly"

# Optional segmentation of the data into year-long or month-long snapshots
# (two teams are connected at snapshot i iff they played during that snapshot interval).
# Use None to leave the data unsegmented.
snapshot_length = "month"
assert snapshot_length in (None, "year", "month")


######################

# Then run this

In [7]:
import pandas as pd
import networkx as nx
from datetime import datetime
import calendar
from statistics import mean, median
import numpy as np
from scipy import sparse

def calculate_network_stats(df, timestamp, all_nodes=None):
    """
    Calculate network statistics and the adjacency matrix for a given snapshot.

    Args:
        df: DataFrame containing edges for this snapshot (columns 'Team1' and 'Team2')
        timestamp: Timestamp for this snapshot; copied into the returned stats
        all_nodes: Set of all nodes that should be present (for evergreen case).
            When given, these nodes are added even if they have no edge in `df`.

    Returns:
        A tuple ``(stats, A, nodes)`` where ``stats`` is a dict of summary
        statistics, ``A`` is the sparse adjacency matrix and ``nodes`` is the
        sorted list of node labels. Row/column ``i`` of ``A`` corresponds to
        ``nodes[i]``.
    """
    G = nx.Graph()

    if all_nodes:
        G.add_nodes_from(all_nodes)

    edges = list(zip(df['Team1'], df['Team2']))
    G.add_edges_from(edges)

    # Build the matrix in the same (sorted) order as the labels we return.
    # Without an explicit nodelist the matrix follows the graph's insertion
    # order, which would not match the sorted labels.
    node_order = sorted(G.nodes())
    A = nx.adjacency_matrix(G, nodelist=node_order)

    num_nodes = G.number_of_nodes()
    stats = {
        'timestamp': timestamp,
        # Counts rows of `df`, so a pairing that occurs twice in the snapshot
        # is counted twice here even though the graph holds it as one edge.
        'num_edges': len(edges),
        'num_nodes': num_nodes,
        'num_components': nx.number_connected_components(G),
        'clustering_coefficient': nx.average_clustering(G),
        'density': nx.density(G),
        'avg_degree': sum(degree for _, degree in G.degree()) / num_nodes if num_nodes > 0 else 0,
        # The diameter is only finite for a connected graph.
        'diameter': max(nx.eccentricity(G).values()) if nx.is_connected(G) else float('inf')
    }
    return stats, A, node_order

def _teams_in(games):
    """Return the set of team names appearing in the Team1 or Team2 column of `games`."""
    return set(pd.concat([games['Team1'], games['Team2']]).unique())


def make_custom_dataset(start_year, end_year, exclude_years, evergreen_teams_yearly, evergreen_teams_monthly, snapshot_length):
    """
    Process college football games dataset according to specified filtering criteria.

    Reads ``all_cfb_games_1924_2024.csv`` from the working directory, filters it,
    writes the processed games to a CSV whose name encodes the options used and,
    when ``snapshot_length`` is set, also writes per-snapshot network statistics,
    a README and (for the evergreen case) an ``.npz`` archive of adjacency matrices.

    Args:
        start_year: First year to keep (inclusive).
        end_year: Last year to keep (inclusive).
        exclude_years: Iterable of years inside the range to drop (may be empty).
        evergreen_teams_yearly: If True, keep only games between teams that
            appear in every retained year.
        evergreen_teams_monthly: If True (and ``evergreen_teams_yearly`` is True),
            additionally require each team to appear in every September,
            October and November of every retained year that has games.
        snapshot_length: ``"year"``, ``"month"`` or ``None``. When set, each
            game's timestamp is replaced by the UTC start of its year/month.

    Returns:
        DataFrame with columns timestamp, Team1, Team2, Conf1, Conf2 sorted by
        timestamp.
    """
    input_file = 'all_cfb_games_1924_2024.csv'
    df = pd.read_csv(input_file)
    print(f"Initial dataset size: {len(df)} games")

    # unit='s' interprets the stored integers as seconds since the epoch (UTC).
    df['datetime'] = pd.to_datetime(df['timestamp'], unit='s')
    df['year'] = df['datetime'].dt.year
    df['month'] = df['datetime'].dt.month

    excluded_years = tuple(exclude_years) if exclude_years else ()

    # .copy() so the timestamp column can be overwritten below without
    # assigning into a slice of the unfiltered frame.
    df = df[
        (df['year'] >= start_year) &
        (df['year'] <= end_year) &
        (~df['year'].isin(excluded_years))
    ].copy()
    print(f"After year filtering: {len(df)} games")

    all_teams = _teams_in(df)
    print(f"Total unique teams in filtered years: {len(all_teams)}")

    if evergreen_teams_yearly:
        yearly_teams = {}
        for year in range(start_year, end_year + 1):
            if year in excluded_years:
                continue
            teams = _teams_in(df[df['year'] == year])
            yearly_teams[year] = teams
            print(f"Year {year}: {len(teams)} teams")

        # set.intersection() with no arguments raises, so guard the case where
        # no year survives the range/exclusion filter.
        evergreen_yearly = set.intersection(*yearly_teams.values()) if yearly_teams else set()
        print(f"Teams present in every year: {len(evergreen_yearly)}")

        evergreen_teams = evergreen_yearly
        if evergreen_teams_monthly:
            season_months = [9, 10, 11]
            monthly_teams = {}

            for year in yearly_teams:
                year_df = df[df['year'] == year]
                for month in season_months:
                    month_df = year_df[year_df['month'] == month]

                    # A month with no games at all is skipped rather than
                    # treated as eliminating every team.
                    if not month_df.empty:
                        teams = _teams_in(month_df)
                        monthly_teams[(year, month)] = teams
                        print(f"Year {year}, Month {month}: {len(teams)} teams")

            # Actually apply the monthly requirement: previously the per-month
            # sets were computed but never used, so this flag had no effect.
            evergreen_teams = evergreen_yearly.intersection(*monthly_teams.values())
            print(f"Teams present in every season: {len(evergreen_teams)}")

        df = df[
            (df['Team1'].isin(evergreen_teams)) &
            (df['Team2'].isin(evergreen_teams))
        ]
        print(f"After evergreen team filtering: {len(df)} games")

        print("Evergreen teams:")
        print(sorted(evergreen_teams))

    # handle temporal snapshots
    # calendar.timegm interprets the tuple as UTC, matching how 'datetime' was
    # derived; a naive datetime(...).timestamp() would use the machine's local
    # timezone and shift the snapshot boundaries on non-UTC hosts.
    if snapshot_length == "year":
        df['timestamp'] = df['datetime'].apply(
            lambda x: calendar.timegm((x.year, 1, 1, 0, 0, 0))
        )
    elif snapshot_length == "month":
        df['timestamp'] = df['datetime'].apply(
            lambda x: calendar.timegm((x.year, x.month, 1, 0, 0, 0))
        )

    output_df = df[['timestamp', 'Team1', 'Team2', 'Conf1', 'Conf2']].sort_values('timestamp')

    # The output file name encodes every option that shaped the dataset.
    output_file = f'processed_cfb_games_{start_year}_{end_year}.csv'
    if exclude_years:
        output_file = output_file.replace('.csv', f'_excl{"_".join(map(str, exclude_years))}.csv')
    if evergreen_teams_yearly:
        output_file = output_file.replace('.csv', '_evergreen_yearly.csv')
    if evergreen_teams_monthly:
        output_file = output_file.replace('.csv', '_monthly.csv')
    if snapshot_length:
        output_file = output_file.replace('.csv', f'_snapshot_{snapshot_length}.csv')
    else:
        print("snapshot length not seen")

    output_df.to_csv(output_file, index=False)

    if snapshot_length:
        stats_file = output_file.replace('.csv', '_network_stats.csv')
        readme_file = output_file.replace('.csv', '_README.md')

        # In the evergreen case every snapshot graph gets the same node set,
        # including teams with no game in that particular snapshot.
        all_nodes = _teams_in(output_df) if evergreen_teams_yearly else None

        network_stats = []
        adjacency_matrices = []
        timestamps = []
        node_labels = None

        for timestamp, group in output_df.groupby('timestamp'):
            stats, adj_matrix, nodes = calculate_network_stats(group, timestamp, all_nodes)
            network_stats.append(stats)
            adjacency_matrices.append(adj_matrix)
            timestamps.append(timestamp)

            if node_labels is None:
                node_labels = nodes

        stats_df = pd.DataFrame(network_stats)

        summary_stats = {
            'Metric': [],
            'Min': [],
            'Max': [],
            'Median': [],
            'Mean': []
        }

        for column in ['num_edges', 'num_nodes', 'num_components', 'clustering_coefficient',
                      'density', 'avg_degree', 'diameter']:
            # With no snapshots at all the stats frame has no columns.
            if column not in stats_df.columns:
                continue

            if column == 'diameter':
                # Disconnected snapshots report an infinite diameter; leave
                # them out of the summary.
                values = [x for x in stats_df[column] if x != float('inf')]
                if not values:
                    continue
            else:
                values = stats_df[column]

            summary_stats['Metric'].append(column)
            summary_stats['Min'].append(min(values))
            summary_stats['Max'].append(max(values))
            summary_stats['Median'].append(median(values))
            summary_stats['Mean'].append(mean(values))

        summary_df = pd.DataFrame(summary_stats)

        if evergreen_teams_yearly:
            npz_file = output_file.replace('.csv', '_adjacency_matrices.npz')
            np.savez(
                npz_file,
                matrices=[matrix.toarray() for matrix in adjacency_matrices],
                timestamps=timestamps,
                node_labels=node_labels
            )

        readme_content = f"""# College Football Games Network Analysis

Dataset Parameters:
- Years: {start_year}-{end_year} (excluding {exclude_years})
- Evergreen Teams (Yearly): {evergreen_teams_yearly}
- Evergreen Teams (Monthly): {evergreen_teams_monthly}
- Snapshot Length: {snapshot_length}

## Network Statistics Summary
{summary_df.to_string(index=False)}

## Files Generated
1. {output_file} - Processed game data
2. {stats_file} - Per-snapshot network statistics
3. {readme_file} - This documentation file"""

        if evergreen_teams_yearly:
            matrix_shape = adjacency_matrices[0].shape if adjacency_matrices else (0, 0)
            readme_content += f"\n4. {npz_file} - Adjacency matrices for each snapshot"
            readme_content += "\n\nAdjacency Matrix Information:"
            readme_content += f"\n- Number of snapshots: {len(timestamps)}"
            readme_content += f"\n- Matrix size: {matrix_shape}"
            readme_content += "\n- Format: Each matrix is stored as a dense array"
            readme_content += "\n- Node labels are provided to maintain consistent node ordering"

        readme_content += f"\n\nNotes:"
        readme_content += f"\n- Timestamps are aggregated to {snapshot_length} intervals"
        readme_content += f"\n- Network statistics are calculated for each {snapshot_length} snapshot"
        if evergreen_teams_yearly:
            readme_content += "\n- Adjacency matrices maintain consistent node ordering across snapshots"

        stats_df.to_csv(stats_file, index=False)
        with open(readme_file, 'w') as f:
            f.write(readme_content)

        print("\nFiles saved:")
        print(f"1. {output_file}")
        print(f"2. {stats_file}")
        print(f"3. {readme_file}")
        if evergreen_teams_yearly:
            print(f"4. {npz_file}")

        print("\nNetwork Statistics Summary:")
        print(summary_df.to_string(index=False))

    print("\nFinal Summary:")
    print(f"Processed dataset saved to {output_file}")
    print(f"Total games (edges): {len(output_df)}")
    print(f"Total teams (nodes): {len(_teams_in(output_df))}")
    if len(output_df) > 0:
        print(f"Date range: {df['datetime'].min()} to {df['datetime'].max()}")

    return output_df

# Build the dataset with the options chosen in the configuration cell; the call
# also writes the processed CSV (and any snapshot artefacts) to the working directory.
df = make_custom_dataset(start_year, end_year, exclude_years, evergreen_teams_yearly, evergreen_teams_monthly, snapshot_length)

Initial dataset size: 68313 games
After year filtering: 12395 games
Total unique teams in filtered years: 249
Year 2009: 195 teams
Year 2010: 195 teams
Year 2011: 197 teams
Year 2012: 205 teams
Year 2013: 207 teams
Year 2014: 209 teams
Year 2015: 208 teams
Year 2016: 216 teams
Year 2017: 212 teams
Year 2018: 219 teams
Year 2019: 217 teams
Year 2021: 227 teams
Year 2022: 228 teams
Year 2023: 229 teams
Year 2024: 230 teams
Teams present in every year: 164
Year 2009, Month 9: 186 teams
Year 2009, Month 10: 129 teams
Year 2009, Month 11: 128 teams
Year 2010, Month 9: 189 teams
Year 2010, Month 10: 126 teams
Year 2010, Month 11: 126 teams
Year 2011, Month 9: 189 teams
Year 2011, Month 10: 125 teams
Year 2011, Month 11: 125 teams
Year 2012, Month 9: 195 teams
Year 2012, Month 10: 126 teams
Year 2012, Month 11: 132 teams
Year 2013, Month 9: 179 teams
Year 2013, Month 10: 130 teams
Year 2013, Month 11: 135 teams
Year 2014, Month 9: 173 teams
Year 2014, Month 10: 133 teams
Year 2014, Month 11: 

# Reading in saved adjacency matrices
(Note that adjacency matrices are only saved if `evergreen_teams_yearly` is `True` and `snapshot_length` is set to `"year"` or `"month"` — both conditions are satisfied in the default version of the notebook.)

In [6]:
# Rebuild the archive name from the configuration values, applying the same
# suffixes in the same order as make_custom_dataset does when it saves the file.
npz_file = f'processed_cfb_games_{start_year}_{end_year}'
if exclude_years:
    npz_file += f'_excl{"_".join(map(str, exclude_years))}'
if evergreen_teams_yearly:
    npz_file += '_evergreen_yearly'
if evergreen_teams_monthly:
    npz_file += '_monthly'
if snapshot_length:
    npz_file += f'_snapshot_{snapshot_length}'
npz_file += '_adjacency_matrices.npz'

# The archive is written from dense matrices, integer timestamps and a list of
# team-name strings, so pickle support is left at its default (disabled).
# The context manager closes the underlying file once the arrays have been read.
with np.load(npz_file) as data:
    matrices = data['matrices']
    timestamps = data['timestamps']
    node_labels = data['node_labels']

print("times (unix): ", timestamps)
print("nodes: ", node_labels)

times (unix):  [1251763200 1254355200 1257033600 1259625600 1283299200 1285891200
 1288569600 1291161600 1314835200 1317427200 1320105600 1322697600
 1343779200 1346457600 1349049600 1351728000 1354320000 1375315200
 1377993600 1380585600 1383264000 1385856000 1406851200 1409529600
 1412121600 1414800000 1417392000 1441065600 1443657600 1446336000
 1448928000 1470009600 1472688000 1475280000 1477958400 1480550400
 1501545600 1504224000 1506816000 1509494400 1512086400 1533081600
 1535760000 1538352000 1541030400 1543622400 1564617600 1567296000
 1569888000 1572566400 1575158400 1627776000 1630454400 1633046400
 1635724800 1638316800 1659312000 1661990400 1664582400 1667260800
 1669852800 1690848000 1693526400 1696118400 1698796800 1701388800
 1722470400 1725148800 1727740800 1730419200 1733011200]
nodes:  ['Air Force' 'Akron' 'Alabama' 'App State' 'Arizona' 'Arizona State'
 'Arkansas' 'Arkansas State' 'Army' 'Auburn' 'BYU' 'Ball State' 'Baylor'
 'Boise State' 'Boston College' 'Bowling 