<a href="https://colab.research.google.com/github/JacobH140/century-of-college-football/blob/main/college_football_make_custom_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
!wget -O all_cfb_games_1924_2024.csv https://github.com/JacobH140/century-of-college-football/raw/main/all_cfb_games_1924_2024.csv


--2024-11-21 22:13:31--  https://github.com/JacobH140/century-of-college-football/raw/main/all_cfb_games_1924_2024.csv
Resolving github.com (github.com)... 140.82.112.4
Connecting to github.com (github.com)|140.82.112.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/JacobH140/century-of-college-football/main/all_cfb_games_1924_2024.csv [following]
--2024-11-21 22:13:31--  https://raw.githubusercontent.com/JacobH140/century-of-college-football/main/all_cfb_games_1924_2024.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3485791 (3.3M) [text/plain]
Saving to: ‘all_cfb_games_1924_2024.csv’


2024-11-21 22:13:32 (56.7 MB/s) - ‘all_cfb_games_1924_2024.csv’ saved [3485791/3485791]



# Specify these

In [9]:
##### EDIT THESE #####
start_year = 2009
end_year = 2024
exclude_years = (2020,)

# decide whether to only consider teams continuously present in the data — either present every year, or present every month
# (these should be True e.g. if you plan on applying a dynamic community detection algorithm that assumes a constant node set V)
evergreen_teams_yearly = True
evergreen_teams_monthly = True; assert evergreen_teams_yearly or not evergreen_teams_monthly, "Enabling evergreen_teams_monthly requires enabling evergreen_teams_yearly"

# Optionally, segment the data into year-long or month-long snapshots
# (two teams are connected at snapshot i iff they played during the relevant snapshot interval)
snapshot_length = "year"; assert (snapshot_length is None) or (snapshot_length in ("year", "month"))


######################

# Then run this

In [14]:
import pandas as pd
import networkx as nx
from datetime import datetime
import calendar
from statistics import mean, median

def calculate_network_stats(df, timestamp):
    """Calculate network statistics for a given snapshot."""
    # Create network from edges
    G = nx.Graph()
    edges = list(zip(df['Team1'], df['Team2']))
    G.add_edges_from(edges)

    stats = {
        'timestamp': timestamp,
        'num_edges': len(edges),
        'num_nodes': len(G.nodes()),
        'num_components': nx.number_connected_components(G),
        'clustering_coefficient': nx.average_clustering(G),
        'density': nx.density(G),
        'avg_degree': sum(dict(G.degree()).values()) / len(G.nodes()) if len(G.nodes()) > 0 else 0,
        'diameter': max(nx.eccentricity(G).values()) if nx.is_connected(G) else float('inf')
    }
    return stats

def make_custom_dataset(start_year, end_year, exclude_years, evergreen_teams_yearly, evergreen_teams_monthly, snapshot_length):
    """
    Process college football games dataset according to specified filtering criteria.

    Args:
        start_year (int): Starting year for the dataset
        end_year (int): Ending year for the dataset
        exclude_years (tuple): Years to exclude from the dataset
        evergreen_teams_yearly (bool): If True, only include teams present in every year
        evergreen_teams_monthly (bool): If True, only include teams present in every month
        snapshot_length (str): Either "year", "month", or None for temporal aggregation
    """
    # Read the original dataset
    input_file = f'all_cfb_games_{1924}_{2024}.csv'
    df = pd.read_csv(input_file)
    print(f"Initial dataset size: {len(df)} games")

    # Convert timestamp to datetime for easier processing
    df['datetime'] = pd.to_datetime(df['timestamp'], unit='s')
    df['year'] = df['datetime'].dt.year
    df['month'] = df['datetime'].dt.month

    # Filter by year range and exclude specified years
    df = df[
        (df['year'] >= start_year) &
        (df['year'] <= end_year) &
        (~df['year'].isin(exclude_years))
    ]
    print(f"After year filtering: {len(df)} games")

    # Get all teams (both home and away)
    all_teams = set(pd.concat([df['Team1'], df['Team2']]).unique())
    print(f"Total unique teams in filtered years: {len(all_teams)}")

    if evergreen_teams_yearly:
        # For each year, get the teams that played
        yearly_teams = {}
        for year in range(start_year, end_year + 1):
            if year not in exclude_years:
                year_df = df[df['year'] == year]
                teams = set(pd.concat([year_df['Team1'], year_df['Team2']]).unique())
                yearly_teams[year] = teams
                print(f"Year {year}: {len(teams)} teams")

        # Keep only teams that appear in every year
        evergreen_yearly = set.intersection(*yearly_teams.values())
        print(f"Teams present in every year: {len(evergreen_yearly)}")

        if evergreen_teams_monthly:
            # Get the typical CFB season months (September to November)
            season_months = [9, 10, 11]
            monthly_teams = {}

            for year in yearly_teams.keys():
                year_df = df[df['year'] == year]
                for month in season_months:
                    month_df = year_df[year_df['month'] == month]

                    if not month_df.empty:
                        teams = set(pd.concat([month_df['Team1'], month_df['Team2']]).unique())
                        monthly_teams[(year, month)] = teams
                        print(f"Year {year}, Month {month}: {len(teams)} teams")

            # Consider a team evergreen if it plays at least one game in each season
            evergreen_teams = evergreen_yearly
            print(f"Teams present in every season: {len(evergreen_teams)}")
        else:
            evergreen_teams = evergreen_yearly

        # Filter the dataframe to keep only evergreen teams
        df = df[
            (df['Team1'].isin(evergreen_teams)) &
            (df['Team2'].isin(evergreen_teams))
        ]
        print(f"After evergreen team filtering: {len(df)} games")

        # Print the evergreen teams for verification
        print("Evergreen teams:")
        print(sorted(evergreen_teams))

    # Handle temporal snapshots if specified
    if snapshot_length:
        if snapshot_length == "year":
            # Set all timestamps to January 1st of their respective years
            df['timestamp'] = df['datetime'].apply(
                lambda x: int(datetime(x.year, 1, 1).timestamp())
            )
        elif snapshot_length == "month":
            # Set all timestamps to the 1st of their respective months
            df['timestamp'] = df['datetime'].apply(
                lambda x: int(datetime(x.year, x.month, 1).timestamp())
            )

    # Select final columns and sort
    output_df = df[['timestamp', 'Team1', 'Team2', 'Conf1', 'Conf2']].sort_values('timestamp')

    # Save to CSV
    output_file = f'processed_cfb_games_{start_year}_{end_year}.csv'
    if exclude_years:
        output_file = output_file.replace('.csv', f'_excl{"_".join(map(str, exclude_years))}.csv')
    if evergreen_teams_yearly:
        output_file = output_file.replace('.csv', '_evergreen_yearly.csv')
    if evergreen_teams_monthly:
        output_file = output_file.replace('.csv', '_monthly.csv')
    if snapshot_length:
        output_file = output_file.replace('.csv', f'_snapshot_{snapshot_length}.csv')

    output_df.to_csv(output_file, index=False)

    # Calculate network statistics if snapshots are specified
    if snapshot_length:
        # Group by snapshot timestamps
        network_stats = []
        for timestamp, group in output_df.groupby('timestamp'):
            stats = calculate_network_stats(group, timestamp)
            network_stats.append(stats)

        # Convert to DataFrame for easy analysis
        stats_df = pd.DataFrame(network_stats)

        # Calculate summary statistics
        summary_stats = {
            'Metric': [],
            'Min': [],
            'Max': [],
            'Median': [],
            'Mean': []
        }

        for column in ['num_edges', 'num_nodes', 'num_components', 'clustering_coefficient',
                      'density', 'avg_degree', 'diameter']:
            if column == 'diameter':
                values = [x for x in stats_df[column] if x != float('inf')]
                if not values:  # Skip if no finite values
                    continue
            else:
                values = stats_df[column]

            summary_stats['Metric'].append(column)
            summary_stats['Min'].append(min(values))
            summary_stats['Max'].append(max(values))
            summary_stats['Median'].append(median(values))
            summary_stats['Mean'].append(mean(values))

        summary_df = pd.DataFrame(summary_stats)

        # Create README content
        readme_content = f"""# College Football Games Network Analysis

Dataset Parameters:
- Years: {start_year}-{end_year} (excluding {exclude_years})
- Evergreen Teams (Yearly): {evergreen_teams_yearly}
- Evergreen Teams (Monthly): {evergreen_teams_monthly}
- Snapshot Length: {snapshot_length}

## Network Statistics Summary
{summary_df.to_string(index=False)}

## Files Generated
1. {output_file} - Processed game data
2. {output_file.replace('.csv', '_network_stats.csv')} - Per-snapshot network statistics
3. {output_file.replace('.csv', '_README.md')} - This documentation file

## Notes
- Timestamps are aggregated to {snapshot_length} intervals
- Network statistics are calculated for each {snapshot_length} snapshot
- Diameter calculations exclude disconnected graphs (marked as infinity)
"""

        # Save additional files
        stats_df.to_csv(output_file.replace('.csv', '_network_stats.csv'), index=False)
        with open(output_file.replace('.csv', '_README.md'), 'w') as f:
            f.write(readme_content)

        print("\nFiles saved:")
        print(f"1. {output_file}")
        print(f"2. {output_file.replace('.csv', '_network_stats.csv')}")
        print(f"3. {output_file.replace('.csv', '_README.md')}")

        # Print summary statistics
        print("\nNetwork Statistics Summary:")
        print(summary_df.to_string(index=False))

    # Print final summary statistics
    print("\nFinal Summary:")
    print(f"Processed dataset saved to {output_file}")
    print(f"Total games (edges): {len(output_df)}")
    print(f"Total teams (nodes): {len(set(pd.concat([output_df['Team1'], output_df['Team2']]).unique()))}")
    if len(output_df) > 0:
        print(f"Date range: {min(df['datetime'])} to {max(df['datetime'])}")

    return output_df


make_custom_dataset(start_year, end_year, exclude_years, evergreen_teams_yearly, evergreen_teams_monthly, snapshot_length)


Initial dataset size: 68313 games
After year filtering: 12395 games
Total unique teams in filtered years: 249
Year 2009: 195 teams
Year 2010: 195 teams
Year 2011: 197 teams
Year 2012: 205 teams
Year 2013: 207 teams
Year 2014: 209 teams
Year 2015: 208 teams
Year 2016: 216 teams
Year 2017: 212 teams
Year 2018: 219 teams
Year 2019: 217 teams
Year 2021: 227 teams
Year 2022: 228 teams
Year 2023: 229 teams
Year 2024: 230 teams
Teams present in every year: 164
Year 2009, Month 9: 186 teams
Year 2009, Month 10: 129 teams
Year 2009, Month 11: 128 teams
Year 2010, Month 9: 189 teams
Year 2010, Month 10: 126 teams
Year 2010, Month 11: 126 teams
Year 2011, Month 9: 189 teams
Year 2011, Month 10: 125 teams
Year 2011, Month 11: 125 teams
Year 2012, Month 9: 195 teams
Year 2012, Month 10: 126 teams
Year 2012, Month 11: 132 teams
Year 2013, Month 9: 179 teams
Year 2013, Month 10: 130 teams
Year 2013, Month 11: 135 teams
Year 2014, Month 9: 173 teams
Year 2014, Month 10: 133 teams
Year 2014, Month 11: 

Unnamed: 0,timestamp,Team1,Team2,Conf1,Conf2
55377,1230768000,Bowling Green,Troy,Mid-American,Sun Belt
55896,1230768000,North Texas,Western Kentucky,Sun Belt,Sun Belt
55897,1230768000,Northwestern,Penn State,Big Ten,Big Ten
55898,1230768000,Fresno State,Utah State,Western Athletic,Western Athletic
55899,1230768000,Idaho,Louisiana Tech,Western Athletic,Western Athletic
...,...,...,...,...,...
67775,1704067200,Kent State,Eastern Michigan,Mid-American,Mid-American
67776,1704067200,Ohio,Akron,Mid-American,Mid-American
67777,1704067200,UNLV,Fresno State,Mountain West,Mountain West
67766,1704067200,NC State,Northern Illinois,ACC,Mid-American
