<a href="https://colab.research.google.com/github/MattB234/first-CCAC-model/blob/main/CCACcode.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
# prompt: I want to import the "bracket_test.csv" and "CCAC 2025 - Institutions.csv"

import pandas as pd

# Assuming the files are in your current working directory
# If not, specify the full path to the files

try:
  bracket_df = pd.read_csv("bracket_training.csv")
  print("bracket_training.csv imported successfully!")
except FileNotFoundError:
  print("Error: bracket_training.csv not found.")


try:
  ccac_df = pd.read_csv("CCAC 2025 - Institutions.csv")
  print("CCAC 2025 - Institutions.csv imported successfully!")
except FileNotFoundError:
  print("Error: CCAC 2025 - Institutions.csv not found.")

# Now, let's explore the data to identify potential key fields
if 'bracket_df' in locals() and 'ccac_df' in locals():
  print("\n--- braket_training.csv fields ---")
  print(bracket_df.columns)
  print("\n--- CCAC 2025 - Institutions.csv fields ---")
  print(ccac_df.columns)

# Initial hypothesis: Look for fields related to team names, seeds, region, and historical performance
  potential_key_fields_bracket = ['BracketEntryId', 'RegionWinner_East', 'RegionWinner_West', 'RegionWinner_South', 'RegionWinner_Midwest', 'SemifinalWinner_East_West', 'SemifinalWinner_South_Midwest', 'NationalChampion']  # Example, adjust based on actual column names
  potential_key_fields_ccac = ['InstitutionID', 'InstitutionName', 'RegularSeasonWins', 'RegularSeasonLosses', 'RegularSeasonAverageAttendance', 'RegularSeasonAverageScore'] # Example, adjust based on actual column names

  # Further analysis needed based on the contents of each column to determine which are truly key.
  # For example, are there multiple teams with same name in different years?
  # Consider the unique values present in each field
  print("\n--- Potential Key Fields (Initial Hypothesis) ---")
  print("braket_training.csv:", potential_key_fields_bracket)
  print("CCAC 2025 - Institutions.csv:", potential_key_fields_ccac)


bracket_training.csv imported successfully!
CCAC 2025 - Institutions.csv imported successfully!

--- braket_training.csv fields ---
Index(['CustomerID', 'CustomerAreaCode', 'CustomerPostalCode',
       'CustomerPostalCodeLatitude', 'CustomerPostalCodeLongitude',
       'CustomerDMACode', 'CustomerDMADescription',
       'NCAACustomerRecordCreated', 'BracketEntryId',
       'BracketEntryCreatedDate', 'RegionWinner_East', 'RegionWinner_West',
       'RegionWinner_South', 'RegionWinner_Midwest',
       'SemifinalWinner_East_West', 'SemifinalWinner_South_Midwest',
       'NationalChampion'],
      dtype='object')

--- CCAC 2025 - Institutions.csv fields ---
Index(['InstitutionID', 'InstitutionName', 'InstitutionNickname',
       'InstitutionAbbreviation', 'InstitutionCity', 'InstitutionState',
       'InstitutionPostalCode', 'InstitutionDMACode',
       'InstitutionDMADescription', 'InstitutionLatitude',
       'InstitutionLongitude', 'InstitutionConference',
       'InstitutionEnrollment_

In [12]:
# prompt: Use these potential key fields to build a linear model that is able to predict 'SemifinalWinner_East_West', 'SemifinalWinner_South_Midwest', 'NationalChampion' for each BracketEntryID in "bracket_training.csv"

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Load the first CSV with selected coloumns of interest
bracket_training_df = pd.read_csv("bracket_training.csv", usecols=['BracketEntryId', 'RegionWinner_East', 'RegionWinner_West', 'RegionWinner_South', 'RegionWinner_Midwest', 'SemifinalWinner_East_West', 'SemifinalWinner_South_Midwest', 'NationalChampion'])

# Load the second CSV with selected columns of interest
institutions_df = pd.read_csv("CCAC 2025 - Institutions.csv", usecols=['InstitutionID', 'InstitutionName', 'RegularSeasonWins', 'RegularSeasonLosses', 'RegularSeasonAverageAttendance', 'RegularSeasonAverageScore'])

# Concatenate the selected columns (side by side)
merged_df = pd.concat([bracket_training_df, institutions_df], axis=1)

# Save the merged DataFrame to a new CSV file
merged_df.to_csv("merged_file.csv", index=False)


# Preprocessing
# 1. Handle missing values (replace with mean, median or remove rows/columns)
# 2. Convert categorical features (e.g. Region, Team names) to numerical using one-hot encoding
#    or other suitable methods

# Example preprocessing (replace with your actual preprocessing)
merged_df = merged_df.dropna()  # Drop rows with missing values (simple approach, consider imputation instead)
# Convert relevant categorical columns to numerical
# Example assuming the dataframe has 'Region' and 'Team'
merged_df = pd.get_dummies(merged_df, columns=['InstitutionName'], drop_first=True)  # One-hot encoding
# ...other preprocessing steps


# Feature Selection
# Use the potential key fields identified in previous code
potential_key_fields = ['InstitutionID', 'RegularSeasonWins', 'RegularSeasonLosses', 'RegularSeasonAverageAttendance', 'RegularSeasonAverageScore']

potential_key_fields += [col for col in merged_df.columns if 'InstitutionName_' in col]

target_variables = ['RegionWinner_East', 'RegionWinner_West', 'RegionWinner_South', 'RegionWinner_Midwest', 'SemifinalWinner_East_West', 'SemifinalWinner_South_Midwest', 'NationalChampion']
# Add one-hot encoded columns from your preprocessing steps to the 'potential_key_fields' list

# Model Training (Linear Regression)
X = merged_df[potential_key_fields] # Features
y = merged_df[target_variables] # Target variables

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) # Split the data

model = LinearRegression()
model.fit(X_train, y_train)

# Model Evaluation
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# Predictions for new data (replace with your actual new data)
# new_data = pd.DataFrame({'Seed': [1, 2], 'Year': [2023, 2024], ...}) #include your one hot encoded columns here
# predictions = model.predict(new_data)
# print(f"Predictions: \n{predictions}")


Mean Squared Error: 50619.56362193844
