In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
import re
from datetime import datetime

## 1 - Looking at the data

### 1.1 - Data Description
- `season`: The NFL season year. Int
- `week_num`: The week number of the NFL season. Int
- `day_of_week`: The day of the week the game was played. String
- `gametime_local`: The local time the game started. String
- `home_team`: The name of the home team. String
- `away_team`: The name of the away team. String
- `home_score`: The score of the home team. Int
- `away_score`: The score of the away team. Int
- `OT_flag`: Indicates if the game went into overtime. String (e.g., "OT" or empty)
- `arrests`: The number of arrests made during and after the game. Int
- `division_game`: Indicates if the game was a divisional matchup. String (e.g., "y" or "n")

### 1.2 - Data overview

In [None]:
# Read the 2011-2015 arrests CSV into a DataFrame and report its dimensions
csv_path = "nfl_arrests_2011-2015.csv"
df = pd.read_csv(csv_path)
print(f"Dataset shape: {df.shape}")

In [None]:
# Preview the first ten games with the notebook's rich table rendering
first_rows = df.head(10)
display(first_rows)

In [None]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns
summary_stats = df.describe()
summary_stats

In [None]:
# Print column names, non-null counts and dtypes; the non-null counts show
# which columns contain missing values.
df.info()

From this overview of the data, my first big problem is the number of categorical features. There is a lot of categorical data I need to convert, but most of it shouldn't be bad. I could convert the OT flag to 0 if false and 1 if true and do the same for division game. I can convert game time and day to integers as well, but converting teams through one-hot encoding might make the data set too large. I need to research some ideas.

### 1.5 - Check for nulls

In [None]:
# Count the missing entries in every column
nulls = df.isna().sum()
print("\nNull values per column:")
print(nulls)

### 1.7 - Finding basic stats

In [None]:
# Number of games recorded for each season, ordered by year
print("\nGames per season:")
games_by_season = df['season'].value_counts().sort_index()
display(games_by_season)

In [None]:
# Number of games played on each day of the week, most frequent first
print("\nGames per day of week:")
games_by_day = df['day_of_week'].value_counts()
display(games_by_day)

In [None]:
# Average number of arrests across all games (missing values are skipped)
mean_arrests = df['arrests'].mean()
print("\nMean arrests per game:", mean_arrests)

## 2 - Cleaning the data

This is a hard section for me because I want to fill nulls, convert categorical features, add new features, and scale everything before I can start to run predictions on anything.

### 2.1 - Fill null values

In [None]:
# A missing OT_flag is replaced with the placeholder 'N'
ot_filled = df['OT_flag'].fillna('N')
df['OT_flag'] = ot_filled

# Re-count the missing values to confirm OT_flag no longer has any
nulls = df.isna().sum()
print("\nNull values per column:")
print(nulls)

In [None]:
# 'arrests' is the prediction target. Filling a missing target with the mean
# would invent labels, so rows without an arrest count are dropped instead.
# The index is reset so later positional concatenations stay aligned.
df = df.dropna(subset=['arrests']).reset_index(drop=True)

# Next fill the remaining numerical feature columns with their mean
numeric_columns = df.select_dtypes(include=[np.number]).columns
feature_columns = numeric_columns.drop('arrests')
# NOTE(review): the imputer is fitted on the full data set before the
# train/test split, so test rows contribute to the fill values.
imputer = SimpleImputer(strategy='mean')
df[feature_columns] = imputer.fit_transform(df[feature_columns])

# Check to see if it worked
nulls = df.isnull().sum()
print("\nNull values per column:")
print(nulls)

Now all nulls are filled, so we can start converting.

### 2.5 - Convert categorical to numerical

First I'm going to start by one-hot encoding the day of the week.

In [None]:
# How many games fall on each distinct day of the week
day_counts = df["day_of_week"].value_counts()
day_counts

5 values means 5 features will be added, but it is very annoying that I have to add Wednesday as a feature even though it only appears once.

In [None]:
# One-hot encode day_of_week into one 0/1 column per observed day
day_encoder = OneHotEncoder(sparse_output=False)
day_encoded = day_encoder.fit_transform(df[['day_of_week']])
day_encoded_df = pd.DataFrame(
    day_encoded,
    columns=[f'day_{cat}' for cat in day_encoder.categories_[0]],
    # Reuse df's index: concat(axis=1) aligns on index labels, so a fresh
    # 0..n-1 index would misalign rows whenever df's index is not the default.
    index=df.index,
)
df = pd.concat([df, day_encoded_df], axis=1)
df.drop('day_of_week', axis=1, inplace=True)

display(df.head(10))

Next is teams

In [None]:
# Number of home games recorded per team
home_team_counts = df["home_team"].value_counts()
home_team_counts

There are definitely too many teams here to one-hot encode, because I would have to do it for both the away and home teams. After researching online, I think label encoding will work the best out of my options. It assigns each team an integer value, and the home and away columns each store the integer that corresponds to their team. It shouldn't be too bad to make it work for this.

In [None]:
# Label-encode teams: every team name that appears as a home or away side
# gets one integer id, assigned in alphabetical order.
home_names = set(df['home_team'].unique())
away_names = set(df['away_team'].unique())
all_teams = sorted(home_names | away_names)
team_to_id = dict(zip(all_teams, range(len(all_teams))))

# Replace the two name columns with their id equivalents
for side in ('home', 'away'):
    df[f'{side}_team_id'] = df[f'{side}_team'].map(team_to_id)
df.drop(columns=['home_team', 'away_team'], inplace=True)

display(df.head(10))

In [None]:
# Lookup table showing which integer id was assigned to each team
team_encoding_df = pd.DataFrame({
    'Team': list(team_to_id.keys()),
    'Team_ID': list(team_to_id.values()),
})
display(team_encoding_df.sort_values('Team_ID'))

Label encoding worked great here and now I just have to fix a couple more values and I should be done with converting soon

In [None]:
# Convert OT_flag to 0/1: 1 only when the flag is exactly 'OT'
df['overtime'] = df['OT_flag'].eq('OT').astype(int)
df.drop('OT_flag', axis=1, inplace=True)

# Convert division_game to 0/1: 1 for 'y' in any letter case, 0 otherwise.
# Using the .str accessor means a missing value becomes 0 rather than
# raising AttributeError, which calling x.lower() on NaN would do.
df['division_game'] = df['division_game'].str.lower().eq('y').astype(int)

# Check if worked
display(df.head(10))

Lastly, I need to convert game time to numerical. This turned out to be pretty hard because I tried to convert to hours and just round, but that didn't work at all and messed up the data anyway. After looking online for a bit, I found the `re` library and a Stack Overflow question with basically the same problem, so I used the function from there and it worked great.

In [None]:
# Convert game time
def time_to_minutes(time_str):
    """Convert a 12-hour clock string into minutes after midnight.

    Recognises ``H:MM:SS AM/PM`` and ``H:MM AM/PM`` anywhere in the string;
    the AM/PM marker is matched case-insensitively and seconds are ignored.

    Returns:
        The number of minutes after midnight as an int, or None when the
        value is not a string (e.g. NaN for a missing time) or contains no
        recognisable time.
    """
    # Missing values arrive as float NaN / None; re.search would raise on them
    if not isinstance(time_str, str):
        return None

    match = re.search(r'(\d+):(\d+)(?::\d+)?\s*([AP]M)', time_str, re.IGNORECASE)
    if match is None:
        return None

    hour, minute, period = match.groups()
    hour = int(hour)
    period = period.upper()
    if period == 'PM' and hour < 12:
        # 1 PM - 11 PM map to 13 - 23; 12 PM stays 12
        hour += 12
    elif period == 'AM' and hour == 12:
        # 12 AM is midnight
        hour = 0
    return hour * 60 + int(minute)

# Replace the kickoff time string with its minutes-after-midnight value
minutes_after_midnight = df['gametime_local'].apply(time_to_minutes)
df['game_minutes'] = minutes_after_midnight
df.drop(columns='gametime_local', inplace=True)

# Preview the result
display(df.head(10))

Now, after all the conversion, every feature is numerical, and I didn't have to use one-hot encoding much, so the data set isn't extremely big either. Next I want to add a couple more features to help the future analysis.

### 2.7 - Feature engineering

For the new features, I decided to add total points, score difference, home team win, high scoring, and game competitiveness, all of which are explained further in the code comments in their cells.

In [None]:
# total_points: combined score of both teams (home + away)
df['total_points'] = df['home_score'].add(df['away_score'])

In [None]:
# score_diff: home score minus away score (positive when the home team led)
df['score_diff'] = df['home_score'].sub(df['away_score'])

In [None]:
# home_win: 1 when the home team outscored the away team, 0 otherwise
# (a tie therefore counts as 0)
df['home_win'] = df['score_diff'].gt(0).astype(int)

In [None]:
# high_scoring: 1 when a game's total points are strictly above the median
# total across all games, 0 otherwise
median_score = df['total_points'].median()
df['high_scoring'] = df['total_points'].gt(median_score).astype(int)

In [None]:
# Lastly is game competitiveness: 1 minus the share of the total points that
# the winning margin represents, so 1.0 is a tie and 0.0 is a shutout.
margin_share = df['score_diff'].abs() / df['total_points']
# A 0-0 game would divide zero by zero and produce NaN; treat it as a tie
# (margin share 0). Rows whose total is itself missing stay NaN.
margin_share = margin_share.where(df['total_points'] != 0, 0.0)
df['competitive_game'] = 1 - margin_share

Now we can check if they all worked

In [None]:
# Preview the frame with the engineered feature columns appended
engineered_preview = df.head(10)
display(engineered_preview)

It looks perfect, and now that those features are added it's finally time to scale.

### 2.8 - Splitting and Scaling

I forgot that I need to split my data before scaling, so I'm going to do that quickly and then scale.

In [None]:
# Target is the arrest count; every other column is a feature
X = df.drop(columns='arrests')
y = df['arrests']

# Hold out 20% of the games for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"Training set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")

Now I can actually scale

In [None]:
# Columns to standardise. The 0/1 indicator columns are left alone because
# scaling them adds nothing; the team id columns are also not in this list.
cols_to_scale = ['season', 'week_num', 'home_score', 'away_score', 
                'game_minutes', 'total_points', 'score_diff', 'competitive_game']

# The frames returned by train_test_split are derived from X, and pandas can
# emit SettingWithCopyWarning when columns are assigned into them. Taking
# explicit copies makes the assignments below unambiguous.
X_train = X_train.copy()
X_test = X_test.copy()

# Create and fit scaler using training data only
scaler = StandardScaler()
X_train[cols_to_scale] = scaler.fit_transform(X_train[cols_to_scale])

# Apply the training-set mean/std to the test data (no refit)
X_test[cols_to_scale] = scaler.transform(X_test[cols_to_scale])


Now that the training and test data is scaled I can output the head for each and see if I can finally be done with cleaning

In [None]:
# Show the first three rows of each split after cleaning and scaling
train_preview = X_train.head(3)
print("Final training data after cleaning: ")
display(train_preview)

test_preview = X_test.head(3)
print("Final test data after cleaning: ")
display(test_preview)

It looks good and now I am finally done with cleaning.

A quick summary of what I did in part 2: I filled all nulls, converted all categorical data to numerical with one-hot encoding and label encoding, added 5 new features to help with future analysis, split the data into training and test sets while making sure my target (arrests) was not included in the features, and lastly scaled both the training and test data.

## 3 - Early model predictions