# PGA Tour Data Preprocessing

This notebook focuses on preparing our cleaned dataset for modeling by:
1. Loading the cleaned dataset
2. Splitting into training and testing sets
3. Scaling features
4. Preparing final datasets for modeling

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

# Set style for visualizations.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'
# and later releases removed the old names, so prefer the new name and fall
# back to the legacy one only on older Matplotlib versions.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)
sns.set_palette('husl')

  plt.style.use('seaborn')


In [2]:
# Read the cleaned PGA Tour data and take a first look at it
cleaned_csv_path = '../data/pga_tour_cleaned.csv'
df = pd.read_csv(cleaned_csv_path)

print(f"Dataset shape: {df.shape}")
print(f"\nFeatures: {df.columns.tolist()}")
print("\nSample of the data:")
df.head()

Dataset shape: (36864, 21)

Features: ['is_top_10', 'finish_SDP', 'finish_FDP', 'finish_DKP', 'total_FDP', 'total_DKP', 'total_SDP', 'streak_FDP', 'hole_FDP', 'hole_DKP', 'streak_DKP', 'hole_SDP', 'streak_SDP', 'sg_total', 'sg_t2g', 'pos', 'n_rounds', 'hole_par', 'made_cut', 'strokes', 'no_cut']

Sample of the data:


Unnamed: 0,is_top_10,finish_SDP,finish_FDP,finish_DKP,total_FDP,total_DKP,total_SDP,streak_FDP,hole_FDP,hole_DKP,...,hole_SDP,streak_SDP,sg_total,sg_t2g,pos,n_rounds,hole_par,made_cut,strokes,no_cut
0,0,0,1,2,59.7,65.0,59,7.6,51.1,60.0,...,56,3,0.85,0.65,32.0,4,288,1,289,0
1,0,2,4,5,78.5,85.5,66,13.0,61.5,72.5,...,61,3,1.6,1.24,18.0,4,288,1,286,0
2,0,0,0,0,17.4,21.5,27,0.0,17.4,21.5,...,27,0,-0.54,0.02,999.0,2,144,0,147,0
3,0,0,0,0,14.0,20.5,17,0.4,13.6,20.5,...,17,0,-2.54,-1.08,999.0,2,144,0,151,0
4,0,0,0,0,19.3,23.5,23,1.2,18.1,23.5,...,23,0,-1.04,-1.56,999.0,2,144,0,148,0


## 1. Data Splitting
Split the data into training and testing sets, stratifying on the target variable so that both sets keep the same class proportions (roughly 10% top-10 finishes).

In [3]:
# Pull the label column out of the frame; every other column is a feature
target_col = 'is_top_10'
y = df[target_col]
X = df.drop(columns=[target_col])

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits, and the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")
print("\nTarget distribution:")
for heading, labels in (("Training set:\n", y_train), ("\nTesting set:\n", y_test)):
    print(heading, labels.value_counts(normalize=True))

Training set shape: (29491, 20)
Testing set shape: (7373, 20)

Target distribution:
Training set:
 is_top_10
0    0.899291
1    0.100709
Name: proportion, dtype: float64

Testing set:
 is_top_10
0    0.899363
1    0.100637
Name: proportion, dtype: float64


## 2. Feature Scaling
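Scale the features to have zero mean and unit variance using StandardScaler. The scaler is fit on the training set only and then applied to the test set, so no information from the test set leaks into the scaling parameters.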

In [4]:
# Standardize every feature to zero mean and unit variance
scaler = StandardScaler()

# Fit on the training split only, then reuse those statistics for the test
# split. The scaler returns plain arrays, so wrap them back into DataFrames
# with the original column names AND the original row index: without
# `index=` the frames get a fresh 0..n-1 index and no longer line up with
# y_train / y_test in index-based operations (concat, join, .loc).
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

print("Scaled feature statistics (training set):")
print(X_train_scaled.describe())

Scaled feature statistics (training set):
         finish_SDP    finish_FDP    finish_DKP     total_FDP     total_DKP  \
count  2.949100e+04  2.949100e+04  2.949100e+04  2.949100e+04  2.949100e+04   
mean  -1.156490e-17  4.577774e-18  2.867132e-17  1.219133e-16 -9.492857e-17   
std    1.000017e+00  1.000017e+00  1.000017e+00  1.000017e+00  1.000017e+00   
min   -4.039028e-01 -4.453636e-01 -5.136458e-01 -2.260257e+00 -1.965816e+00   
25%   -4.039028e-01 -4.453636e-01 -5.136458e-01 -8.849229e-01 -9.286490e-01   
50%   -4.039028e-01 -4.453636e-01 -5.136458e-01 -6.271256e-02  3.937337e-02   
75%   -4.039028e-01 -2.818578e-02  1.053360e-01  7.266094e-01  7.135318e-01   
max    4.759146e+00  5.812304e+00  5.676172e+00  4.437020e+00  5.225208e+00   

          total_SDP    streak_FDP      hole_FDP      hole_DKP    streak_DKP  \
count  2.949100e+04  2.949100e+04  2.949100e+04  2.949100e+04  2.949100e+04   
mean   6.264322e-18  1.599811e-16 -2.298524e-16 -1.089028e-16 -3.854967e-18   
std    1.

## 3. Save Processed Datasets
Save the preprocessed training and testing sets for modeling.

In [5]:
# Make sure the output folder exists before writing anything into it
import os
processed_dir = '../data/processed'
os.makedirs(processed_dir, exist_ok=True)

# Write each split to its own CSV file, without the row index
outputs = {
    'X_train': X_train_scaled,
    'X_test': X_test_scaled,
    'y_train': y_train,
    'y_test': y_test,
}
for stem, data in outputs.items():
    data.to_csv(f'{processed_dir}/{stem}.csv', index=False)

print("Saved processed datasets to:", processed_dir)

Saved processed datasets to: ../data/processed
