In [0]:
# Read the project's Delta table into `sdf`; every later cell starts from this frame.
# NOTE(review): the last cell of this notebook overwrites the Gold path with a reduced
# column set (no `evt_rating` / `PRCP_3day_avg`), so pointing this load at Gold would
# break the modelling cells below. Keep reading Silver unless that changes.
silver_path = uri + "PDGA_project/Silver"
delta_reader = spark.read.format("delta")
sdf = delta_reader.load(silver_path)

sdf.printSchema()

In [0]:
# `F` is first imported by the modelling cell that follows this one, so on a fresh
# kernel (Restart & Run All) this cell would stop with NameError. Import it here so
# the cell is runnable in notebook order.
from pyspark.sql import functions as F

# Show the mean and maximum of the 3-day precipitation feature.
sdf.select(F.avg("PRCP_3day_avg"), F.max("PRCP_3day_avg")).show()

In [0]:
# SIMPLE LINEAR REGRESSION: C1X_Putt_Pct ~ AWND_3day_avg + PRCP_3day_avg + TMAX_3day_avg
# + PLOT: Predicted C1X_Putt_Pct vs Wind

from pyspark.sql import SparkSession, functions as F
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
import matplotlib.pyplot as plt
import numpy as np


# --- Column typing ------------------------------------------------------------
# The label plus the three weather predictors are all cast to double.
feature_cols = ["AWND_3day_avg", "PRCP_3day_avg", "TMAX_3day_avg"]
numeric_cols = ["C1X_Putt_Pct"] + feature_cols
for col_name in numeric_cols:
    sdf = sdf.withColumn(col_name, F.col(col_name).cast("double"))

# --- Complete cases only --------------------------------------------------------
# Rows with a null in the label or in any predictor are excluded from the fit.
sdf_simple = sdf.na.drop(subset=numeric_cols)

# --- Feature vector -------------------------------------------------------------
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
sdf_assembled = assembler.transform(sdf_simple)

# --- Fit and report -------------------------------------------------------------
lr = LinearRegression(
    featuresCol="features",
    labelCol="C1X_Putt_Pct",
    predictionCol="prediction",
)
model = lr.fit(sdf_assembled)
summary = model.summary

print("=== Simple Linear Regression: C1X_Putt_Pct ~ AWND + PRCP + TMAX ===")
print("RMSE:", summary.rootMeanSquaredError)
print("R2:", summary.r2)
print("Coefficients (AWND, PRCP, TMAX):", model.coefficients)
print("Intercept:", model.intercept)

# --- Prediction grid ------------------------------------------------------------
# Wind sweeps its observed range; precipitation and temperature stay at their means.
stats = sdf_simple.agg(
    F.min("AWND_3day_avg").alias("min_wind"),
    F.max("AWND_3day_avg").alias("max_wind"),
    F.avg("PRCP_3day_avg").alias("mean_prcp"),
    F.avg("TMAX_3day_avg").alias("mean_tmax"),
).first()

min_wind, max_wind, mean_prcp, mean_tmax = (
    float(stats[key]) for key in ("min_wind", "max_wind", "mean_prcp", "mean_tmax")
)

# 50 evenly spaced wind values; tolist() yields plain Python floats for Spark.
wind_vals = np.linspace(min_wind, max_wind, 50)
pred_input_rows = [(wind, mean_prcp, mean_tmax) for wind in wind_vals.tolist()]
pred_input = spark.createDataFrame(pred_input_rows, feature_cols)

# Score the grid with the fitted model and bring the curve back to pandas, sorted by wind.
pred_features = assembler.transform(pred_input)
scored = model.transform(pred_features)
predicted = (
    scored.select("AWND_3day_avg", "prediction")
          .orderBy("AWND_3day_avg")
          .toPandas()
)

# --- Plot: predicted C1X putting vs wind ----------------------------------------
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(predicted["AWND_3day_avg"], predicted["prediction"])
ax.set_xlabel("Average Wind Speed (AWND_3day_avg)")
ax.set_ylabel("Predicted C1X Putting %")
ax.set_title("Predicted C1X_Putt_Pct vs Wind\n(PRCP & TMAX held at their mean values)")
ax.grid(True)
plt.show()


In [0]:
# BETTER SIMPLE MODEL:
# Convert PRCP → mm (NOAA = tenths of mm)
# Fit Linear Regression: C1X_Putt_Pct ~ AWND + PRCP_mm + TMAX
# Plot predicted C1X vs Wind

from pyspark.sql import SparkSession, functions as F
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
import matplotlib.pyplot as plt
import numpy as np

# Reuse the active Spark session (or create one if none exists).
spark = SparkSession.builder.getOrCreate()

# --- 1. Column typing -----------------------------------------------------------
# No load step here: this cell works on the `sdf` frame that is already in scope.
numeric_cols = ["C1X_Putt_Pct", "AWND_3day_avg", "PRCP_3day_avg", "TMAX_3day_avg"]
for col_name in numeric_cols:
    sdf = sdf.withColumn(col_name, F.col(col_name).cast("double"))

# --- 2. Precipitation rescaling -------------------------------------------------
# Divides the raw value by 10 to produce PRCP_mm.
# NOTE(review): this assumes PRCP_3day_avg is stored in tenths of mm, and the plot
# below labels wind as mph; neither unit is established inside this notebook.
# Confirm both against the upstream weather extract.
sdf = sdf.withColumn("PRCP_mm", F.col("PRCP_3day_avg") / 10.0)

# --- 3. Complete cases only -----------------------------------------------------
predictor_cols = ["AWND_3day_avg", "PRCP_mm", "TMAX_3day_avg"]
sdf_simple = sdf.na.drop(subset=["C1X_Putt_Pct"] + predictor_cols)

# --- 4. Feature vector ----------------------------------------------------------
assembler = VectorAssembler(inputCols=predictor_cols, outputCol="features")
sdf_assembled = assembler.transform(sdf_simple)

# --- 5. Fit and report ----------------------------------------------------------
lr = LinearRegression(
    featuresCol="features",
    labelCol="C1X_Putt_Pct",
    predictionCol="prediction",
)
model = lr.fit(sdf_assembled)
summary = model.summary

print("=== Improved Linear Regression: C1X_Putt_Pct ~ AWND + PRCP_mm + TMAX ===")
print("RMSE:", summary.rootMeanSquaredError)
print("R2:", summary.r2)
print("Coefficients (AWND, PRCP_mm, TMAX):", model.coefficients)
print("Intercept:", model.intercept)

# --- 6. Prediction grid ---------------------------------------------------------
# Wind sweeps its observed range; PRCP_mm and TMAX are pinned at their sample means.
stats = sdf_simple.agg(
    F.min("AWND_3day_avg").alias("min_wind"),
    F.max("AWND_3day_avg").alias("max_wind"),
    F.avg("PRCP_mm").alias("mean_prcp"),
    F.avg("TMAX_3day_avg").alias("mean_tmax"),
).first()

min_wind, max_wind, mean_prcp, mean_tmax = (
    float(stats[key]) for key in ("min_wind", "max_wind", "mean_prcp", "mean_tmax")
)

wind_vals = np.linspace(min_wind, max_wind, 50)
rows = [(wind, mean_prcp, mean_tmax) for wind in wind_vals.tolist()]
pred_input = spark.createDataFrame(rows, predictor_cols)

pred_feat = assembler.transform(pred_input)
scored = model.transform(pred_feat)
predicted = scored.select("AWND_3day_avg", "prediction").orderBy("AWND_3day_avg").toPandas()

# --- 7. Plot: predicted C1X vs wind ---------------------------------------------
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(predicted["AWND_3day_avg"], predicted["prediction"], linewidth=2)
ax.set_xlabel("Average Wind Speed (mph)")
ax.set_ylabel("Predicted C1X Putting %")
ax.set_title("Predicted C1X_Putt_Pct vs Wind\n(PRCP & TMAX held at their mean values)")
ax.grid(True)
plt.show()


In [0]:
# CORRECT SKILL-AWARE MODEL:
#  - Compute avg_rating per player
#  - Convert PRCP -> mm
#  - Linear Regression: C1X ~ AWND + PRCP_mm + TMAX + avg_rating
#  - Plot predicted C1X vs Wind

from pyspark.sql import functions as F
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
import matplotlib.pyplot as plt
import numpy as np


# --- 1. Column typing -----------------------------------------------------------
# Rating, label and raw weather columns are cast to double, in this order.
for col_name in ["evt_rating", "C1X_Putt_Pct", "AWND_3day_avg", "PRCP_3day_avg", "TMAX_3day_avg"]:
    sdf = sdf.withColumn(col_name, F.col(col_name).cast("double"))

# --- 2. Player skill + precipitation rescaling ----------------------------------
# avg_rating is the mean evt_rating over all of a player's rows in `sdf`.
# NOTE(review): that mean includes the row's own event, so the skill feature is not
# independent of the outcome being modelled — confirm this is intended.
player_skill = sdf.groupBy("player_name").agg(F.avg("evt_rating").alias("avg_rating"))

# Left join keeps every row of `sdf`; PRCP_mm is the raw precipitation divided by 10.
sdf2 = (
    sdf.join(player_skill, on="player_name", how="left")
       .withColumn("PRCP_mm", F.col("PRCP_3day_avg") / 10.0)
)

# --- 3. Complete cases only -----------------------------------------------------
predictor_cols = ["AWND_3day_avg", "PRCP_mm", "TMAX_3day_avg", "avg_rating"]
model_cols = ["C1X_Putt_Pct"] + predictor_cols
sdf_model = sdf2.na.drop(subset=model_cols)

# --- 4. Feature vector (weather + skill) ----------------------------------------
assembler = VectorAssembler(inputCols=predictor_cols, outputCol="features")
sdf_assembled = assembler.transform(sdf_model)

# --- 5. Fit and report ----------------------------------------------------------
lr = LinearRegression(
    featuresCol="features",
    labelCol="C1X_Putt_Pct",
    predictionCol="prediction",
)
model = lr.fit(sdf_assembled)
summary = model.summary

print("=== TRUE Skill-Aware Regression ===")
print("RMSE:", summary.rootMeanSquaredError)
print("R2:", summary.r2)
print("Coefficients (AWND, PRCP_mm, TMAX, avg_rating):", model.coefficients)
print("Intercept:", model.intercept)

# --- 6. Prediction curve vs wind ------------------------------------------------
# Wind sweeps its observed range; the other three predictors stay at their means.
stats = sdf_model.agg(
    F.min("AWND_3day_avg").alias("min_wind"),
    F.max("AWND_3day_avg").alias("max_wind"),
    F.avg("PRCP_mm").alias("mean_prcp"),
    F.avg("TMAX_3day_avg").alias("mean_tmax"),
    F.avg("avg_rating").alias("mean_rating"),
).first()

wind_vals = np.linspace(float(stats["min_wind"]), float(stats["max_wind"]), 50)

held_constant = (
    float(stats["mean_prcp"]),
    float(stats["mean_tmax"]),
    float(stats["mean_rating"]),
)
rows = [(wind,) + held_constant for wind in wind_vals.tolist()]

pred_input = spark.createDataFrame(rows, predictor_cols)

pred_feat = assembler.transform(pred_input)
scored = model.transform(pred_feat)
predicted = scored.select("AWND_3day_avg", "prediction").orderBy("AWND_3day_avg").toPandas()

# --- 7. Plot: predicted C1X vs wind ---------------------------------------------
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(predicted["AWND_3day_avg"], predicted["prediction"], linewidth=2)
ax.set_xlabel("Average Wind (mph)")
ax.set_ylabel("Predicted C1X Putting %")
ax.set_title("Predicted C1X Putting vs Wind\n(PRCP, TMAX, and Skill Held Constant)")
ax.grid(True)
plt.show()


In [0]:
# WIND × SKILL INTERACTION MODEL
# C1X_Putt_Pct ~ AWND + PRCP_mm + TMAX + avg_rating + (AWND * avg_rating)
# Produces two prediction lines: low-skill vs high-skill players

from pyspark.sql import functions as F
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
import matplotlib.pyplot as plt
import numpy as np


# 1. Ensure numeric types
# Rating, label and raw weather columns are cast to double.

for numeric_col in ["evt_rating", "C1X_Putt_Pct", "AWND_3day_avg", "PRCP_3day_avg", "TMAX_3day_avg"]:
    sdf = sdf.withColumn(numeric_col, F.col(numeric_col).cast("double"))


# 2. Compute player skill: avg_rating
# Mean evt_rating per player, joined back onto every row (left join keeps all rows).

player_skill = (
    sdf.groupBy("player_name")
       .agg(F.avg("evt_rating").alias("avg_rating"))
)

sdf2 = sdf.join(player_skill, "player_name", "left")


# 3. Fix PRCP units
# Divides by 10; assumes PRCP_3day_avg is in tenths of mm — TODO confirm upstream.

sdf2 = sdf2.withColumn("PRCP_mm", F.col("PRCP_3day_avg") / 10.0)


# 4. Create Interaction Term: AWND × avg_rating
# Defined once (the column was previously assigned twice with the same expression).

sdf2 = sdf2.withColumn(
    "AWND_x_skill",
    (F.col("AWND_3day_avg") * F.col("avg_rating")).cast("double")
)


# 5. Drop missing rows

model_cols = [
    "C1X_Putt_Pct", "AWND_3day_avg", "PRCP_mm",
    "TMAX_3day_avg", "avg_rating", "AWND_x_skill"
]

sdf_model = sdf2.dropna(subset=model_cols)


# 6. Assemble features
# Coefficient order below follows this inputCols order.

feature_cols = [
    "AWND_3day_avg", "PRCP_mm", "TMAX_3day_avg",
    "avg_rating", "AWND_x_skill"
]

assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

sdf_assembled = assembler.transform(sdf_model)


# 7. Fit Linear Regression

lr = LinearRegression(
    featuresCol="features",
    labelCol="C1X_Putt_Pct",
    predictionCol="prediction"
)

model = lr.fit(sdf_assembled)
summary = model.summary

print("=== WIND × SKILL INTERACTION MODEL ===")
print("RMSE:", summary.rootMeanSquaredError)
print("R2:", summary.r2)
print("\nCoefficients:")
print(" AWND:", model.coefficients[0])
print(" PRCP_mm:", model.coefficients[1])
print(" TMAX:", model.coefficients[2])
print(" avg_rating:", model.coefficients[3])
print(" AWND_x_skill:", model.coefficients[4])
print("Intercept:", model.intercept)


# 8. Prediction curves for LOW vs HIGH skill
# "Low" / "high" skill are the approximate 10th / 90th percentiles of avg_rating over
# the modelling rows; PRCP_mm and TMAX are held at their means.

stats = sdf_model.agg(
    F.min("AWND_3day_avg").alias("min_wind"),
    F.max("AWND_3day_avg").alias("max_wind"),
    F.avg("PRCP_mm").alias("mean_prcp"),
    F.avg("TMAX_3day_avg").alias("mean_tmax"),
    F.percentile_approx("avg_rating", 0.1).alias("low_skill"),
    F.percentile_approx("avg_rating", 0.9).alias("high_skill")
).collect()[0]

min_wind = float(stats["min_wind"])
max_wind = float(stats["max_wind"])
mean_prcp = float(stats["mean_prcp"])
mean_tmax = float(stats["mean_tmax"])
low_skill = float(stats["low_skill"])
high_skill = float(stats["high_skill"])

wind_vals = np.linspace(min_wind, max_wind, 60)

# Build prediction rows.
# Every value is converted to a plain Python float: `w` is a numpy.float64, and so is
# `w * skill`; Spark's schema inference does not reliably accept NumPy scalars, which
# is why the interaction term is computed from float(w) rather than from w directly.
rows_low = [(float(w), mean_prcp, mean_tmax, low_skill, float(w) * low_skill)
            for w in wind_vals]

rows_high = [(float(w), mean_prcp, mean_tmax, high_skill, float(w) * high_skill)
             for w in wind_vals]

# Create DataFrames
pred_low = spark.createDataFrame(rows_low, feature_cols)
pred_high = spark.createDataFrame(rows_high, feature_cols)

# Score both grids; sort by wind before collecting so the plotted line is drawn
# left-to-right, matching what the earlier model cells do.
pred_low = (
    model.transform(assembler.transform(pred_low))
         .orderBy("AWND_3day_avg")
         .toPandas()
)
pred_high = (
    model.transform(assembler.transform(pred_high))
         .orderBy("AWND_3day_avg")
         .toPandas()
)


# 9. Plot LOW vs HIGH skill curves

plt.figure(figsize=(8,5))
plt.plot(pred_low["AWND_3day_avg"], pred_low["prediction"],
         label="Low Skill Player", linewidth=2)
plt.plot(pred_high["AWND_3day_avg"], pred_high["prediction"],
         label="High Skill Player", linewidth=2)

plt.xlabel("Average Wind Speed (mph)")
plt.ylabel("Predicted C1X Putting %")
plt.title("Wind × Skill Interaction Effect on C1X Putting")
plt.legend()
plt.grid(True)
plt.show()


In [0]:
# EVENT FIXED EFFECTS MODEL
# C1X ~ AWND + PRCP_mm + TMAX + avg_rating + (AWND * avg_rating) + event fixed effects

from pyspark.sql import functions as F
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline
import matplotlib.pyplot as plt
import numpy as np


# --- 1. Column typing -----------------------------------------------------------
# Rating, label and raw weather columns are cast to double, in this order.
for col_name in ["evt_rating", "C1X_Putt_Pct", "AWND_3day_avg", "PRCP_3day_avg", "TMAX_3day_avg"]:
    sdf = sdf.withColumn(col_name, F.col(col_name).cast("double"))

# --- 2. Skill, precipitation rescaling, interaction -----------------------------
# avg_rating: mean evt_rating per player, left-joined back onto every row.
player_skill = sdf.groupBy("player_name").agg(F.avg("evt_rating").alias("avg_rating"))

# PRCP_mm is the raw precipitation divided by 10; AWND_x_skill is wind × avg_rating.
sdf2 = (
    sdf.join(player_skill, on="player_name", how="left")
       .withColumn("PRCP_mm", (F.col("PRCP_3day_avg") / 10.0).cast("double"))
       .withColumn("AWND_x_skill", (F.col("AWND_3day_avg") * F.col("avg_rating")).cast("double"))
)

# --- 3. Complete cases only -----------------------------------------------------
# The event id is included so that every modelling row can be assigned to an event.
numeric_features = ["AWND_3day_avg", "PRCP_mm", "TMAX_3day_avg", "avg_rating", "AWND_x_skill"]
model_cols = ["C1X_Putt_Pct"] + numeric_features + ["pdga_event_id"]
sdf_model = sdf2.na.drop(subset=model_cols)

# --- 4. Event fixed effects -----------------------------------------------------
# pdga_event_id -> integer index -> one-hot vector.
# NOTE(review): both stages use handleInvalid="keep" while the encoder keeps its
# default dropLast; with that combination it looks like no real event is dropped as a
# reference level, which would make the dummies collinear with the intercept —
# confirm against the Spark ML docs. Likewise, if the weather columns are constant
# within an event, their main-effect coefficients are not separately identified once
# event dummies are present — verify before interpreting them.
event_indexer = StringIndexer(
    inputCol="pdga_event_id", outputCol="event_idx", handleInvalid="keep"
)
event_encoder = OneHotEncoder(
    inputCols=["event_idx"], outputCols=["event_vec"], handleInvalid="keep"
)

# --- 5. Feature vector: numeric predictors first, then the event dummies --------
assembler = VectorAssembler(
    inputCols=numeric_features + ["event_vec"],
    outputCol="features",
)

# --- 6. Estimator and pipeline --------------------------------------------------
lr = LinearRegression(
    featuresCol="features",
    labelCol="C1X_Putt_Pct",
    predictionCol="prediction",
    maxIter=100,
)

stages = [event_indexer, event_encoder, assembler, lr]
pipeline = Pipeline(stages=stages)

# The fitted regression is the last stage of the fitted pipeline.
model = pipeline.fit(sdf_model)
lr_model = model.stages[-1]
summary = lr_model.summary

# --- 7. Report ------------------------------------------------------------------
print("=== EVENT FIXED EFFECTS MODEL ===")
print("RMSE:", summary.rootMeanSquaredError)
print("R2:", summary.r2)

print("\nCoefficients:")
# The first five coefficients follow the order of the numeric predictors above.
coef_labels = (" AWND:", " PRCP_mm:", " TMAX:", " avg_rating:", " AWND_x_skill:")
for position, label in enumerate(coef_labels):
    print(label, lr_model.coefficients[position])

# Everything after the numeric predictors belongs to the event one-hot vector.
print("\nNumber of Event Fixed Effects:", len(lr_model.coefficients) - len(coef_labels))
print("Intercept:", lr_model.intercept)


In [0]:
# Columns published to the Gold layer: identifiers, the label, and the model features.
serving_cols = [
    "pdga_event_id", "player_name", "event_date",
    "C1X_Putt_Pct", "AWND_3day_avg", "PRCP_mm", "TMAX_3day_avg",
    "avg_rating", "AWND_x_skill",
]
serving = sdf_model.select(*serving_cols)

# Replace whatever is currently stored at the Gold path with this frame (Delta format).
gold_path = uri + "PDGA_project/Gold"
serving.write.format("delta").mode("overwrite").save(gold_path)
