In [4]:
!pip install sparkxgb

Collecting sparkxgb
  Downloading sparkxgb-0.2.tar.gz (3.9 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pyspark==3.5.4 (from sparkxgb)
  Downloading pyspark-3.5.4.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: sparkxgb, pyspark
  Building wheel for sparkxgb (setup.py) ... [?25l[?25hdone
  Created wheel for sparkxgb: filename=sparkxgb-0.2-py3-none-any.whl size=5629 sha256=db9b354b6a50120e23e39af293d639c73335922067aca6eebd4007291f89b178
  Stored in directory: /root/.cache/pip/wheels/f9/e5/73/8b34b9b0ba0e0793033077ead4ec8cb93329bcc71d53cfd7fc
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.4-py2.py3-none-any.whl size=317849769 sha256=727837bad261b178809e5eb98825332c499da32a97baff51e18e772abcf8cf00
  Stored in direc

In [8]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from sparkxgb import XGBoostRegressor
from pyspark.ml.evaluation import RegressionEvaluator

# Start Spark session.
# spark.jars.packages requests the XGBoost4J-Spark artifact (Scala 2.12 build,
# version 1.6.1) that the sparkxgb Python wrapper delegates to.
spark = (
    SparkSession.builder
    .appName("XGBoost Spark")
    .config("spark.jars.packages", "ml.dmlc:xgboost4j-spark_2.12:1.6.1")
    .getOrCreate()
)

# Load CSV file (first row is the header; column types are inferred by Spark).
# The untouched frame is kept as `raw_df` so that re-running the cleaning step
# below is idempotent.
data_path = "/content/part-00000-88d747c0-e1c3-4314-bcd6-a3bf54a570b7-c000.csv"
raw_df = spark.read.option("header", True).option("inferSchema", True).csv(data_path)

# Show schema and preview
raw_df.printSchema()
raw_df.show(5)

# Remove the date and the string-typed descriptive columns; only the numeric
# columns are used for modelling.
columns_to_drop = ["Date", "Brand_Name", "Ticker", "Industry_Tag", "Country"]

# Every remaining column is nullable in the inferred schema, and
# VectorAssembler's default handleInvalid="error" aborts the whole job on the
# first null/NaN value it meets. Drop incomplete rows up front so that a single
# bad record cannot fail training.
df = raw_df.drop(*columns_to_drop).na.drop()

# Define feature and label columns
label_col = 'Close'  # <-- change this if your target variable is named differently
feature_cols = [name for name in df.columns if name != label_col]

# Assemble features into a single vector column.
# NOTE(review): VectorAssembler may emit sparse vectors for rows that are mostly
# zeros (e.g. Dividends / Stock Splits), and the XGBoost4J-Spark docs describe
# absent sparse entries as being treated as missing values — confirm that this
# is acceptable for these features.
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
data = assembler.transform(df).select("features", label_col)

# Split into training and test sets (fixed seed for a repeatable split)
train_data, test_data = data.randomSplit([0.8, 0.2], seed=42)

# Define XGBoost Regressor
xgb = XGBoostRegressor(
    featuresCol="features",
    labelCol=label_col,
    predictionCol="prediction",

    objective="reg:squarederror",
    numRound=100,
    maxDepth=5,
    eta=0.1,
    numWorkers=2
)

# Train the model
model = xgb.fit(train_data)

# Make predictions.
# The result is consumed by three separate actions below (RMSE, R², CSV export);
# caching it avoids re-scoring the test set for each one.
predictions = model.transform(test_data).cache()

# Evaluate the model. The same evaluator object is reused: it is created for
# RMSE and then switched to R², so it is left configured for "r2" afterwards.
evaluator = RegressionEvaluator(labelCol=label_col, predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
r2 = evaluator.setMetricName("r2").evaluate(predictions)

print(f"RMSE: {rmse}")
print(f"R²: {r2}")

# Save predictions as CSV for Power BI.
# toPandas() collects every prediction row onto the driver before writing.
predictions.select("prediction", label_col).toPandas().to_csv(r"/content/xgboost.csv", index=False)

root
 |-- Date: date (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume: integer (nullable = true)
 |-- Dividends: double (nullable = true)
 |-- Stock Splits: double (nullable = true)
 |-- Brand_Name: string (nullable = true)
 |-- Ticker: string (nullable = true)
 |-- Industry_Tag: string (nullable = true)
 |-- Country: string (nullable = true)

+----------+--------+--------+--------+------+--------+---------+------------+----------+------+------------+-------+
|      Date|    Open|    High|     Low| Close|  Volume|Dividends|Stock Splits|Brand_Name|Ticker|Industry_Tag|Country|
+----------+--------+--------+--------+------+--------+---------+------------+----------+------+------------+-------+
|2000-01-03|14.96875|15.59375| 14.6875|  15.5| 7843200|      0.0|         0.0|       amd|   AMD|  technology|    usa|
|2000-01-04|  15.125|    15.5|14.59375|14.625| 6290200|  