In [None]:
# ------------------------------------------
# 1. Import Libraries
# ------------------------------------------
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, sum
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# ------------------------------------------
# 2. Initialize Spark Session
# ------------------------------------------
# getOrCreate() hands back an already-running session when one exists,
# otherwise it builds a new one with the settings requested here.
spark = (
    SparkSession.builder
    .appName("Crop Yield Prediction - Linear Regression")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

In [None]:
# ------------------------------------------
# 3. Load dataset
# ------------------------------------------
# The CSV has a header row; column types are inferred from the file contents.
file_path = "crop_yield.csv"
data = spark.read.csv(file_path, header=True, inferSchema=True)

# head(1) returns a list holding at most one Row, so the emptiness check stops
# after the first row instead of counting every row in the dataset.
if data.head(1):
    print("Dataset loaded successfully!")
else:
    print("No data found in the dataset.")


Dataset loaded successfully!


In [None]:
# Print the column names, inferred types and nullability of the loaded
# DataFrame as a tree, to confirm the CSV was parsed as expected.
data.printSchema()

root
 |-- Region: string (nullable = true)
 |-- Soil_Type: string (nullable = true)
 |-- Crop: string (nullable = true)
 |-- Rainfall_mm: double (nullable = true)
 |-- Temperature_Celsius: double (nullable = true)
 |-- Fertilizer_Used: boolean (nullable = true)
 |-- Irrigation_Used: boolean (nullable = true)
 |-- Weather_Condition: string (nullable = true)
 |-- Days_to_Harvest: integer (nullable = true)
 |-- Yield_tons_per_hectare: double (nullable = true)



In [None]:
# Preview the first five rows as a text table
data.show(n=5)

+------+---------+-------+-----------------+-------------------+---------------+---------------+-----------------+---------------+----------------------+
|Region|Soil_Type|   Crop|      Rainfall_mm|Temperature_Celsius|Fertilizer_Used|Irrigation_Used|Weather_Condition|Days_to_Harvest|Yield_tons_per_hectare|
+------+---------+-------+-----------------+-------------------+---------------+---------------+-----------------+---------------+----------------------+
|  West|    Sandy| Cotton|897.0772391101236| 27.676966373377603|          false|           true|           Cloudy|            122|     6.555816258223593|
| South|     Clay|   Rice|992.6732816189208|  18.02614225436302|           true|           true|            Rainy|            140|       8.5273409063236|
| North|     Loam| Barley|147.9980252926104|  29.79404241557257|          false|          false|            Sunny|            106|     1.127443335982929|
| North|    Sandy|Soybean|986.8663313367325|  16.64419019137728|          fa

In [None]:
# ------------------------------------------
# 4. Check for Missing Values
# ------------------------------------------
print("\nMissing Values in Each Column:")

# One aggregate per column: isNull() is cast to 0/1 and summed, giving the
# number of nulls in that column. NOTE: `sum` here is the Spark SQL aggregate
# imported from pyspark.sql.functions, which shadows Python's builtin sum().
null_count_exprs = [
    sum(col(name).isNull().cast("int")).alias(name)
    for name in data.columns
]
missing_values = data.select(*null_count_exprs)
missing_values.show()



Missing Values in Each Column:
+------+---------+----+-----------+-------------------+---------------+---------------+-----------------+---------------+----------------------+
|Region|Soil_Type|Crop|Rainfall_mm|Temperature_Celsius|Fertilizer_Used|Irrigation_Used|Weather_Condition|Days_to_Harvest|Yield_tons_per_hectare|
+------+---------+----+-----------+-------------------+---------------+---------------+-----------------+---------------+----------------------+
|     0|        0|   0|          0|                  0|              0|              0|                0|              0|                     0|
+------+---------+----+-----------+-------------------+---------------+---------------+-----------------+---------------+----------------------+



In [None]:
# ------------------------------------------
# 5. Handle Categorical Columns
#    - StringIndexer & OneHotEncoder
# ------------------------------------------
# NOTE(review): this cell rebinds `data` and drops the columns it reads, so it
# cannot be re-run on its own -- re-run the load cell first.
categorical_columns = ["Region", "Soil_Type", "Crop", "Weather_Condition"]

# 5.1 StringIndexer: fit one model per column, all against the same
# not-yet-indexed frame, producing "<column>_Index" columns.
indexers = [
    StringIndexer(inputCol=name, outputCol=f"{name}_Index").fit(data)
    for name in categorical_columns
]

# Apply the fitted indexers one after another.
for indexer in indexers:
    data = indexer.transform(data)

# The raw string columns are no longer needed once indexed.
data = data.drop(*categorical_columns)

# 5.2 One-hot encode every "<column>_Index" into a "<column>_OHE" vector column
encoder = OneHotEncoder(
    inputCols=[f"{name}_Index" for name in categorical_columns],
    outputCols=[f"{name}_OHE" for name in categorical_columns],
)
data = encoder.fit(data).transform(data)



In [None]:
# ------------------------------------------
# 6. Assemble All Features
# ------------------------------------------
# Every column is a feature except the label and the intermediate "*_Index"
# columns (their one-hot encoded "*_OHE" counterparts are used instead).
feature_columns = [
    name
    for name in data.columns
    if name != "Yield_tons_per_hectare" and not name.endswith("_Index")
]

# Pack the selected columns into a single "features" vector column and keep
# only that vector plus the label.
assembler = VectorAssembler(outputCol="features", inputCols=feature_columns)
full_data = (
    assembler
    .transform(data)
    .select("features", "Yield_tons_per_hectare")
)

In [None]:
# ------------------------------------------
# 7. Train/Test Split
# ------------------------------------------
# Random split with weights 0.9 (train) and 0.1 (test); the seed is fixed so
# the split can be repeated.
train_data, test_data = full_data.randomSplit(weights=[0.9, 0.1], seed=42)


In [None]:
# ------------------------------------------
# 8. Train Linear Regression (Full Features)
# ------------------------------------------
# regParam=0.0 turns regularization off, so this is a plain least-squares fit
# on the assembled feature vector.
lr = LinearRegression(
    labelCol="Yield_tons_per_hectare",
    featuresCol="features",
    regParam=0.0,
    elasticNetParam=0.0,
    maxIter=100,
)

# Fit on the training split only; the test split is held out for evaluation.
lr_model = lr.fit(train_data)

In [None]:
# ------------------------------------------
# 9. Evaluate Model Performance
# ------------------------------------------
# Score the held-out split with the fitted model.
predictions = lr_model.transform(test_data)

# A single evaluator, configured for RMSE, is reused for the other metrics by
# overriding metricName for that one call only.
evaluator = RegressionEvaluator(
    metricName="rmse",
    predictionCol="prediction",
    labelCol="Yield_tons_per_hectare",
)
rmse = evaluator.evaluate(predictions)
r2 = evaluator.evaluate(predictions, {evaluator.metricName: "r2"})

# 9.1 Calculate and Display Model Accuracy
# ------------------------------------------
# Mean Absolute Error (MAE) as an additional error metric
mae = evaluator.evaluate(predictions, {evaluator.metricName: "mae"})

# The "accuracy" reported below is R^2 expressed as a percentage, not a
# classification accuracy.
accuracy_percentage = r2 * 100

print(f"RMSE (Full Model) = {rmse:.3f}")
print(f"R^2 (Full Model)  = {r2:.3f}")
print(f"Mean Absolute Error (Full Model) = {mae:.3f}")
print(f"Model Accuracy = {accuracy_percentage:.2f}%")

RMSE (Full Model) = 0.500
R^2 (Full Model)  = 0.913
Mean Absolute Error (Full Model) = 0.399
Model Accuracy = 91.31%


In [None]:
# Add residuals (actual - predicted) to the Linear Regression predictions.
# The frame is derived from `predictions` (the scored test split) so this cell
# runs on a fresh kernel and gives the same result when re-run.
predictions_lr = predictions.withColumn(
    "Residuals",
    col("Yield_tons_per_hectare") - col("prediction"),
)

# Select only the required columns for export.
# NOTE: the name `export_rf` is kept for compatibility with any later cells;
# the rows come from the Linear Regression model trained above.
export_rf = predictions_lr.select(
    "Yield_tons_per_hectare",
    col("prediction").alias("Predicted_Yield"),
    "Residuals",
)

# Export to CSV. coalesce(1) collapses the data to one partition so a single
# part file is written; Spark treats "test.csv" as an output directory, and
# mode="overwrite" replaces it if it already exists.
export_rf.coalesce(1).write.csv("test.csv", header=True, mode="overwrite")
print("Linear Regression predictions exported successfully!")


Random Forest predictions with selected features exported successfully!
