In [None]:
# 1. Import necessary libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, sum
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.regression import LinearRegression, DecisionTreeRegressor, RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns



In [None]:
# 2. Initialize Spark Session
# getOrCreate() hands back an already-running session when one exists,
# otherwise it starts a new one with the settings below (4g requested for
# both the executor and the driver).
spark = (
    SparkSession.builder
    .appName("Crop Yield Prediction")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)


In [None]:
# 3. Load dataset
# Read the CSV using its first line as the header and let Spark infer the
# column types from the values.
file_path = "crop_yield.csv"
data = spark.read.csv(file_path, header=True, inferSchema=True)

# Emptiness check: fetch at most one row instead of counting every row.
# data.count() would trigger a full pass over the file just to decide which
# status message to print; head(1) returns a (possibly empty) list of rows.
if data.head(1):
    print("Dataset loaded successfully!")
else:
    print("No data found in the dataset.")

Dataset loaded successfully!


In [None]:
# Display dataset schema
# Prints every column with its inferred type and nullability as a tree,
# which is a quick way to confirm that inferSchema picked sensible types.
data.printSchema()

root
 |-- Region: string (nullable = true)
 |-- Soil_Type: string (nullable = true)
 |-- Crop: string (nullable = true)
 |-- Rainfall_mm: double (nullable = true)
 |-- Temperature_Celsius: double (nullable = true)
 |-- Fertilizer_Used: boolean (nullable = true)
 |-- Irrigation_Used: boolean (nullable = true)
 |-- Weather_Condition: string (nullable = true)
 |-- Days_to_Harvest: integer (nullable = true)
 |-- Yield_tons_per_hectare: double (nullable = true)



In [None]:
# Show the first few rows
# Prints the first 5 rows as a text table for a quick visual sanity check.
data.show(5)

+------+---------+-------+-----------------+-------------------+---------------+---------------+-----------------+---------------+----------------------+
|Region|Soil_Type|   Crop|      Rainfall_mm|Temperature_Celsius|Fertilizer_Used|Irrigation_Used|Weather_Condition|Days_to_Harvest|Yield_tons_per_hectare|
+------+---------+-------+-----------------+-------------------+---------------+---------------+-----------------+---------------+----------------------+
|  West|    Sandy| Cotton|897.0772391101236| 27.676966373377603|          false|           true|           Cloudy|            122|     6.555816258223593|
| South|     Clay|   Rice|992.6732816189208|  18.02614225436302|           true|           true|            Rainy|            140|       8.5273409063236|
| North|     Loam| Barley|147.9980252926104|  29.79404241557257|          false|          false|            Sunny|            106|     1.127443335982929|
| North|    Sandy|Soybean|986.8663313367325|  16.64419019137728|          fa

In [None]:
# 4. Check for missing values
# Build one aggregate per column: flag nulls as 1/0 and add them up, so the
# result is a single row holding the null count of every column.
# NOTE: `sum` here is pyspark.sql.functions.sum (imported at the top of the
# notebook), not the Python builtin.
print("\nMissing Values in Each Column:")
null_count_exprs = []
for column_name in data.columns:
    null_flag = col(column_name).isNull().cast("int")
    null_count_exprs.append(sum(null_flag).alias(column_name))

missing_values = data.select(null_count_exprs)
missing_values.show()


Missing Values in Each Column:
+------+---------+----+-----------+-------------------+---------------+---------------+-----------------+---------------+----------------------+
|Region|Soil_Type|Crop|Rainfall_mm|Temperature_Celsius|Fertilizer_Used|Irrigation_Used|Weather_Condition|Days_to_Harvest|Yield_tons_per_hectare|
+------+---------+----+-----------+-------------------+---------------+---------------+-----------------+---------------+----------------------+
|     0|        0|   0|          0|                  0|              0|              0|                0|              0|                     0|
+------+---------+----+-----------+-------------------+---------------+---------------+-----------------+---------------+----------------------+



In [None]:
# Summary statistics (count, mean, stddev, min, max) for the numeric columns,
# including the target Yield_tons_per_hectare.
summary_columns = [
    "Rainfall_mm",
    "Temperature_Celsius",
    "Days_to_Harvest",
    "Yield_tons_per_hectare",
]
numeric_summary = data.select(summary_columns).describe()
numeric_summary.show()


+-------+------------------+-------------------+------------------+----------------------+
|summary|       Rainfall_mm|Temperature_Celsius|   Days_to_Harvest|Yield_tons_per_hectare|
+-------+------------------+-------------------+------------------+----------------------+
|  count|           1000000|            1000000|           1000000|               1000000|
|   mean|  549.981900729366| 27.504965199661616|        104.495025|      4.64947248766303|
| stddev|259.85132027823227|  7.220607587682008|25.953412277174294|     1.696572451116516|
|    min|100.00089622522204| 15.000034141430271|                60|    -1.147613222534901|
|    max|  999.998098221668|  39.99999662316004|               149|     9.963372228814649|
+-------+------------------+-------------------+------------------+----------------------+



In [None]:
# Row counts per level of each categorical column, printed one table per
# column, to check how evenly the categories are represented.
for category_column in ("Region", "Soil_Type", "Crop"):
    data.groupBy(category_column).count().show()


+------+------+
|Region| count|
+------+------+
| South|250054|
|  East|249699|
|  West|250074|
| North|250173|
+------+------+

+---------+------+
|Soil_Type| count|
+---------+------+
|    Sandy|167119|
|     Loam|166795|
|     Clay|166352|
|   Chalky|166779|
|     Silt|166672|
|    Peaty|166283|
+---------+------+

+-------+------+
|   Crop| count|
+-------+------+
|  Maize|166824|
|Soybean|166349|
|  Wheat|166673|
| Cotton|166585|
|   Rice|166792|
| Barley|166777|
+-------+------+



In [None]:
# from pyspark.sql.functions import col
# from pyspark.ml.stat import Correlation
# from pyspark.ml.feature import VectorAssembler
# import matplotlib.pyplot as plt
# import seaborn as sns

# # Step 1: Select relevant numeric columns (ensure these columns exist in your dataset)
# numeric_columns = [
#     "Rainfall_mm", "Temperature_Celsius", "Fertilizer_Used",
#     "Irrigation_Used", "Days_to_Harvest", "Yield_tons_per_hectare"
# ]

# # Step 2: Assemble features into a vector
# assembler = VectorAssembler(inputCols=numeric_columns, outputCol="features")
# data_vector = assembler.transform(data).select("features")

# # Step 3: Compute the correlation matrix
# correlation_matrix = Correlation.corr(data_vector, "features").head()[0].toArray()

# # Step 4: Convert correlation matrix to a heatmap
# plt.figure(figsize=(12, 10))
# sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f", cbar=True,
#             xticklabels=numeric_columns, yticklabels=numeric_columns)
# plt.title("Correlation Heatmap of Features (PySpark)")
# plt.show()


In [None]:
# 5. Handle categorical columns using StringIndexer and OneHotEncoder
# Each string column is first mapped to a numeric index (<name>_Index) and
# then expanded into a one-hot vector (<name>_OHE).
# NOTE: this cell reassigns `data` and drops the original string columns, so
# it must be re-run from the load cell, not on its own output.
categorical_columns = ['Region', 'Soil_Type', 'Crop', 'Weather_Condition']

# Loop variables are deliberately not called `col`, which is the name of the
# pyspark.sql.functions.col helper imported at the top of the notebook.
index_columns = [f"{name}_Index" for name in categorical_columns]
ohe_columns = [f"{name}_OHE" for name in categorical_columns]

# 5.1 StringIndexer transformation
# A single multi-column indexer is fitted once, instead of fitting one
# indexer per column (each of which would scan the data separately).
string_indexer = StringIndexer(inputCols=categorical_columns, outputCols=index_columns)
data = string_indexer.fit(data).transform(data)

# Drop original categorical columns
data = data.drop(*categorical_columns)

# 5.2 One-hot encode the indexed columns
encoder = OneHotEncoder(inputCols=index_columns, outputCols=ohe_columns)
data = encoder.fit(data).transform(data)


In [None]:
# 6. Assemble all features into a single vector
# Every column is treated as a feature except the label itself and the
# intermediate *_Index columns (their one-hot *_OHE counterparts are used).
label_column = 'Yield_tons_per_hectare'
feature_columns = []
for column_name in data.columns:
    if column_name == label_column or column_name.endswith('_Index'):
        continue
    feature_columns.append(column_name)

assembler = VectorAssembler(outputCol="features", inputCols=feature_columns)
# Keep only what the model needs: the assembled vector and the label.
data = assembler.transform(data).select("features", label_column)


In [None]:
# 7. Split data into training and testing sets (80% / 20%, seeded split)
train_data, test_data = data.randomSplit([0.8, 0.2], seed=42)

# 8. Train a Random Forest Regressor (40 trees) on the training split.
# The seed is pinned, matching the split seed above, so that the forest's
# random sampling -- and therefore the reported metrics -- are reproducible
# from one kernel restart to the next.
rf = RandomForestRegressor(
    featuresCol="features",
    labelCol="Yield_tons_per_hectare",
    numTrees=40,
    seed=42,
)
rf_model = rf.fit(train_data)


In [None]:
# ------------------------------------------
# 9. Evaluate Random Forest Model Performance
# ------------------------------------------
# Score the held-out test split with the fitted forest.
predictions_rf = rf_model.transform(test_data)

# A single evaluator, configured for RMSE, is reused for every metric. The
# other metrics are requested through a per-call param override, which
# leaves evaluator_rf itself set to RMSE.
evaluator_rf = RegressionEvaluator(
    metricName="rmse",
    predictionCol="prediction",
    labelCol="Yield_tons_per_hectare",
)
rmse_rf = evaluator_rf.evaluate(predictions_rf)

# R^2 (coefficient of determination)
r2_rf = evaluator_rf.evaluate(predictions_rf, {evaluator_rf.metricName: "r2"})

print(f"RMSE (Random Forest Full Model) = {rmse_rf:.3f}")
print(f"R^2 (Random Forest Full Model)  = {r2_rf:.3f}")

# 9.1 Calculate and Display Random Forest Model Accuracy
# ------------------------------------------
# Mean Absolute Error (MAE) as an additional error metric
mae_rf = evaluator_rf.evaluate(predictions_rf, {evaluator_rf.metricName: "mae"})

# The "accuracy" reported below is simply R^2 expressed as a percentage; it
# is not a classification-style accuracy.
accuracy_percentage_rf = 100 * r2_rf

print(f"Mean Absolute Error (Random Forest Full Model) = {mae_rf:.3f}")
print(f"Random Forest Model Accuracy = {accuracy_percentage_rf:.2f}%")

RMSE (Random Forest Full Model) = 0.701
R^2 (Random Forest Full Model)  = 0.829
Mean Absolute Error (Random Forest Full Model) = 0.562
Random Forest Model Accuracy = 82.86%


In [None]:
# Attach residuals (actual yield minus predicted yield) to the predictions
residual_expr = col("Yield_tons_per_hectare") - col("prediction")
predictions_rf = predictions_rf.withColumn("Residuals", residual_expr)

# Keep only the columns needed for the export
export_rf = predictions_rf.select("Yield_tons_per_hectare", "prediction", "Residuals")

# Collapse to one partition so a single part file is written, include a
# header row, and overwrite any previous export at the same path.
# NOTE(review): Spark's CSV writer normally creates a directory at this path
# containing the part file, not a flat file -- confirm downstream readers
# expect that.
(
    export_rf.coalesce(1)
    .write
    .mode("overwrite")
    .option("header", True)
    .csv("random_forest.csv")
)
print("Random Forest predictions exported successfully!")


Random Forest predictions exported successfully!
