In [5]:
import os
# Set the Spark / PySpark environment variables for this process in one step.
os.environ.update(
    {
        "SPARK_HOME": r"C:\Spark\spark-3.4.3-bin-hadoop3",
        "PYSPARK_DRIVER_PYTHON": "Jupyter",
        "PYSPARK_DRIVER_PYTHON_OPTS": "notebook",
        "PYSPARK_PYTHON": "python",
    }
)

In [12]:
#import PySpark
from pyspark.sql import SparkSession
from pyspark.sql.functions import year, month

In [7]:
# Obtain the notebook's SparkSession under the "Global-Temperature" app name.
spark = (
    SparkSession.builder
    .appName("Global-Temperature")
    .getOrCreate()
)

In [11]:
# Read the city-level temperature CSV: the first row supplies the column
# names and Spark infers each column's type from the data.
file_path = r"C:\Users\ENGR WOLE\GlobalLandTemperaturesByCity.csv"
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(file_path)
)

In [14]:
# Discard every row that has a null in any column, then derive the calendar
# year and month from the "dt" column.
df_clean = (
    df.dropna()
    .withColumn("Year", year("dt"))
    .withColumn("Month", month("dt"))
)

# Mean of AverageTemperature per year; the aggregate's generated column name
# "avg(AverageTemperature)" is renamed to "AvgTemperature".
df_yearly_avg = (
    df_clean.groupBy("Year")
    .agg({"AverageTemperature": "avg"})
    .withColumnRenamed("avg(AverageTemperature)", "AvgTemperature")
)

In [17]:
from pyspark.ml.feature import VectorAssembler

# "Year" is the only predictor; pack it into a single vector column named
# "features".
feature_columns = ["Year"]
assembler = VectorAssembler(outputCol="features", inputCols=feature_columns)
data = assembler.transform(df_yearly_avg)

# Keep just the feature vector and the label column for modelling.
final_data = data.select(["features", "AvgTemperature"])


In [18]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder

# 80/20 train/test partition of the assembled data, with a fixed seed.
split_weights = [0.8, 0.2]
train_data, test_data = final_data.randomSplit(split_weights, seed=42)


In [30]:
from pyspark.ml.regression import LinearRegression, DecisionTreeRegressor, RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator

# Fit a linear regression on the training split and predict on the test split.
lr = LinearRegression(labelCol="AvgTemperature")
lr_model = lr.fit(train_data)
predictions = lr_model.transform(test_data)

# One evaluator per metric, each comparing "prediction" with "AvgTemperature".
rmse_evaluator, mae_evaluator, mse_evaluator, r2_evaluator = (
    RegressionEvaluator(
        labelCol="AvgTemperature",
        predictionCol="prediction",
        metricName=metric_name,
    )
    for metric_name in ("rmse", "mae", "mse", "r2")
)

# Score the test-split predictions with each evaluator, in the same order.
rmse, mae, mse, r2 = (
    evaluator.evaluate(predictions)
    for evaluator in (rmse_evaluator, mae_evaluator, mse_evaluator, r2_evaluator)
)

# Report the four metrics, one per line.
print(
    f"RMSE: {rmse}",
    f"MAE: {mae}",
    f"MSE: {mse}",
    f"R-squared (R²): {r2}",
    sep="\n",
)


RMSE: 2.192565463495532
MAE: 1.6671595702598943
MSE: 4.8073433117133755
R-squared (R²): 0.7098341994798378


In [31]:
# Fit a decision-tree regressor on the training split and predict on the
# test split.
dt = DecisionTreeRegressor(labelCol="AvgTemperature")
dt_model = dt.fit(train_data)
dt_predictions = dt_model.transform(test_data)

# RMSE, MAE, MSE and R-squared of the tree's predictions, each computed by a
# fresh evaluator configured for that metric.
dt_rmse, dt_mae, dt_mse, dt_r2 = (
    RegressionEvaluator(
        labelCol="AvgTemperature",
        predictionCol="prediction",
        metricName=metric_name,
    ).evaluate(dt_predictions)
    for metric_name in ("rmse", "mae", "mse", "r2")
)

print(f"Decision Tree Regressor - RMSE: {dt_rmse}, MAE: {dt_mae}, MSE: {dt_mse}, R-squared: {dt_r2}")


Decision Tree Regressor - RMSE: 1.642263513023361, MAE: 0.8966262710284, MSE: 2.697029446207831, R-squared: 0.8372103555869354


In [32]:
# Train and evaluate Random Forest Regressor model.
# A random forest draws bootstrap samples, so it is seeded explicitly (same
# value as the train/test split) to keep the fitted model and the metrics
# below repeatable across kernel restarts instead of relying on the
# library's default seed.
rf = RandomForestRegressor(labelCol="AvgTemperature", seed=42)
rf_model = rf.fit(train_data)
rf_predictions = rf_model.transform(test_data)

# Evaluate Random Forest Regressor model: RMSE, MAE, MSE and R-squared of the
# test-split predictions, each from an evaluator configured for that metric.
rf_rmse, rf_mae, rf_mse, rf_r2 = (
    RegressionEvaluator(
        labelCol="AvgTemperature",
        predictionCol="prediction",
        metricName=metric_name,
    ).evaluate(rf_predictions)
    for metric_name in ("rmse", "mae", "mse", "r2")
)

print(f"Random Forest Regressor - RMSE: {rf_rmse}, MAE: {rf_mae}, MSE: {rf_mse}, R-squared: {rf_r2}")


Random Forest Regressor - RMSE: 1.6512963997681422, MAE: 0.9011519160672329, MSE: 2.7267797998872285, R-squared: 0.8354146579153947
