# Importing the dataset

In [1]:

import pandas as pd

# Load the dataset
file_path = "Climate Data RF & Temp 2012 to 2018.xlsx"
df = pd.read_excel(file_path)

df

Unnamed: 0,Station,LON,LAT,Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec
0,Kitgum,32.8830,3.300,2012,0.00,0.90,8.70,267.60,110.00,84.70,140.00,175.60,161.6,220.1,63.1,30.3
1,Lira,32.9000,2.250,2012,0.00,7.40,23.90,319.50,135.50,86.30,99.70,255.30,278.9,196.1,147.8,93.6
2,Kasese,30.1000,0.183,2012,0.70,22.80,18.50,302.80,124.10,13.80,22.90,36.60,51.3,276.9,93.2,123.1
3,Gulu,32.2830,2.783,2012,0.00,13.60,10.00,217.00,218.80,201.40,106.00,198.60,401.7,214.7,111.4,52.6
4,Arua,30.9170,3.050,2012,0.00,4.30,18.30,142.80,110.80,109.10,355.50,181.70,226.0,336.7,56.4,47.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128,Mubende,31.3666,0.580,2018,7.73,72.50,194.88,125.25,134.52,40.36,36.03,51.96,92.9,142.2,71.2,173.8
129,Kibanda,31.2200,-0.520,2018,28.30,71.60,121.50,186.40,67.80,23.60,0.60,42.50,48.2,88.3,142.0,165.6
130,Buginyanya,34.3670,1.283,2018,22.30,41.20,464.90,371.00,295.70,325.40,151.90,166.70,243.6,167.5,79.4,38.2
131,Kyembogo,30.3330,0.683,2018,18.40,48.53,177.93,183.00,125.40,124.80,49.80,0.00,155.6,177.3,211.8,153.8


## Checking for missing values

In [2]:

missing_values = df.isna().sum()
print("Missing values:\n", missing_values)


Missing values:
 Station    0
LON        0
LAT        0
Year       0
Jan        0
Feb        0
Mar        0
Apr        1
May        0
Jun        0
Jul        0
Aug        0
Sep        0
Oct        0
Nov        0
Dec        0
dtype: int64


## Dropping unnecessary columns

In [3]:

df = df.drop(columns=['LON', 'LAT', 'Year'])


In [4]:
df

Unnamed: 0,Station,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec
0,Kitgum,0.00,0.90,8.70,267.60,110.00,84.70,140.00,175.60,161.6,220.1,63.1,30.3
1,Lira,0.00,7.40,23.90,319.50,135.50,86.30,99.70,255.30,278.9,196.1,147.8,93.6
2,Kasese,0.70,22.80,18.50,302.80,124.10,13.80,22.90,36.60,51.3,276.9,93.2,123.1
3,Gulu,0.00,13.60,10.00,217.00,218.80,201.40,106.00,198.60,401.7,214.7,111.4,52.6
4,Arua,0.00,4.30,18.30,142.80,110.80,109.10,355.50,181.70,226.0,336.7,56.4,47.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
128,Mubende,7.73,72.50,194.88,125.25,134.52,40.36,36.03,51.96,92.9,142.2,71.2,173.8
129,Kibanda,28.30,71.60,121.50,186.40,67.80,23.60,0.60,42.50,48.2,88.3,142.0,165.6
130,Buginyanya,22.30,41.20,464.90,371.00,295.70,325.40,151.90,166.70,243.6,167.5,79.4,38.2
131,Kyembogo,18.40,48.53,177.93,183.00,125.40,124.80,49.80,0.00,155.6,177.3,211.8,153.8


## Imputation of missing value with mode

In [5]:
#Imputation of missing value with mode

# Identify the row with missing value
missing_row = df[df['Apr'].isna()]

# Get the station and month for the missing value
missing_station = missing_row['Station'].values[0]

# Calculate mode for April rainfall for the specific station
mode_rainfall = df[df['Station'] == missing_station]['Apr'].mode()[0]

# Impute the missing value with the mode
df.loc[missing_row.index, 'Apr'] = mode_rainfall


In [6]:
# Confirming that no more missing values
missing_values = df.isna().sum()
print("Missing values:\n", missing_values)


Missing values:
 Station    0
Jan        0
Feb        0
Mar        0
Apr        0
May        0
Jun        0
Jul        0
Aug        0
Sep        0
Oct        0
Nov        0
Dec        0
dtype: int64


## Feature Engineering

In [7]:

# Define features (X) and labels (y)
X = df[['Station']]
y = df[['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']]
X

Unnamed: 0,Station
0,Kitgum
1,Lira
2,Kasese
3,Gulu
4,Arua
...,...
128,Mubende
129,Kibanda
130,Buginyanya
131,Kyembogo


In [8]:
y

Unnamed: 0,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec
0,0.00,0.90,8.70,267.60,110.00,84.70,140.00,175.60,161.6,220.1,63.1,30.3
1,0.00,7.40,23.90,319.50,135.50,86.30,99.70,255.30,278.9,196.1,147.8,93.6
2,0.70,22.80,18.50,302.80,124.10,13.80,22.90,36.60,51.3,276.9,93.2,123.1
3,0.00,13.60,10.00,217.00,218.80,201.40,106.00,198.60,401.7,214.7,111.4,52.6
4,0.00,4.30,18.30,142.80,110.80,109.10,355.50,181.70,226.0,336.7,56.4,47.2
...,...,...,...,...,...,...,...,...,...,...,...,...
128,7.73,72.50,194.88,125.25,134.52,40.36,36.03,51.96,92.9,142.2,71.2,173.8
129,28.30,71.60,121.50,186.40,67.80,23.60,0.60,42.50,48.2,88.3,142.0,165.6
130,22.30,41.20,464.90,371.00,295.70,325.40,151.90,166.70,243.6,167.5,79.4,38.2
131,18.40,48.53,177.93,183.00,125.40,124.80,49.80,0.00,155.6,177.3,211.8,153.8


In [9]:
from sklearn.preprocessing import StandardScaler

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit the scaler to the original target variable and transform it
y_standardized = scaler.fit_transform(y)

# Print the first 5 rows of the standardized target variable
print(y_standardized[:5])


[[-0.80433467 -1.02048163 -1.48978504  1.078935   -0.4274652  -0.1067041
   0.72395904  0.67043928  0.30009878  0.8039115  -1.03276693 -0.52514265]
 [-0.80433467 -0.83771821 -1.27542641  1.74610792 -0.11868161 -0.08273274
   0.18608188  1.67721135  1.88713723  0.50922335  0.13735782  0.38173772]
 [-0.78125558 -0.40470947 -1.35158013  1.53142993 -0.25672603 -1.16893509
  -0.83895451 -1.08541162 -1.1922315   1.50134014 -0.61693747  0.8043755 ]
 [-0.80433467 -0.66339002 -1.47145173  0.42847354  0.89001145  1.64170713
   0.27016689  0.96097576  3.54858926  0.73760667 -0.36550571 -0.20565714]
 [-0.80433467 -0.9248823  -1.35440064 -0.52536521 -0.41777787  0.25885917
   3.60020047  0.74749461  1.17141401  2.23560479 -1.12532697 -0.28302135]]


## Encoding categorical Feature values

In [10]:
# Perform one-hot encoding on the 'Station' column in X
X_encoded = pd.get_dummies(X, columns=['Station'], prefix='Station')


# Remove extra spaces from column names in X_encoded
X_encoded.columns = X_encoded.columns.str.strip()

# Display the updated column names
print(X_encoded.columns)


# Display the encoded feature set
print("\nEncoded Feature Set (X):")
print(X_encoded.head())



Index(['Station_Arua', 'Station_Buginyanya', 'Station_Entebbe', 'Station_Gulu',
       'Station_Jinja', 'Station_Kabale', 'Station_Kampala', 'Station_Kasese',
       'Station_Kibanda', 'Station_Kitgum', 'Station_Kotido',
       'Station_Kyembogo', 'Station_Lira', 'Station_Masindi',
       'Station_Mbarara', 'Station_Mubende', 'Station_Soroti',
       'Station_Tororo', 'Station_Wadelai'],
      dtype='object')

Encoded Feature Set (X):
   Station_Arua  Station_Buginyanya  Station_Entebbe  Station_Gulu  \
0         False               False            False         False   
1         False               False            False         False   
2         False               False            False         False   
3         False               False            False          True   
4          True               False            False         False   

   Station_Jinja  Station_Kabale  Station_Kampala  Station_Kasese  \
0          False           False            False           False   
1 

## Model training and evaluation

In [11]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score
from sklearn.utils import shuffle

In [12]:
# Shuffle the dataset
X_encoded_shuffled, y_shuffled = shuffle(X_encoded, y_standardized, random_state=42)

# Split the shuffled dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded_shuffled, y_shuffled, test_size=0.2, random_state=42)


## Linear Regression Model

In [13]:
from sklearn.linear_model import LinearRegression

# Define the linear regression model
linear_reg_model = LinearRegression()

#fit
linear_reg_model.fit(X_train, y_train)

# Perform k-fold cross-validation
cv_scores = cross_val_score(linear_reg_model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
cv_rmse_scores = np.sqrt(-cv_scores)
print("Cross-Validation RMSE Scores:", cv_rmse_scores)
print("Mean Cross-Validation RMSE:", cv_rmse_scores.mean())

Cross-Validation RMSE Scores: [0.8658936  0.84390333 0.9313715  1.09687667 0.96951324]
Mean Cross-Validation RMSE: 0.9415116665133002


## RidgeCV model

In [14]:
from sklearn.linear_model import RidgeCV

# Define the Ridge regression model with built-in cross-validation
ridge_cv_model = RidgeCV(alphas=[0.1, 1.0, 2.0, 3.0, 5.0, 6.0, 7.0, 9.0, 10.0], cv=5)

# Train the model on the training data
ridge_cv_model.fit(X_train, y_train,)

# Perform k-fold cross-validation
cv_scores_ridge = cross_val_score(ridge_cv_model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
cv_rmse_scores_ridge = np.sqrt(-cv_scores_ridge)
print("Ridge Regression Cross-Validation RMSE Scores:", cv_rmse_scores_ridge)
print("Mean Ridge Regression Cross-Validation RMSE:", cv_rmse_scores_ridge.mean())

print("Best Alpha value:", ridge_cv_model.alpha_)



Ridge Regression Cross-Validation RMSE Scores: [0.85790283 0.80087526 0.87736771 1.08864118 0.90667097]
Mean Ridge Regression Cross-Validation RMSE: 0.9062915916415628
Best Alpha value: 3.0


## DecisionTreeRegressor

In [15]:
from sklearn.tree import DecisionTreeRegressor

# Define the Decision Trees model
decision_tree_model = DecisionTreeRegressor(random_state=42)

#fit
decision_tree_model.fit(X_train, y_train)

# Perform k-fold cross-validation
decision_tree_cv_scores = cross_val_score(decision_tree_model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
decision_tree_cv_rmse_scores = np.sqrt(-decision_tree_cv_scores)

# Print cross-validation results
print("Decision Trees Cross-Validation RMSE Scores:", decision_tree_cv_rmse_scores)
print("Mean Decision Trees Cross-Validation RMSE:", decision_tree_cv_rmse_scores.mean())


Decision Trees Cross-Validation RMSE Scores: [0.85555257 0.84390333 0.9313715  1.09687667 0.96951324]
Mean Decision Trees Cross-Validation RMSE: 0.9394434607925284


## RandomForestRegressor

In [16]:
from sklearn.ensemble import RandomForestRegressor

# Define the Random Forest model
random_forest_model = RandomForestRegressor(random_state=42)

random_forest_model.fit(X_train, y_train,)

# Perform k-fold cross-validation
random_forest_cv_scores = cross_val_score(random_forest_model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
random_forest_cv_rmse_scores = np.sqrt(-random_forest_cv_scores)

# Print cross-validation results
print("Random Forest Cross-Validation RMSE Scores:", random_forest_cv_rmse_scores)
print("Mean Random Forest Cross-Validation RMSE:", random_forest_cv_rmse_scores.mean())


Random Forest Cross-Validation RMSE Scores: [0.85429178 0.83926772 0.93109006 1.09465439 0.95861835]
Mean Random Forest Cross-Validation RMSE: 0.9355844605596115


## Multi-Layer-Perceptron Regressor

In [17]:
from sklearn.neural_network import MLPRegressor

# Define the MLPRegressor model
mlp_regressor_model = MLPRegressor(random_state=42)

#fit
mlp_regressor_model.fit(X_train, y_train)

# Perform k-fold cross-validation
mlp_cv_scores = cross_val_score(mlp_regressor_model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
mlp_cv_rmse_scores = np.sqrt(-mlp_cv_scores)

# Print cross-validation results
print("ANN Cross-Validation RMSE Scores:", mlp_cv_rmse_scores)
print("Mean ANN Cross-Validation RMSE:", mlp_cv_rmse_scores.mean())


ANN Cross-Validation RMSE Scores: [0.84652781 0.83201524 0.91507821 1.07923566 0.95762473]
Mean ANN Cross-Validation RMSE: 0.9260963298581675


## KNeighborsRegressor

In [18]:
from sklearn.neighbors import KNeighborsRegressor

# Define the KNN model
knn_model = KNeighborsRegressor()

# Perform k-fold cross-validation
knn_cv_scores = cross_val_score(knn_model,X_train, y_train, cv=5, scoring='neg_mean_squared_error')
knn_cv_rmse_scores = np.sqrt(-knn_cv_scores)

# Print cross-validation results
print("KNN Cross-Validation RMSE Scores:", knn_cv_rmse_scores)
print("Mean KNN Cross-Validation RMSE:", knn_cv_rmse_scores.mean())


KNN Cross-Validation RMSE Scores: [0.86223166 0.84477247 0.95403382 1.10348119 0.94585265]
Mean KNN Cross-Validation RMSE: 0.9420743577505574


## Analyzing these results,
- we can see that the Ridge Regression model has the lowest mean cross-validation RMSE of 63.49, followed closely by the Decision Tree and Random Forest models. These models generally outperform Linear Regression and KNN.
- However, the Artificial Neural Network (MLPRegressor) has a significantly higher RMSE compared to other models, indicating poorer performance in this particular task.

- Therefore, based on these results, the Ridge Regression model with an alpha value of 1.0 seems to be the best-performing model for the rainfall prediction task among the ones tested. 
- It offers a good balance between model complexity (regularization) and predictive accuracy.

## Model selection
- Based on the analysis of the cross-validation RMSE scores, the Ridge Regression model with an alpha value of 1.0 appears to be the best-performing model for the rainfall prediction task among the ones tested. 
- Therefore, we have selected the Ridge Regression model as our final choice for predicting rainfall.

## Define the Final Ridge regression model with built-in cross-validation

In [19]:

ridge_cv_modelfinal = RidgeCV(alphas= [3.0], cv=5)  # Setting alpha value to 1.0

# Train the model on the entire dataset
ridge_cv_modelfinal.fit(X_encoded_shuffled, y_shuffled)


# Perform k-fold cross-validation
cv_scores_ridge = cross_val_score(ridge_cv_modelfinal, X_encoded_shuffled, y_shuffled, cv=5, scoring='neg_mean_squared_error')
cv_rmse_scores_ridge = np.sqrt(-cv_scores_ridge)
print("Ridge Regression Cross-Validation RMSE Scores:", cv_rmse_scores_ridge)
print("Mean Ridge Regression Cross-Validation RMSE:", cv_rmse_scores_ridge.mean())



Ridge Regression Cross-Validation RMSE Scores: [1.09577105 0.82711863 0.84500301 0.89853547 0.87057712]
Mean Ridge Regression Cross-Validation RMSE: 0.9074010586701491


## Final Random Forest

In [20]:
from sklearn.ensemble import RandomForestRegressor

# Define the Random Forest model
random_forest_final_model = RandomForestRegressor(random_state=42)

random_forest_final_model.fit(X_encoded_shuffled, y_shuffled)

# Perform k-fold cross-validation
random_forest_cv_scores = cross_val_score(random_forest_final_model, X_encoded_shuffled, y_shuffled, cv=5, scoring='neg_mean_squared_error')
random_forest_cv_rmse_scores = np.sqrt(-random_forest_cv_scores)

# Print cross-validation results
print("Random Forest Cross-Validation RMSE Scores:", random_forest_cv_rmse_scores)
print("Mean Random Forest Cross-Validation RMSE:", random_forest_cv_rmse_scores.mean())

Random Forest Cross-Validation RMSE Scores: [1.11278475 0.86619524 0.8479061  0.94138852 0.89394201]
Mean Random Forest Cross-Validation RMSE: 0.9324433217458035


In [26]:

def predict_rainfall(station, month, model, scaler, encoded_columns):
    # Strip any leading/trailing spaces from station name
    station = station.strip()
    
    # Create a feature vector with one-hot encoded station
    feature_vector = np.zeros(len(encoded_columns))
    feature_vector[encoded_columns.get_loc(f'Station_{station}')] = 1
    
    # Predict the rainfall using the model
    rainfall_prediction_standardized = model.predict([feature_vector])
    
    # Inverse transform the predicted rainfall to the original scale
    rainfall_prediction_original_scale = scaler.inverse_transform(rainfall_prediction_standardized)

    
    # Return the predicted rainfall value for the specific month
    print(f'{month}: {rainfall_prediction_original_scale[0][encode_month(month)]} mm')



Apr: 182.2005939849624 mm




In [22]:
def encode_month(month):
    # Define a dictionary to map month names to numerical values
    month_mapping = {
        'Jan': 0, 'Feb': 1, 'Mar': 2, 'Apr': 3, 'May': 4, 'Jun': 5,
        'Jul': 6, 'Aug': 7, 'Sep': 8, 'Oct': 9, 'Nov': 10, 'Dec': 11
    }
    
    # Return the numerical value corresponding to the month
    return month_mapping.get(month, 0)  # Return 0 if month not found in dictionary


In [24]:
# predicting rainfall forspecific months
predict_rainfall("Kitgum", "Apr", ridge_cv_modelfinal,scaler, X_encoded.columns)
predict_rainfall("Kitgum", "Apr", mlp_regressor_model, scaler, X_encoded.columns)




ValueError: non-broadcastable output operand with shape (12,1) doesn't match the broadcast shape (12,12)