In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split     # train_test_split- It is used for splitting a dataset into training and testing subsets
from sklearn.ensemble import RandomForestRegressor    #  RandomForestRegressor- It is used for regression tasks, which means it is employed to predict continuous numerical values (e.g., predicting temperature, stock prices, or sales figures)
from sklearn.metrics import mean_squared_error        # mean_squared_error- It measures the average of the squared differences between the predicted values and the actual values. A lower MSE indicates that the model's predictions are closer to the actual values, while a higher MSE suggests that the model's predictions are less accurate
import matplotlib.pyplot as plt


In [None]:
# Load the weather dataset from the Colab working directory and display it.
weather_csv_path = "/content/GlobalWeatherRepository.csv"
data = pd.read_csv(weather_csv_path)
data

Unnamed: 0,country,location_name,latitude,longitude,timezone,last_updated_epoch,last_updated,temperature_celsius,temperature_fahrenheit,condition_text,...,air_quality_PM2.5,air_quality_PM10,air_quality_us-epa-index,air_quality_gb-defra-index,sunrise,sunset,moonrise,moonset,moon_phase,moon_illumination
0,Afghanistan,Kabul,34.52,69.18,Asia/Kabul,1693301400,2023-08-29 14:00,28.8,83.8,Sunny,...,7.9,11.1,1,1,05:24 AM,06:24 PM,05:39 PM,02:48 AM,Waxing Gibbous,93
1,Albania,Tirana,41.33,19.82,Europe/Tirane,1693301400,2023-08-29 11:30,27.0,80.6,Partly cloudy,...,28.2,29.6,2,3,06:04 AM,07:19 PM,06:50 PM,03:25 AM,Waxing Gibbous,93
2,Algeria,Algiers,36.76,3.05,Africa/Algiers,1693301400,2023-08-29 10:30,28.0,82.4,Partly cloudy,...,6.4,7.9,1,1,06:16 AM,07:21 PM,06:46 PM,03:50 AM,Waxing Gibbous,93
3,Andorra,Andorra La Vella,42.50,1.52,Europe/Andorra,1693301400,2023-08-29 11:30,10.2,50.4,Sunny,...,0.5,0.8,1,1,07:16 AM,08:34 PM,08:08 PM,04:38 AM,Waxing Gibbous,93
4,Angola,Luanda,-8.84,13.23,Africa/Luanda,1693301400,2023-08-29 10:30,25.0,77.0,Partly cloudy,...,139.6,203.3,4,10,06:11 AM,06:06 PM,04:43 PM,04:41 AM,Waxing Gibbous,93
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11101,Venezuela,Caracas,10.50,-66.92,America/Caracas,1698093900,2023-10-23 16:45,36.0,96.8,Partly cloudy,...,62.1,62.6,3,8,06:17 AM,06:07 PM,02:16 PM,01:16 AM,Waxing Gibbous,60
11102,Vietnam,Hanoi,21.03,105.85,Asia/Bangkok,1698093900,2023-10-24 03:45,23.0,73.4,Partly cloudy,...,252.5,319.8,6,10,05:55 AM,05:27 PM,01:40 PM,No moonset,Waxing Gibbous,60
11103,Yemen,Sanaa,15.35,44.21,Asia/Aden,1698093900,2023-10-23 23:45,14.8,58.6,Clear,...,22.7,98.7,2,2,05:57 AM,05:39 PM,01:45 PM,12:23 AM,Waxing Gibbous,60
11104,Zambia,Lusaka,-15.42,28.28,Africa/Lusaka,1698093900,2023-10-23 22:45,26.0,78.8,Clear,...,16.8,24.8,2,2,05:35 AM,06:07 PM,01:02 PM,01:25 AM,Waxing Gibbous,60


Data Preprocessing

In [7]:
# Parse the timestamp column and promote it to the index.
# Guarded so the cell can be re-run: after the first run `last_updated`
# is the index rather than a column, and an unguarded lookup would raise KeyError.
if 'last_updated' in data.columns:
    data['last_updated'] = pd.to_datetime(data['last_updated'])
    data = data.set_index('last_updated')

# Derive the calendar features from the datetime index (same values the
# original read via the column's `.dt` accessor before indexing).
data['Year'] = data.index.year
data['Month'] = data.index.month
data['Day'] = data.index.day

# **Define Features and target variable**

In [8]:
# Feature matrix: the three calendar columns derived from the timestamp.
x = data.loc[:, ['Year', 'Month', 'Day']]
# Target vector: the observed temperature in degrees Celsius.
y = data.loc[:, 'temperature_celsius']

# Split the data into training and testing sets

In [9]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=42,
    test_size=0.2,
)

In [12]:
# Display the four pieces of the split as a single tuple.
(
    x_train,
    x_test,
    y_train,
    y_test,
)

(                     Year  Month  Day
 last_updated                         
 2023-10-08 00:15:00  2023     10    8
 2023-10-23 03:30:00  2023     10   23
 2023-09-05 01:30:00  2023      9    5
 2023-09-24 05:15:00  2023      9   24
 2023-09-17 00:30:00  2023      9   17
 ...                   ...    ...  ...
 2023-09-26 17:00:00  2023      9   26
 2023-09-23 23:15:00  2023      9   23
 2023-09-25 02:00:00  2023      9   25
 2023-09-02 02:30:00  2023      9    2
 2023-10-04 23:15:00  2023     10    4
 
 [8884 rows x 3 columns],
                      Year  Month  Day
 last_updated                         
 2023-10-12 18:00:00  2023     10   12
 2023-10-17 22:45:00  2023     10   17
 2023-09-14 10:30:00  2023      9   14
 2023-09-09 02:00:00  2023      9    9
 2023-10-15 00:00:00  2023     10   15
 ...                   ...    ...  ...
 2023-09-22 03:30:00  2023      9   22
 2023-10-10 23:00:00  2023     10   10
 2023-10-19 22:45:00  2023     10   19
 2023-08-29 22:45:00  2023      8   

# Train a machine learning model (Random Forest Regressor in this example)

In [14]:
# Random forest of 100 trees; the fixed seed makes training reproducible.
model = RandomForestRegressor(
    random_state=42,
    n_estimators=100,
)
# Fit on the training split (left as the last expression so the fitted
# estimator is still shown as the cell output).
model.fit(X=x_train, y=y_train)

# Make prediction

In [15]:
# Predict temperatures for the held-out test features.
y_pred = model.predict(X=x_test)

In [17]:
# Display the predicted temperatures for the test split.
y_pred

array([21.55478557, 19.74735324, 26.05710439, ..., 21.40658807,
       25.28071831, 22.14331968])

# Calculate Mean Squared Error

In [18]:
# Mean squared error between the held-out targets and the model's predictions.
mse = mean_squared_error(y_test, y_pred)
# Use a real f-string: the original passed `{mse}` as a separate argument,
# which builds a one-element set and prints it with braces.
print(f"Mean Squared Error: {mse}")

Mean Squared Error: {54.039344445110046}


# Visualize actual vs. predicted temperature

In [27]:
# Pair actual and predicted values by timestamp, then sort chronologically.
# The random train/test split leaves the test rows in shuffled order, so
# plotting them as-is draws lines that jump back and forth in time.
# `.to_numpy()` keeps the pairing positional; the index contains repeated
# timestamps, so label-based alignment is avoided.
comparison = pd.DataFrame(
    {"actual": y_test.to_numpy(), "predicted": y_pred},
    index=x_test.index,
).sort_index(kind="stable")

# A 100x20 inch canvas is far larger than needed and slow to render
# (presumably why the original output was hidden); use a readable size.
fig, ax = plt.subplots(figsize=(16, 6))
ax.plot(comparison.index, comparison["actual"], label="Actual Temperature")
ax.plot(comparison.index, comparison["predicted"], label="Predicted Temperature", linestyle='--')
ax.set_xlabel('last_updated')
ax.set_ylabel('temperature_celsius')
ax.set_title('Actual vs. Predicted Temperature (test set)')
ax.legend()
plt.show()

Output hidden; open in https://colab.research.google.com to view.

# conclusion

In [None]:
# 1) In this simplified example, we built a weather forecasting model that predicts temperature from the weather data, using only date-derived features (year, month, day).
# 2) We performed data preprocessing, trained a Random Forest Regressor, and evaluated the model's performance with the mean squared error.
# 3) In a real-world project, you would need more features, more extensive data, and more advanced models for accurate forecasting.
# 4) Additionally, ethical data handling, domain expertise, and real-time data are crucial for a practical weather forecasting system.