# Saudi Arabia Weather Prediction

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
df = pd.read_csv('/content/weather-sa-2017-2019-clean.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,city,date,time,year,month,day,hour,minute,weather,temp,wind,humidity,barometer,visibility
0,0,Qassim,1 January 2017,00:00,2017,1,1,24,0,Clear,17,11,64%,1018.0,16
1,1,Qassim,1 January 2017,01:00,2017,1,1,1,0,Clear,17,6,64%,1018.0,16
2,2,Qassim,1 January 2017,03:00,2017,1,1,3,0,Clear,15,11,72%,1019.0,16
3,3,Qassim,1 January 2017,04:00,2017,1,1,4,0,Clear,15,11,72%,1019.0,16
4,4,Qassim,1 January 2017,05:00,2017,1,1,5,0,Clear,15,9,72%,1019.0,16


In [4]:
df.tail()

Unnamed: 0.1,Unnamed: 0,city,date,time,year,month,day,hour,minute,weather,temp,wind,humidity,barometer,visibility
249018,2848,Jawf,30 April 2019,19:00,2019,4,30,19,0,Passing clouds,32,19,14%,1014.0,-1
249019,2849,Jawf,30 April 2019,20:00,2019,4,30,20,0,Passing clouds,29,9,22%,1015.0,-1
249020,2850,Jawf,30 April 2019,21:00,2019,4,30,21,0,Passing clouds,27,7,24%,1016.0,-1
249021,2851,Jawf,30 April 2019,22:00,2019,4,30,22,0,Clear,26,0,26%,1017.0,16
249022,2852,Jawf,30 April 2019,23:00,2019,4,30,23,0,Clear,24,7,29%,1017.0,16


In [5]:
df.shape

(249023, 15)

In [6]:
df.columns

Index(['Unnamed: 0', 'city', 'date', 'time', 'year', 'month', 'day', 'hour',
       'minute', 'weather', 'temp', 'wind', 'humidity', 'barometer',
       'visibility'],
      dtype='object')

In [7]:
df['city'].unique()

array(['Qassim', 'Hail', 'Madina', 'EP', 'Riyadh', 'Mecca', 'Tabuk',
       'Assir', 'Northern boarder', 'Jazan', 'Najran', 'Baha', 'Jawf'],
      dtype=object)

In [8]:
# Drop features that are not required to build our model
df1 = df.drop(['Unnamed: 0','date','minute','time'],axis='columns')

In [9]:
df1.shape

(249023, 11)

In [10]:
df1.head()

Unnamed: 0,city,year,month,day,hour,weather,temp,wind,humidity,barometer,visibility
0,Qassim,2017,1,1,24,Clear,17,11,64%,1018.0,16
1,Qassim,2017,1,1,1,Clear,17,6,64%,1018.0,16
2,Qassim,2017,1,1,3,Clear,15,11,72%,1019.0,16
3,Qassim,2017,1,1,4,Clear,15,11,72%,1019.0,16
4,Qassim,2017,1,1,5,Clear,15,9,72%,1019.0,16


# Data Cleaning: Handle NA values

In [11]:
df1.isnull().sum()

city           0
year           0
month          0
day            0
hour           0
weather        0
temp           0
wind           0
humidity      17
barometer     72
visibility     0
dtype: int64

In [12]:
df2 = df1.dropna()
df2.isnull().sum()

city          0
year          0
month         0
day           0
hour          0
weather       0
temp          0
wind          0
humidity      0
barometer     0
visibility    0
dtype: int64

# Feature Engineering

In [13]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 248946 entries, 0 to 249022
Data columns (total 11 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   city        248946 non-null  object 
 1   year        248946 non-null  int64  
 2   month       248946 non-null  int64  
 3   day         248946 non-null  int64  
 4   hour        248946 non-null  int64  
 5   weather     248946 non-null  object 
 6   temp        248946 non-null  int64  
 7   wind        248946 non-null  int64  
 8   humidity    248946 non-null  object 
 9   barometer   248946 non-null  float64
 10  visibility  248946 non-null  int64  
dtypes: float64(1), int64(7), object(3)
memory usage: 22.8+ MB


In [14]:
city = df2['city'].value_counts(ascending=False)
city

city
Jawf                20347
Mecca               20267
Tabuk               20234
Northern boarder    20233
Hail                20114
Madina              19964
Baha                19958
Najran              19847
Jazan               19829
Qassim              19786
EP                  18459
Riyadh              16420
Assir               13488
Name: count, dtype: int64

In [15]:
weather = df2['weather'].value_counts(ascending=False)
weather

weather
Clear                                          98788
Sunny                                          82170
Passing clouds                                 34374
Scattered clouds                               15302
Partly sunny                                    6925
                                               ...  
Heavy rain  More clouds than sun                   1
Strong thunderstorms  More clouds than sun         1
Hail  Cloudy                                       1
Strong thunderstorms  Cloudy                       1
Rain  Clear                                        1
Name: count, Length: 81, dtype: int64

In [16]:
weather.values.sum()

248946

In [17]:
# Normalize text to lowercase
df2['weather'] = df2['weather'].str.lower()

# Define a function to group similar conditions
def group_weather(condition):
    if 'clear' in condition:
        return 'Clear'
    elif 'cloud' in condition:
        return 'Cloudy'
    elif 'rain' in condition or 'thunderstorm' in condition:
        return 'Rain/Storm'
    elif 'sun' in condition:
        return 'Sunny'
    else:
        return 'Other'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['weather'] = df2['weather'].str.lower()


In [18]:
# Apply the function to group weather conditions
df2['weather_grouped'] = df2['weather'].apply(group_weather)

# One-Hot Encode the grouped weather conditions
df_weather_encoded = pd.get_dummies(df2['weather_grouped'], prefix='weather', drop_first=False)

# One-Hot Encode the city column
df_city_encoded = pd.get_dummies(df2['city'], prefix='city', drop_first=False)

# Concatenate the encoded weather and city columns back to the original dataframe
df3 = pd.concat([df2, df_weather_encoded, df_city_encoded], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['weather_grouped'] = df2['weather'].apply(group_weather)


# Feature Normalization

In [19]:
# Remove '%' from humidity and convert to float
df3['humidity'] = df3['humidity'].str.replace('%', '').astype(float)

In [20]:
# Normalization/Standardization for numerical columns
scaler = StandardScaler()
df3[['wind', 'humidity', 'barometer', 'visibility']] = scaler.fit_transform(df3[['wind', 'humidity', 'barometer', 'visibility']])

In [21]:
# Dropping the original weather and city columns as they are encoded now
df3.drop(['weather', 'weather_grouped', 'city'], axis=1, inplace=True)

In [22]:
df3.head()

Unnamed: 0,year,month,day,hour,temp,wind,humidity,barometer,visibility,weather_Clear,...,city_Hail,city_Jawf,city_Jazan,city_Madina,city_Mecca,city_Najran,city_Northern boarder,city_Qassim,city_Riyadh,city_Tabuk
0,2017,1,1,24,17,-0.224487,1.120874,0.365022,0.701271,True,...,False,False,False,False,False,False,False,True,False,False
1,2017,1,1,1,17,-0.798598,1.120874,0.365022,0.701271,True,...,False,False,False,False,False,False,False,True,False,False
2,2017,1,1,3,15,-0.224487,1.459959,0.508483,0.701271,True,...,False,False,False,False,False,False,False,True,False,False
3,2017,1,1,4,15,-0.224487,1.459959,0.508483,0.701271,True,...,False,False,False,False,False,False,False,True,False,False
4,2017,1,1,5,15,-0.454132,1.459959,0.508483,0.701271,True,...,False,False,False,False,False,False,False,True,False,False


In [23]:
df3.columns

Index(['year', 'month', 'day', 'hour', 'temp', 'wind', 'humidity', 'barometer',
       'visibility', 'weather_Clear', 'weather_Cloudy', 'weather_Other',
       'weather_Rain/Storm', 'weather_Sunny', 'city_Assir', 'city_Baha',
       'city_EP', 'city_Hail', 'city_Jawf', 'city_Jazan', 'city_Madina',
       'city_Mecca', 'city_Najran', 'city_Northern boarder', 'city_Qassim',
       'city_Riyadh', 'city_Tabuk'],
      dtype='object')

In [24]:
df3.shape

(248946, 27)

# MODEL

In [27]:
# Defining features and target variable
X = df3.drop(['temp'], axis=1)
y = df3['temp']

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initializing and training the Random Forest Regressor
model = LinearRegression()
model.fit(X_train, y_train)

# Making predictions on the test set
y_pred = model.predict(X_test)

# Evaluating the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

Mean Squared Error: 16.20828621103675
R^2 Score: 0.794763007017401
