In [None]:
import pandas as pd
import pickle

In [None]:
# Load the pickled weather DataFrame and preview its first rows.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
weather_pickle_path = "weather_data.pkl"
table = pd.read_pickle(weather_pickle_path)
table.head(n=5)

In [None]:
# Remove the `_id` column. errors="ignore" keeps this cell idempotent, so
# re-running it after the column is already gone does not raise KeyError.
table = table.drop(columns='_id', errors='ignore')

## EDA

### Basic, manual inspection

In [None]:
# (rows, columns) of the table after the `_id` column was dropped.
table.shape

In [None]:
# Number of missing values in each column.
table.isna().sum()

In [None]:
# Data type of every column.
table.dtypes

In [None]:
# Pairwise correlation between the numeric columns.
# numeric_only=True is explicit because pandas >= 2.0 no longer drops
# non-numeric columns (e.g. `datetime_local`) silently and raises instead.
corr_matrix = table.corr(numeric_only=True)

# corr() already returns a DataFrame, so no re-wrapping is needed; `corr` is
# kept as an alias of the same matrix so the name stays defined.
corr = corr_matrix
corr

In [None]:
# Correlation of every column with `temperature`, strongest positive first.
temperature_corr = corr_matrix['temperature']
temperature_corr.sort_values(ascending=False)

### With sweetviz

In [None]:
pip install sweetviz

In [None]:
import sweetviz

# Build the automated EDA report for the whole table.
a = sweetviz.analyze(table)

# Write the report to an HTML file; the path is relative, so it is presumably
# stored in the notebook's working directory.
a.show_html(filepath="report.html")

##### Want more tools like this?
CHECK OUT -- https://www.linkedin.com/posts/kaushal-sahu_dataanalysis-datascience-eda-activity-7125097318911016960-9rH-?utm_source=share&utm_medium=member_desktop

## Graph Analysis

#### Temperature column only (to keep it simple and informative)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Pass the series by keyword: a positional argument is interpreted as `x` by
# older seaborn releases and as `data` by newer ones, which changes the plot
# orientation depending on the installed version.
ax = sns.boxplot(x=table['temperature'])
ax.set_title("Box Plot of Temperature Data");

In [None]:
# Same box plot drawn with matplotlib's explicit figure/axes interface.
fig, ax = plt.subplots()
ax.boxplot(table['temperature'])

# Title and axis label so the figure stands on its own
ax.set_title("Box Plot of Temperature Data")
ax.set_xlabel("temperature")

plt.show()

#### Distribution Plot

In [None]:
# `sns.distplot` is deprecated; `sns.displot` is its figure-level replacement.
# Each call creates its own figure, so the three variants no longer overlay on
# a single axes as they did with three consecutive distplot calls.

# Default bins, density-normalised histogram with the KDE line
sns.displot(table['temperature'], kde=True, stat="density").set(
    title="Temperature distribution"
)

# 30 bins to get more bars
sns.displot(table['temperature'], bins=30, kde=True, stat="density").set(
    title="Temperature distribution (30 bins)"
)

# Histogram only (counts), without the KDE line
sns.displot(table['temperature'], kde=False).set(
    title="Temperature distribution (no KDE)"
);

## Filtering and Sorting Data

##### Only 3 columns are filtered here as a demonstration. You can enable all of them (depending on your machine's memory/GPU capacity).

In [None]:
# Keep only rows whose values lie inside plausible ranges.
# Start from the full table so `data` is always defined and every filter below
# can be enabled or disabled independently. Previously the only line that
# created `data` was commented out, so this cell raised NameError on a fresh
# kernel.
data = table

## latitude
# data = data.query('latitude >= 5 and latitude <= 40')

## longitude
# data = data.query('longitude >= 65 and longitude <= 100')

## temperature
data = data.query('temperature >= -10 and temperature <= 40')

## pressure
# data = data.query('pressure >= 1000 and pressure <= 1020')

## precipitation
data = data.query('precipitation >= 0 and precipitation <= 100')

## sw_down
# data = data.query('sw_down >= 0 and sw_down <= 1000')

## humidity
data = data.query('humidity >= 0 and humidity <= 100')

## cloud_cover
# data = data.query('cloud_cover >= 0.0 and cloud_cover <= 1.0')

In [None]:
## Order the rows by location first, then by local timestamp within a location
sort_keys = ['latitude', 'longitude', 'datetime_local']
data = data.sort_values(by=sort_keys)

In [None]:
## Keep only the hour of the local timestamp as `Temp_Hour`, then discard the
## timestamp column itself
data = (
    data
    .assign(Temp_Hour=lambda frame: frame['datetime_local'].dt.hour)
    .drop(columns="datetime_local")
)

In [None]:
## Taking the rolling mean of required columns

# The rows are ordered by (latitude, longitude, time), so a plain rolling
# window would mix the last observations of one location with the first
# observations of the next. Rolling is therefore done per location.
location_groups = data.groupby(['latitude', 'longitude'], sort=False)

# `temperature` is the prediction target: shift by one step first so the window
# covers the 7 previous observations only and never the value being predicted.
data["rolling_temperature"] = location_groups['temperature'].transform(
    lambda series: series.shift(1).rolling(window=7).mean()
)
# data["rolling_pressure"] = location_groups['pressure'].transform(
#     lambda series: series.rolling(window=7).mean()
# )
# data["rolling_sw_down"] = location_groups['sw_down'].transform(
#     lambda series: series.rolling(window=7).mean()
# )
data["rolling_humidity"] = location_groups['humidity'].transform(
    lambda series: series.rolling(window=7).mean()
)

In [None]:
## Taking lag values of required columns

# Shift within each location so the first row of a location does not receive
# the last value of the previous location (rows are ordered by location, then
# time). The first row of every location gets NaN instead.
location_groups = data.groupby(['latitude', 'longitude'], sort=False)

data['temperature_lag_1'] = location_groups['temperature'].shift(1)
# data['pressure_lag_1'] = location_groups['pressure'].shift(1)
# data['sw_down_lag_1'] = location_groups['sw_down'].shift(1)
data['humidity_lag_1'] = location_groups['humidity'].shift(1)

#### Why I used rolling, lag, temp_hour
##### HAVE A GLANCE - https://www.analyticsvidhya.com/blog/2019/12/6-powerful-feature-engineering-techniques-time-series/

## Sampling Data

In [None]:
## Drop the rows that contain NaN (e.g. those produced by the rolling and lag
## features); reassigning instead of inplace=True avoids mutating in place
data = data.dropna(axis=0)

## Sampling 10% of the rows; the fixed random_state makes the sample (and
## every score computed from it) reproducible across runs
data = data.sample(frac=0.1, random_state=30)

## Splitting into features (x) and target (y)
x = data.drop(columns="temperature")
y = data["temperature"]

## Feature selection using PCA or RFECV

### PCA

In [None]:
from sklearn.decomposition import PCA

# Project the feature matrix `x` onto its first 5 principal components.
# NOTE(review): this assumes `x` is fully numeric and has at least 5 columns;
# the features are not standardised first -- confirm that is intended.
pca = PCA(n_components = 5)
pca_x = pca.fit_transform(x)

In [None]:
## Just for learning purposes

## Recursive feature elimination with cross-validation (RFECV)

from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFECV
from sklearn.model_selection import train_test_split

## Splitting data first, so the feature selection never sees the test rows
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.4, random_state=30
)

## model
rdforest = RandomForestRegressor(n_estimators=50, random_state=0)

## making the RFECV model and fitting it on the training split only
rfecv = RFECV(rdforest, step=1, cv=3)
rfecv.fit(x_train, y_train)

## Columns kept by RFECV. Previously the result of the selection was discarded
## and the forest was trained on every column.
selected_features = x_train.columns[rfecv.support_]
print("Selected features:", list(selected_features))

## Training and testing on the selected features
rdforest.fit(x_train[selected_features], y_train)
rdforest.score(x_test[selected_features], y_test)

## GridSearchCV (Auto HyperParameter Tuning)

In [None]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split

# Hold out 40% of the rows for the final evaluation.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size= 0.4, random_state= 30)

model = XGBRegressor()

# Every combination of these values is tried (3 * 3 * 3 * 3 = 81 candidates).
param_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [3, 5, 8],
    'learning_rate': [0.1, 0.05, 0.01],
    'gamma': [0, 0.1, 0.5],
}

# Create a GridSearchCV object.
# 'accuracy' is a classification metric and cannot score a regressor with a
# continuous target; use negated MSE (higher is better), which is also what
# the "Best score" label printed after the search refers to.
grid_search = GridSearchCV(model, param_grid, scoring='neg_mean_squared_error', cv=5)


## Training and Testing the data now

In [None]:
import gc

# Fit the GridSearchCV object to the training data
grid_search.fit(x_train, y_train)

# Ask the garbage collector to reclaim memory after the search
# (was `gc.collect{}`, which is a syntax error).
gc.collect();

In [None]:
# Pull the winning estimator, its hyper-parameters and its cross-validated
# score out of the finished search.
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Print one "label value" line per result.
search_summary = (
    ("Best model:", best_model),
    ("Best parameters:", best_params),
    ("Best score (neg_mean_squared_error):", best_score),
)
for label, value in search_summary:
    print(label, value)

In [None]:
# numpy was never imported in this notebook, so `np.sqrt` raised NameError.
import numpy as np
from sklearn.metrics import mean_squared_error

# Predict on the held-out test split with the best estimator from the search.
y_pred = best_model.predict(x_test)

# mean squared error (MSE)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # Calculating root mean squared error (RMSE)

print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)