In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the raw weather observations; the relative path means the CSV must sit
# in the notebook's working directory.
weather_data = pd.read_csv("IndianWeatherRepository.csv")

In [None]:
# Summary statistics of the raw frame, using describe()'s default settings.
weather_data.describe()

In [None]:
# Keep only the numeric columns of the raw data for the quantitative analysis,
# then display their names.
weather_df = weather_data.select_dtypes(include=['number'])

cols = weather_df.columns.values
cols

In [None]:
# Removing redundant columns (the same quantity reported in a second unit,
# e.g. temperature in both Celsius and Fahrenheit).
#
# The numeric frame is rebuilt from `weather_data` and the result is assigned
# to a fresh object instead of dropping in place. This makes the cell
# idempotent: the original `inplace=True` drop raised KeyError when the cell
# was run a second time, because the columns were already gone. A misspelt
# column name still raises KeyError, exactly as before.
redundant_columns = [
    'temperature_fahrenheit',
    'wind_mph',
    'pressure_in',
    'precip_mm',
    'feels_like_fahrenheit',
    'visibility_km',
    'gust_mph',
]

weather_df = (
    weather_data
    .select_dtypes(include='number')
    .drop(columns=redundant_columns)
)
weather_df.columns

In [None]:
# One 16-bin histogram per column of weather_df; the trailing ';' suppresses
# the textual repr of the returned axes.
weather_df.hist(bins=16, figsize=(20,15));

In [None]:
# Creating a correlation matrix (Kendall rank correlation between every pair
# of columns in weather_df) and drawing it as an annotated heatmap.
sns.set(font_scale=0.9)

corr_matrix = weather_df.corr(method="kendall")

plt.figure(figsize=(14, 14))
heatmap = sns.heatmap(
    corr_matrix,
    vmin=-1, vmax=1,            # pin the colour scale to the full correlation range
    annot=True,
    cmap='BrBG',
    annot_kws={"fontsize": 4},
    linewidths=0.1,
)
# The title was previously drawn at 2pt, which is illegible on a 14x14 inch
# figure; use a readable size instead.
heatmap.set_title('Correlation Heatmap', fontdict={'fontsize': 12}, pad=12);


In [None]:
# Absolute correlation for every unordered pair of distinct columns, ascending.
# Only the strict upper triangle of the (symmetric) matrix is kept, so each
# pair is listed once instead of twice ((a, b) and (b, a)) and the trivial
# self-correlations on the diagonal are excluded up front.
upper_triangle = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
sorted_corr_mat = (
    corr_matrix.abs()
    .where(upper_triangle)      # everything outside the upper triangle -> NaN
    .unstack()
    .dropna()
    .sort_values()
    .to_frame(name="Correlation")
)

# Removing highly / least correlated pairs (correlation > 0.95 or < 0.05);
# `between` is inclusive, so values equal to either bound are kept.
within_band = sorted_corr_mat['Correlation'].between(0.05, 0.95)
sorted_corr_mat = sorted_corr_mat[within_band]
print("Fields with max correlation are:\n")

sorted_corr_mat[sorted_corr_mat['Correlation']>0.80]

In [None]:
import plotly.express as px

In [None]:
# Mean PM2.5 reading per region, shown as a bar chart.
mean_pm25_by_region = (
    weather_data
    .groupby('region')['air_quality_PM2.5']
    .mean()
    .reset_index()
)

fig1 = px.bar(
    mean_pm25_by_region,
    x='region',
    y='air_quality_PM2.5',
    title='Average PM2.5 by State',
)
fig1.show()

In [None]:
# Mean UV index per region, shown as a bar chart.
mean_uv_by_region = (
    weather_data
    .groupby('region')['uv_index']
    .mean()
    .reset_index()
)

fig2 = px.bar(
    mean_uv_by_region,
    x='region',
    y='uv_index',
    title='Average uv_index exposure by State',
)
fig2.show()

In [None]:
# Scatter of temperature against UV index, one colour per region.
uv_axis_label = 'UV Index'
temperature_axis_label = 'Temperature (Celsius)'

fig3 = px.scatter(
    weather_data,
    x='uv_index',
    y='temperature_celsius',
    color='region',
    title='Correlation Between UV Index and Temperature by Region',
    labels={
        'uv_index': uv_axis_label,
        'temperature_celsius': temperature_axis_label,
    },
)

fig3.update_layout(
    xaxis_title=uv_axis_label,
    yaxis_title=temperature_axis_label,
)

fig3.show()

In [None]:
# Mean temperature (Celsius) per region, shown as a bar chart.
mean_temperature_by_region = (
    weather_data
    .groupby('region')['temperature_celsius']
    .mean()
    .reset_index()
)

fig4 = px.bar(
    mean_temperature_by_region,
    x='region',
    y='temperature_celsius',
    title='Average temperature exposure by State',
)
fig4.show()

In [None]:
# Distribution of temperature per region as horizontal box plots;
# outliers are drawn as small blue stars.
outlier_style = {'marker': '*', 'markersize': 2, 'markerfacecolor': 'blue'}

boxplot_fig, boxplot_ax = plt.subplots(figsize=(10, 10))
temp_range = sns.boxplot(
    data=weather_data,
    x='temperature_celsius',
    y='region',
    flierprops=outlier_style,
    ax=boxplot_ax,
)
temp_range.set_title("Temperature range")

plt.show()

# Plotting temperature vs. various air quality metrics (raw, unscaled values)

In [None]:
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
# NOTE(review): the scaler is instantiated here but never applied in this
# cell, so the plots below show the raw values.
scale = StandardScaler()

temp_weather = weather_data


# Scatter Plots for Air Quality Metrics vs Temperature
air_quality_metrics = ['air_quality_Carbon_Monoxide', 'air_quality_Ozone', 'air_quality_Nitrogen_dioxide',
                       'air_quality_Sulphur_dioxide']

# (short label, colour, marker) for each metric, in the same order as
# `air_quality_metrics`; the short label is used for both the y-axis and the
# line label.
panel_styles = [
    ("CO", "lightblue", 'o'),
    ("Ozone", "r", '*'),
    ("NO2", "green", 's'),
    ("SO2", "m", '^'),
]

plt.figure(figsize=(16, 16))
for panel_no, (metric, (short_label, colour, marker_shape)) in enumerate(
        zip(air_quality_metrics, panel_styles), start=1):
    # One panel of the 2x2 grid per metric; ls='' draws markers only.
    plt.subplot(2, 2, panel_no)
    plt.plot("temperature_celsius", metric, data=temp_weather,
             color=colour, marker=marker_shape, ms="5", ls='', label=short_label)
    plt.xlabel("Temperature")
    plt.ylabel(short_label)

plt.show()

In [None]:
# Show the columns currently in weather_df.
weather_df.columns

# Comparing various models that can be used to predict temperature

In [None]:
# Target: temperature in Celsius. Predictors: location, wind, pressure,
# precipitation, humidity and cloud-cover columns of weather_df.
predictor_columns = [
    'latitude',
    'longitude',
    'wind_kph',
    'wind_degree',
    'pressure_mb',
    'precip_in',
    'humidity',
    'cloud',
]

temperature_data = weather_df['temperature_celsius']
temperature_factors = weather_df[predictor_columns]

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    temperature_factors,
    temperature_data,
    test_size=0.3,
    random_state=0,
)

In [None]:
# Shapes of the train/test splits.
for split_label, split_part in (
    ("X_train:", x_train),
    ("X_test:", x_test),
    ("Y_train:", y_train),
    ("Y_test:", y_test),
):
    print(split_label, split_part.shape)

# Model 1 : MultiLinear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the training split (fit() returns the estimator).
linreg = LinearRegression().fit(x_train, y_train)

y_pred = linreg.predict(x_test)

# NOTE(review): the value printed as "Accuracy" is the R^2 score on the test
# split multiplied by 100, not a classification accuracy.
model_accuracy = 100 * r2_score(y_test, y_pred)
print(" Accuracy of the model is %.2f" % model_accuracy)

# Model 2 : LASSO Regression

In [None]:
from sklearn.linear_model import Lasso

# Fit a Lasso regression with default settings on the training split.
lasso = Lasso().fit(x_train, y_train)

y_pred = lasso.predict(x_test)

# NOTE(review): the value printed as "Accuracy" is the R^2 score on the test
# split multiplied by 100, not a classification accuracy.
model_accuracy = 100 * r2_score(y_test, y_pred)
print(" Accuracy of the model is %.2f" % model_accuracy)

# Model 3: Decision Tree Based Regression

In [None]:
from sklearn.tree import DecisionTreeRegressor
# Fit a decision-tree regressor with a fixed seed on the training split.
regressor = DecisionTreeRegressor(random_state=0).fit(x_train, y_train)

y_pred = regressor.predict(x_test)

# NOTE(review): the value printed as "Accuracy" is the R^2 score on the test
# split multiplied by 100, not a classification accuracy.
model_accuracy = 100 * r2_score(y_test, y_pred)
print(" Accuracy of the model is %.2f" % model_accuracy)

# Model 4: Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Fit a 100-tree random forest with a fixed seed on the training split.
regressor = RandomForestRegressor(
    n_estimators=100,
    random_state=0,
).fit(x_train, y_train)

y_pred = regressor.predict(x_test)

# NOTE(review): the value printed as "Accuracy" is the R^2 score on the test
# split multiplied by 100, not a classification accuracy.
model_accuracy = 100 * r2_score(y_test, y_pred)
print(" Accuracy of the model is %.2f" % model_accuracy)

# Model 5: Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
# Gradient-boosted regression trees: 500 boosting stages, squared-error loss,
# at most 5 features considered per split, fixed seed.
gradient_booster = GradientBoostingRegressor(
    loss='squared_error',
    learning_rate=0.1,
    n_estimators=500,
    max_features=5,
    random_state=100,
).fit(x_train, y_train)

y_pred = gradient_booster.predict(x_test)

# NOTE(review): the value printed as "Accuracy" is the R^2 score on the test
# split multiplied by 100, not a classification accuracy.
model_accuracy = 100 * r2_score(y_test, y_pred)
print(" Accuracy of the model is %.2f" % model_accuracy)