In [None]:
!pip3 install statsmodels


In [None]:
!pip3 install prophet

In [None]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestRegressor, IsolationForest

from statsmodels.tsa.statespace.sarimax import SARIMAX
from prophet import Prophet

In [None]:
# Path to the input CSV. Set the WEATHER_CSV_PATH environment variable to point
# at your own copy; the fallback is the original author's local path, so the
# behaviour is unchanged when the variable is not set.
DATA_PATH = os.environ.get(
    "WEATHER_CSV_PATH",
    "/Users/kenilpatel/Downloads/archive/GlobalWeatherRepository.csv",
)

df = pd.read_csv(DATA_PATH)
df.head()

In [5]:
# Parse the timestamp column, order the rows chronologically and use the
# timestamp as the index so time-based operations (e.g. resample) work later.
df = (
    df.assign(last_updated=pd.to_datetime(df['last_updated']))
      .sort_values('last_updated')
      .set_index('last_updated')
)

In [6]:
# Fill gaps from the preceding row first, then back-fill anything still
# missing at the very start of the frame.
# NOTE(review): rows are ordered by time only, so the "preceding row" may come
# from a different location - confirm this is acceptable.
df = df.ffill()
df = df.bfill()

In [7]:
def remove_outliers(data, col, whisker=1.5):
    """Drop rows whose ``col`` value lies outside the IQR fences.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter. It is not modified; a filtered frame is returned.
    col : str
        Name of the numeric column whose quartiles define the fences.
    whisker : float, default 1.5
        Multiple of the inter-quartile range allowed beyond Q1 and Q3.
        The default reproduces the original hard-coded ``1.5 * IQR`` rule.

    Returns
    -------
    pandas.DataFrame
        The rows of ``data`` for which
        ``Q1 - whisker*IQR <= data[col] <= Q3 + whisker*IQR`` (both ends
        inclusive). Rows where ``col`` is NaN fail the comparison and are
        therefore dropped as well.
    """
    q1 = data[col].quantile(0.25)
    q3 = data[col].quantile(0.75)
    iqr = q3 - q1

    lower_fence = q1 - whisker * iqr
    upper_fence = q3 + whisker * iqr

    # Series.between is inclusive on both ends by default, matching >= / <=.
    return data[data[col].between(lower_fence, upper_fence)]

# Keep only rows whose temperature lies inside the IQR fences.
df = remove_outliers(data=df, col='temperature_celsius')

In [8]:
# Standardise every numeric column (zero mean, unit variance), overwriting the
# original values in `df`. From here on temperature_celsius is in scaled units.
# NOTE(review): the scaler is fitted on the full frame before the train/test
# split further down - confirm that is intended.
numeric_cols = df.select_dtypes(include=np.number).columns

scaler = StandardScaler()
scaler.fit(df[numeric_cols])
df[numeric_cols] = scaler.transform(df[numeric_cols])

In [None]:
# Line plot of the temperature column against the datetime index.
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(df['temperature_celsius'])
ax.set_title("Temperature Over Time")
plt.show()

In [None]:
# Line plot of the precipitation column against the datetime index.
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(df['precip_mm'])
ax.set_title("Precipitation Over Time")
plt.show()

In [11]:
# Numeric-only view of the frame, used for the correlation heatmap.
numeric_df = df.select_dtypes(include='number')

In [None]:
# Pairwise correlation of all numeric columns, drawn without cell annotations.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(numeric_df.corr(), cmap="coolwarm", annot=False, ax=ax)
ax.set_title("Feature Correlation Heatmap")
plt.show()

In [None]:
# Environmental Impact Analysis
# One scatter plot per air-quality column, each against temperature.

# Air-quality columns are identified by name (case-insensitive substring match).
air_cols = [name for name in df.columns if 'air_quality' in name.lower()]

if not air_cols:
    print("No air quality columns found in dataset")
else:
    for col in air_cols:
        fig, ax = plt.subplots(figsize=(6, 4))
        sns.scatterplot(x=df[col], y=df['temperature_celsius'], ax=ax)
        ax.set_title(f"{col} vs Temperature")
        plt.show()

    print("Air quality columns analyzed:", air_cols)

In [None]:
# Mean temperature per calendar month; relies on the datetime index.
monthly_avg = df['temperature_celsius'].resample('M').mean()

ax = monthly_avg.plot(figsize=(12, 5))
ax.set_title("Monthly Avg Temperature")
plt.show()

In [14]:
# Chronological 80/20 split: the first 80% of rows train, the rest is held out.
train_size = int(len(df) * 0.8)
train = df['temperature_celsius'].iloc[:train_size]
test = df['temperature_celsius'].iloc[train_size:]

In [None]:
# Fit a seasonal ARIMA model on the training slice and forecast one step per
# hold-out row.
# NOTE(review): the series is used row-by-row as indexed above (no resampling
# to a regular frequency), so the seasonal period of 12 means "12 rows" -
# confirm that this is the intended seasonality.
model = SARIMAX(
    train,
    order=(2, 1, 2),
    seasonal_order=(1, 1, 1, 12),
)
sarima_fit = model.fit()

sarima_forecast = sarima_fit.forecast(steps=len(test))

In [None]:
# Score the SARIMA forecast against the hold-out slice (values are in the
# standardised units produced by the scaling step).
mae_sarima = mean_absolute_error(y_true=test, y_pred=sarima_forecast)
rmse_sarima = np.sqrt(
    mean_squared_error(y_true=test, y_pred=sarima_forecast)
)

print("SARIMA MAE:", mae_sarima)
print("SARIMA RMSE:", rmse_sarima)

In [None]:
# Prophet expects a two-column frame: `ds` (timestamp) and `y` (value).
prophet_df = df.reset_index()[['last_updated','temperature_celsius']]
prophet_df.columns = ['ds','y']

# Same positional 80/20 split as the SARIMA cell.
train_prophet = prophet_df[:train_size]
test_prophet = prophet_df[train_size:]

model = Prophet()
model.fit(train_prophet)

# Predict at the hold-out rows' own timestamps. The previous
# make_future_dataframe(periods=len(test)) call generated len(test) *daily*
# dates after the last training timestamp, which do not correspond to the
# test rows, so the metrics compared predictions for the wrong dates.
future = test_prophet[['ds']]
forecast = model.predict(future)

# One prediction per hold-out row, in the same (chronological) order.
prophet_forecast = forecast['yhat'].values

In [None]:
# Score the Prophet forecast against the hold-out targets.
mae_prophet = mean_absolute_error(
    y_true=test_prophet['y'], y_pred=prophet_forecast
)
rmse_prophet = np.sqrt(
    mean_squared_error(y_true=test_prophet['y'], y_pred=prophet_forecast)
)

print("Prophet MAE:", mae_prophet)
print("Prophet RMSE:", rmse_prophet)

In [None]:
comparison = pd.DataFrame({
    "Model": ["SARIMA","Prophet","Ensemble"],
    "MAE": [mae_sarima, mae_prophet, mae_ensemble],
    "RMSE": [rmse_sarima, rmse_prophet, rmse_ensemble]
})
comparison

In [None]:
# Ensemble Forecast (SARIMA + Prophet)

# align predictions
sarima_pred = np.array(sarima_forecast)
prophet_pred = np.array(prophet_forecast)

# simple average ensemble
ensemble_forecast = (sarima_pred + prophet_pred) / 2

# evaluate ensemble 
mae_ensemble = mean_absolute_error(test_prophet['y'], ensemble_forecast)
rmse_ensemble = np.sqrt(mean_squared_error(test_prophet['y'], ensemble_forecast))

print("Ensemble MAE:", mae_ensemble)
print("Ensemble RMSE:", rmse_ensemble)

In [20]:
# Columns the anomaly detector looks at.
features = df[['temperature_celsius','humidity','pressure_mb']]

# contamination=0.01 flags roughly 1% of rows as outliers. The fixed
# random_state makes the flags reproducible across runs (same seed as the
# RandomForest cell).
iso = IsolationForest(contamination=0.01, random_state=42)

# fit_predict labels each row; -1 marks the rows treated as anomalies below.
df['anomaly'] = iso.fit_predict(features)

anomalies = df[df['anomaly']==-1]

In [None]:
# Temperature series with the rows flagged as anomalies overlaid in red.
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(df['temperature_celsius'])
ax.scatter(anomalies.index, anomalies['temperature_celsius'], color='red')
ax.set_title("Detected Weather Anomalies")
plt.show()

In [22]:
# Recompute the numeric-only view so it reflects the current columns of `df`.
numeric_df = df.select_dtypes(include='number')

In [None]:
# Feature importance for predicting temperature with a random forest.
# (RandomForestRegressor, plt and pd are already imported in the imports cell
# at the top of the notebook.)

# Keep only numeric columns
numeric_df = df.select_dtypes(include=['number'])

# Define features and target.
# 'anomaly' is excluded because it is computed from temperature_celsius by the
# IsolationForest cell, so keeping it would leak the target into the features.
# errors='ignore' lets this cell run even if that column has not been created.
# NOTE(review): if the dataset carries other direct re-expressions of the
# target (e.g. the same temperature in another unit), they will dominate the
# ranking - confirm and drop them here as well.
X = numeric_df.drop(columns=['temperature_celsius', 'anomaly'], errors='ignore')
y = numeric_df['temperature_celsius']

# Train Random Forest (seeded for reproducible importances)
rf = RandomForestRegressor(random_state=42)
rf.fit(X, y)

# Importance score per feature, labelled with the column names
importance = pd.Series(rf.feature_importances_, index=X.columns)

# Horizontal bar chart, least important at the bottom
fig, ax = plt.subplots(figsize=(10, 6))
importance.sort_values().plot(kind='barh', ax=ax)
ax.set_title("Feature Importance")
ax.set_xlabel("Importance Score")
fig.tight_layout()
plt.show()

In [None]:
# The five countries whose (standardised) temperature varies the least.
(
    df.groupby('country')['temperature_celsius']
      .std()
      .sort_values()
      .head()
)

In [None]:
# Geographic Weather Patterns

# Mean temperature per country, sorted ascending; tail(15) keeps the 15
# countries with the highest means.
country_temp = (
    df.groupby('country')['temperature_celsius']
      .mean()
      .sort_values()
      .tail(15)
)

fig, ax = plt.subplots(figsize=(10, 6))
country_temp.plot(kind='barh', ax=ax)
ax.set_title("Average Temperature by Country")
ax.set_xlabel("Temperature (scaled)")
plt.show()