In [None]:
!pip3 install statsmodels


In [None]:
!pip3 install prophet

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestRegressor, IsolationForest

from statsmodels.tsa.statespace.sarimax import SARIMAX
from prophet import Prophet

In [None]:
# Location of the raw weather CSV.
# NOTE(review): this is an absolute, user-specific path; the notebook will only
# run on a machine with the same directory layout.
DATA_PATH = "/Users/kenilpatel/Downloads/archive/GlobalWeatherRepository.csv"

df = pd.read_csv(DATA_PATH)
df.head()

In [18]:
# Parse the timestamp column, order rows chronologically and use the
# timestamp as the index so time-based operations (e.g. resample) work.
df = (
    df.assign(last_updated=pd.to_datetime(df['last_updated']))
      .sort_values('last_updated')
      .set_index('last_updated')
)

In [19]:
# Fill missing values from the previous row first, then back-fill whatever is
# still missing (i.e. gaps at the very start of the frame).
df = df.ffill()
df = df.bfill()

In [21]:
def remove_outliers(data, col, k=1.5):
    """Drop rows whose ``col`` value lies outside the IQR fences.

    A row is kept when ``Q1 - k*IQR <= data[col] <= Q3 + k*IQR``, where Q1 and
    Q3 are the 25th and 75th percentiles of ``data[col]`` and ``IQR = Q3 - Q1``.
    Rows where ``col`` is NaN never satisfy the comparison and are dropped.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter. It is not modified.
    col : str
        Name of the numeric column used to detect outliers.
    k : float, default 1.5
        Fence multiplier. Larger values keep more rows.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the rows inside the fences, with the original
        index labels preserved.
    """
    q1, q3 = data[col].quantile([0.25, 0.75])
    iqr = q3 - q1
    # `between` is inclusive on both ends, matching `>= lower` and `<= upper`.
    inside_fences = data[col].between(q1 - k * iqr, q3 + k * iqr)
    # Return an explicit copy so callers can add or overwrite columns on the
    # result without pandas treating it as a slice of `data`.
    return data[inside_fences].copy()

# Discard rows whose temperature falls outside the IQR fences.
df = remove_outliers(data=df, col='temperature_celsius')

In [None]:
numeric_cols = df.select_dtypes(include=np.number).columns

# Standardize into a separate frame instead of overwriting `df`.
# Overwriting replaced temperature_celsius (the forecasting target) with
# z-scores, so every later "temperature" plot and every MAE/RMSE was reported
# in standardized units and the raw values could not be recovered.
# `df` stays in its original units; use `df_scaled` where a model needs
# standardized inputs.
scaler = StandardScaler()
df_scaled = df.copy()
df_scaled[numeric_cols] = scaler.fit_transform(df[numeric_cols])

In [None]:
# Line plot of the temperature column against the datetime index.
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(df['temperature_celsius'])
ax.set_title("Temperature Over Time")
plt.show()

In [None]:
# Line plot of the precipitation column against the datetime index.
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(df['precip_mm'])
ax.set_title("Precipitation Over Time")
plt.show()

In [30]:
# Numeric columns only; used for the correlation heatmap.
numeric_df = df.select_dtypes(include="number")

In [None]:
# Pairwise correlation of all numeric columns, drawn without cell annotations.
corr_matrix = numeric_df.corr()

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, cmap="coolwarm", annot=False, ax=ax)
ax.set_title("Feature Correlation Heatmap")
plt.show()

In [None]:
# Mean temperature per calendar month (requires the datetime index).
monthly_avg = df['temperature_celsius'].resample('M').mean()

fig, ax = plt.subplots(figsize=(12, 5))
monthly_avg.plot(ax=ax, title="Monthly Avg Temperature")
plt.show()

In [36]:
# Positional 80/20 split of the temperature series: the first 80% of rows
# train the models, the remainder is held out for evaluation.
train_size = int(len(df) * 0.8)
temperature = df['temperature_celsius']
train = temperature.iloc[:train_size]
test = temperature.iloc[train_size:]

In [None]:
# SARIMA(2,1,2)x(1,1,1,12) fitted on the training series.
sarima_params = dict(order=(2, 1, 2), seasonal_order=(1, 1, 1, 12))
model = SARIMAX(train, **sarima_params)
sarima_fit = model.fit()

# Forecast as many steps ahead as there are held-out observations.
sarima_forecast = sarima_fit.forecast(steps=len(test))

In [None]:
# Score the SARIMA forecast against the held-out series.
sarima_mse = mean_squared_error(test, sarima_forecast)
mae_sarima = mean_absolute_error(test, sarima_forecast)
rmse_sarima = np.sqrt(sarima_mse)

print("SARIMA MAE:", mae_sarima)
print("SARIMA RMSE:", rmse_sarima)

In [None]:
# Prophet expects a two-column frame: `ds` (timestamp) and `y` (value).
prophet_df = (
    df.reset_index()[['last_updated', 'temperature_celsius']]
      .rename(columns={'last_updated': 'ds', 'temperature_celsius': 'y'})
)

# Same positional split point as the train/test series used for SARIMA.
train_prophet = prophet_df[:train_size]
test_prophet = prophet_df[train_size:]

model = Prophet()
model.fit(train_prophet)

# Predict at the held-out rows' own timestamps.
# make_future_dataframe(periods=len(test)) would instead generate len(test)
# consecutive daily dates after the last training timestamp, so the predictions
# being scored would not correspond to the rows in test_prophet.
# Prophet returns predictions ordered by `ds`; test_prophet is already in that
# order because df was sorted by last_updated before the split.
future = test_prophet[['ds']]
forecast = model.predict(future)

prophet_forecast = forecast['yhat'].values

In [None]:
# Score the Prophet forecast against the held-out target values.
prophet_actual = test_prophet['y']
prophet_mse = mean_squared_error(prophet_actual, prophet_forecast)
mae_prophet = mean_absolute_error(prophet_actual, prophet_forecast)
rmse_prophet = np.sqrt(prophet_mse)

print("Prophet MAE:", mae_prophet)
print("Prophet RMSE:", rmse_prophet)

In [None]:
# Side-by-side error metrics, one row per model.
comparison = pd.DataFrame(
    [
        ("SARIMA", mae_sarima, rmse_sarima),
        ("Prophet", mae_prophet, rmse_prophet),
    ],
    columns=["Model", "MAE", "RMSE"],
)
comparison

In [44]:
# Columns the anomaly detector looks at.
features = df[['temperature_celsius','humidity','pressure_mb']]

# Flag roughly 1% of rows as anomalous. The seed is fixed so the same rows are
# flagged on every run (matches the random_state used for the Random Forest).
iso = IsolationForest(contamination=0.01, random_state=42)
# fit_predict labels outliers as -1 and inliers as 1.
df['anomaly'] = iso.fit_predict(features)

anomalies = df[df['anomaly']==-1]

In [None]:
# Temperature series with the rows flagged as anomalies highlighted in red.
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(df['temperature_celsius'])
ax.scatter(anomalies.index, anomalies['temperature_celsius'], color='red')
ax.set_title("Detected Weather Anomalies")
plt.show()

In [48]:
# Refresh the numeric-only view of df.
numeric_df = df.select_dtypes(include="number")

In [None]:
# RandomForestRegressor, plt and pd come from the import cell at the top of
# the notebook; they are not re-imported here.

# 1. Keep only numeric columns
numeric_df = df.select_dtypes(include=['number'])

# 2. Define features and target
# Drop the target and the IsolationForest label: `anomaly` is an output of an
# earlier step (itself computed from temperature), not an input measurement.
# errors='ignore' keeps this cell runnable if the anomaly cell was skipped.
# NOTE(review): any other column derived from temperature (presumably e.g. a
# Fahrenheit duplicate - confirm against the CSV schema) would dominate the
# importances and should be dropped here as well.
X = numeric_df.drop(columns=['temperature_celsius', 'anomaly'], errors='ignore')
y = numeric_df['temperature_celsius']

# 3. Train Random Forest (fixed seed for reproducible importances)
rf = RandomForestRegressor(random_state=42)
rf.fit(X, y)

# 4. Get feature importance, labelled by feature name
importance = pd.Series(rf.feature_importances_, index=X.columns)

# 5. Plot sorted importance
importance.sort_values().plot(kind='barh', figsize=(10,6))
plt.title("Feature Importance")
plt.xlabel("Importance Score")
plt.tight_layout()
plt.show()

In [None]:
# Countries with the smallest spread in temperature (first five after an
# ascending sort of the per-country standard deviation).
country_temp_std = df.groupby('country')['temperature_celsius'].std()
country_temp_std.sort_values().head()