<a href="https://colab.research.google.com/github/Hossam-Osama/Forecasting-Temperature-Trends-for-Egypt/blob/main/Forecasting_Temperature_Trends_for_Egypt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'climate-change-earth-surface-temperature-data:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F29%2F2150%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240917%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240917T173225Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D7ea1e1eba26ceee2300f2051423690442712e05579cb5d2a17b12c9c73c848957a634dd8361e9054e01a29371a014241e488649d7bdf009b3e879c6fe9525c0cf8030c891603648195db19ed1f2c6b28d7631e31203ceff98469bda6478e90ad3ced6b4ce653bbca134c4bd6c8d34046e53660844691d661217c120cfaa6f765e16f650bffa7571ee827892d48b0b47069706e867884581de935ec193b2c4fdfaa1ebc3d1c2e5c257d507e5d20b7fa761a07945e6912ee473834f6506c0e8b61dc77a7df9d0bbefc511cc51866c85582ad37fdc41032d7698748dbbebeb3e07ec8f670add8afa954b70cb1101844cf5ee6db5183dc7f71ec85fe5e110105b6a9'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / int(total_length))
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            if filename.endswith('.zip'):
              with ZipFile(tfile) as zfile:
                zfile.extractall(destination_path)
            else:
              with tarfile.open(tfile.name) as tarfile:
                tarfile.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


**Welcome to our notebook!**
<br>
We are a group of students from the **Samsung Innovation Campus (SIC)**, exploring the power of artificial intelligence. Through this program, in collaboration with *Samsung*, *LifeMakers* and *Machinfy academy*, we’ve been able to work on real-world projects that challenge our understanding of AI and data science.
<br>
In this notebook, we focus on **predicting temperature for Egypt**, using a comprehensive dataset on global climate change. Throughout this journey, we’ve worked on a range of tasks including **Exploratory Data Analysis (EDA), data preprocessing, handling missing values, outlier detection and treatment, and time series analysis to check for stationarity**. We then applied models to predict temperatures and uncertainties, working with methods like **Linear Regression, Random Forest, and Time Series Forecasting Models.** We are excited to share what we’ve learned and how we’ve applied AI techniques to tackle this problem!

><a href= 'https://www.linkedin.com/in/eyad-abdelmeguid/'>Eyad Mohamed</a><br>
<a href= 'https://www.linkedin.com/in/hossam-osama-181760248/'>Hossam Eleraqi</a><br>
<a href= 'https://www.linkedin.com/in/shahd-ahmed-3a66b6284'>Shahd</a><br>

This notebook is more than just code; it’s a reflection of our learning process, the challenges we faced in handling complex datasets, and how we refined our approach to predicting temperatures. We hope you find this exploration of climate data insightful!<br>
<a href= ''>Our Presentation</a>

# Importing Libraries and Loading the data

In [None]:
# libraries for data gathering and visualization
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import plotly.express as px
import plotly.graph_objects as go
from plotly.offline import iplot

# libraries for preprocessing
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer,KNNImputer,IterativeImputer
from sklearn.preprocessing import OneHotEncoder
plotsize = (13, 5)
from statsmodels.tsa.stattools import adfuller

# libraries for modeling
from pandas.plotting import register_matplotlib_converters
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.tsa.arima.model import ARIMA
register_matplotlib_converters()
from time import time
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf, month_plot, quarter_plot
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.metrics import silhouette_score

# for removing warnings
import warnings
warnings.filterwarnings("ignore")

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
# Adjust pandas display options: show every column, print at most 50 rows,
# and never truncate the contents of a cell.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_colwidth', None)

In [None]:
df_cities = pd.read_csv('GlobalLandTemperaturesByCity.csv')

In [None]:
# Keep only the Egyptian records. The explicit copy makes df_egy an independent
# frame, so columns assigned to it later do not write into a slice of df_cities
# (which triggers pandas' SettingWithCopyWarning).
df_egy = df_cities[df_cities["Country"] == "Egypt"].copy()

# Data summary

In [None]:
df_egy

In [None]:
df_egy.shape

In [None]:
df_egy['City'].value_counts()

In [None]:
df_egy.isnull().sum()

In [None]:
df_egy.duplicated().sum()

In [None]:
df_egy.describe().T

### We observe that there are 19 cities/regions in our dataset, with 47248 entries and 7 features, no duplicates, and a few missing values in the average temperature features


In [None]:
import geopandas as gpd

# Hand-entered sample values: one average temperature plus latitude/longitude
# per Egyptian city. These numbers are literals, not computed from the dataset.
data = {
    'City': ['Alexandria', 'Aswan', 'Asyut', 'Benha', 'Beni Suef', 'Cairo', 'Dekernes', 'El Faiyûm', 'El Mahalla El Kubra',
             'Gizeh', 'Ismailia', 'Luxor', 'Port Said', 'Qalyub', 'Qena', 'Sohag', 'Suez', 'Talkha', 'Tanta'],
    'AverageTemperature': [23, 30, 28, 25, 26, 27, 24, 26, 23, 27, 29, 31, 24, 25, 29, 28, 27, 24, 25],
    'Latitude': [31.2, 24.09, 27.18, 30.47, 29.07, 30.04, 31.18, 29.31, 30.97, 30.01, 30.6, 25.69, 31.26, 30.18, 26.17, 26.56, 29.96, 31.05, 30.79],
    'Longitude': [29.91, 32.89, 31.18, 31.18, 30.81, 31.24, 31.57, 30.84, 31.17, 31.21, 32.27, 32.65, 32.28, 30.12, 32.67, 31.7, 32.55, 31.38, 31.03]
}

# NOTE(review): this rebinds df_cities, so the name no longer refers to the
# frame read from GlobalLandTemperaturesByCity.csv after this cell runs.
df_cities = pd.DataFrame(data)

# Build a GeoDataFrame with one point geometry per city (x = longitude, y = latitude).
# NOTE(review): gdf_cities is not used by the plotting code below, which draws
# directly from df_cities.
gdf_cities = gpd.GeoDataFrame(df_cities,
                              geometry=gpd.points_from_xy(df_cities['Longitude'], df_cities['Latitude']))

# Single square figure for the city scatter plot
fig, ax = plt.subplots(figsize=(8, 8))

# One marker per city, coloured by its average temperature on the 'coolwarm' colormap
scatter = ax.scatter(df_cities['Longitude'], df_cities['Latitude'],
                     c=df_cities['AverageTemperature'], cmap='coolwarm', s=100, edgecolor='black')

# Label each marker with the city name, right-aligned at the marker position
for i, row in df_cities.iterrows():
    ax.text(row['Longitude'], row['Latitude'], row['City'], fontsize=8, ha='right')

# Colorbar translating marker colour back to temperature
cbar = plt.colorbar(scatter, ax=ax)
cbar.set_label('Average Temperature (°C)')

# Restrict the view to longitudes 25-35 and latitudes 22-32 to focus on Egypt
ax.set_xlim(25, 35)
ax.set_ylim(22, 32)
ax.set_title('Average Temperatures in Egyptian Cities')

plt.xlabel('Longitude')
plt.ylabel('Latitude')

plt.grid(True)
plt.tight_layout()
plt.show()


### The map indicates that temperatures generally increase as we move south towards Upper Egypt, due to its proximity to the equator.







In [None]:
import pandas as pd
import numpy as np
import plotly.graph_objs as go
from scipy import stats

# Work on an independent copy of the Egypt subset so the helper 'Year' column
# created below does not leak into df_egy.
egypt_data = df_egy.copy()

# Extract the year from the date. Going through to_datetime works whether 'dt'
# still holds strings or has already been converted to datetime values.
egypt_data['Year'] = pd.to_datetime(egypt_data['dt']).dt.year

# Group by year and calculate the average temperature and uncertainty.
# Years with no usable measurement are dropped: a NaN yearly mean would turn
# the regression below into NaN as well.
egypt_yearly = (
    egypt_data.groupby('Year')
    .agg({
        'AverageTemperature': 'mean',
        'AverageTemperatureUncertainty': 'mean'
    })
    .dropna()
    .reset_index()
)

# Extract the values
years = egypt_yearly['Year'].astype(int).values
mean_temp_egypt = egypt_yearly['AverageTemperature'].values
mean_temp_egypt_uncertainty = egypt_yearly['AverageTemperatureUncertainty'].values

# Calculate the trend line (ordinary least-squares fit of temperature on year)
slope, intercept, r_value, p_value, std_err = stats.linregress(years, mean_temp_egypt)
trend_line = slope * years + intercept

# Upper edge of the uncertainty band (mean + uncertainty)
trace0 = go.Scatter(
    x=years,
    y=mean_temp_egypt + mean_temp_egypt_uncertainty,
    fill=None,
    mode='lines',
    name='Uncertainty top',
    line=dict(color='rgb(0, 255, 255)')
)

# Lower edge of the band; 'tonexty' fills the area up to the previous trace
trace1 = go.Scatter(
    x=years,
    y=mean_temp_egypt - mean_temp_egypt_uncertainty,
    fill='tonexty',
    mode='lines',
    name='Uncertainty bot',
    line=dict(color='rgb(0, 255, 255)')
)

# Yearly mean temperature
trace2 = go.Scatter(
    x=years,
    y=mean_temp_egypt,
    name='Average Temperature',
    line=dict(color='rgb(199, 121, 093)')
)

# Add the trend line
trace3 = go.Scatter(
    x=years,
    y=trend_line,
    name='Trend Line',
    line=dict(color='rgb(255, 0, 0)', dash='dash')
)

# Add a vertical line to indicate the start of global warming (around 1970)
global_warming_start_year = 1970

# The marker line spans the full height of the uncertainty band
trace4 = go.Scatter(
    x=[global_warming_start_year, global_warming_start_year],
    y=[(mean_temp_egypt - mean_temp_egypt_uncertainty).min(), (mean_temp_egypt + mean_temp_egypt_uncertainty).max()],
    mode='lines',
    name='Start of Global Warming',
    line=dict(color='rgb(0, 100, 0)', dash='dot')
)

data = [trace0, trace1, trace2, trace3, trace4]

layout = go.Layout(
    xaxis=dict(title='Year'),
    yaxis=dict(title='Average Temperature, °C'),
    title='Average Land Temperature in Egypt with Trend Line and Global Warming Start',
    showlegend=True
)

fig = go.Figure(data=data, layout=layout)
fig.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Parse the 'dt' column into datetime values so it can serve as a time axis
# (this overwrites the column on df_egy itself)
df_egy['dt'] = pd.to_datetime(df_egy['dt'])

# Use seaborn's dark grid theme for this and subsequent plots
sns.set_theme(style="darkgrid")

# Wide figure to leave room for the legend placed outside the axes
plt.figure(figsize=(16, 10))

# One line per city (hue='City'), with a marker on every observation
sns.lineplot(data=df_egy, x='dt', y='AverageTemperature', hue='City', marker='o')

# Titles and axis labels
# NOTE(review): the 1791-2013 range in the title is hard-coded, not derived
# from the data - confirm it matches the actual date span.
plt.title('Temperature Trends in Various Egyptian Cities (1791-2013)', fontsize=18)
plt.xlabel('Year', fontsize=14)
plt.ylabel('Average Temperature (°C)', fontsize=14)
# Anchor the legend just outside the right edge of the axes
plt.legend(title='City', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True)

plt.tight_layout()
plt.show()


### The average temperature has been rising, with a notable increase from around 1970 onward, a period of rapid industrial growth and factory expansion that contributed significantly to global warming.








In [None]:
df_egy['Month'] = df_egy['dt'].dt.month

# Plot
plt.figure(figsize=(16, 10))
sns.lineplot(data=df_egy, x='Month', y='AverageTemperature', hue='City', marker='o')
plt.title('Seasonal Temperature Patterns in Egyptian Cities', fontsize=18)
plt.xlabel('Month', fontsize=14)
plt.ylabel('Average Temperature (°C)', fontsize=14)
plt.xticks(range(1, 13), ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
plt.legend(title='City', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True)
plt.tight_layout()
plt.show()

### The peak temperatures throughout the year occur from June to October.







# Renaming columns

In [None]:
df_egy = df_egy.rename(columns={'dt': 'date'})

In [None]:
plt.figure(figsize=(12, 7))
sns.heatmap(df_egy.isnull(), cbar=False, cmap='summer', yticklabels=False)
plt.title('Missing Data Heatmap')
plt.show()

# Feature Engineering

## Date Time Features

In [None]:
df_featured = df_egy.copy()

In [None]:
df_featured['date'] = pd.to_datetime(df_featured['date'])

In [None]:
df_featured['year'] = df_featured['date'].dt.year
df_featured['month'] = df_featured['date'].dt.month

In [None]:
def get_season(month):
    """Return the season name for a month number.

    December-February map to 'Winter', March-May to 'Spring', June-August
    to 'Summer' and September-November to 'Fall'. Any other value yields
    None.
    """
    season_months = (
        ('Winter', (12, 1, 2)),
        ('Spring', (3, 4, 5)),
        ('Summer', (6, 7, 8)),
        ('Fall', (9, 10, 11)),
    )
    for season_name, months in season_months:
        if month in months:
            return season_name
    return None

In [None]:
df_featured['season'] = df_featured['date'].dt.month.apply(get_season)

In [None]:
# Plot
plt.figure(figsize=(12, 8))
sns.boxplot(data=df_featured, x='season', y='AverageTemperature', palette='coolwarm')
plt.title('Temperature Distribution by Season', fontsize=18)
plt.xlabel('Season', fontsize=14)
plt.ylabel('Average Temperature (°C)', fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.show()


### The box plot above shows the distribution of average temperatures in each season

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Ensure 'season' column is categorical with specific order
season_order = ['Winter', 'Spring', 'Summer', 'Fall']
df_featured['season'] = pd.Categorical(df_featured['season'], categories=season_order, ordered=True)

# Compute the average temperature by season
seasonal_avg_temp = df_featured.groupby('season')['AverageTemperature'].mean().reset_index()

# Plot
plt.figure(figsize=(10, 6))
sns.barplot(data=seasonal_avg_temp, x='season', y='AverageTemperature', palette='coolwarm')
plt.title('Average Temperature by Season', fontsize=18)
plt.xlabel('Season', fontsize=14)
plt.ylabel('Average Temperature (°C)', fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
# Compute anomalies: each observation's deviation from the mean temperature of
# its season.
baseline_avg_temp = df_featured.groupby('season')['AverageTemperature'].mean()
# transform('mean') broadcasts the per-season mean back onto every row, which
# replaces a slow Python-level row-by-row apply with one vectorised step.
seasonal_baseline = df_featured.groupby('season')['AverageTemperature'].transform('mean')
df_featured['Anomaly'] = df_featured['AverageTemperature'] - seasonal_baseline

# Compute average anomalies by season
# NOTE(review): the baseline is the mean of the same rows being averaged here,
# so each seasonal mean anomaly is zero up to floating-point error. A baseline
# taken from a fixed reference period would make this chart informative.
seasonal_anomalies = df_featured.groupby('season')['Anomaly'].mean().reset_index()

# Plot
plt.figure(figsize=(10, 6))
sns.barplot(data=seasonal_anomalies, x='season', y='Anomaly', palette='coolwarm')
plt.title('Average Temperature Anomalies by Season', fontsize=18)
plt.xlabel('Season', fontsize=14)
plt.ylabel('Temperature Anomaly (°C)', fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.show()


### Seasonal anomalies measure how far temperatures deviate from the seasonal average; as we can observe, summer is the season with the largest deviations overall

In [None]:
# Compute average temperature for each season and year.
# The grouping key is the lowercase 'year' column derived from 'date' on
# df_featured; the notebook never adds a capitalised 'Year' column to this frame.
seasonal_temp_trend = df_featured.groupby(['year', 'season'])['AverageTemperature'].mean().reset_index()

# Plot one line per season over the years
plt.figure(figsize=(16, 10))
sns.lineplot(data=seasonal_temp_trend, x='year', y='AverageTemperature', hue='season', marker='o')
plt.title('Seasonal Temperature Trends Over Time (1791-2013)', fontsize=18)
plt.xlabel('Year', fontsize=14)
plt.ylabel('Average Temperature (°C)', fontsize=14)
plt.legend(title='Season', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True)
plt.tight_layout()
plt.show()


### This plot shows how the temperature of each season changes over time; the upward trend is still visible

# EDA Summary
 1. Seasonal Analysis:

 Average Temperature by Season: A bar plot visualized the average temperatures for each season, revealing how temperatures vary throughout the year. This helps understand seasonal patterns and average conditions.

 Seasonal Temperature Trends: Line plots depicted how temperatures for each season have evolved over the years. This visualization highlighted trends in seasonal temperatures, showing whether certain seasons have become warmer or cooler over time.

 Boxplot of Temperatures by Season: This plot displayed the distribution of temperatures within each season, including median, quartiles, and outliers. It provided insights into temperature variability and extremes for each season.

 Seasonal Temperature Anomalies: Bar plots displayed temperature anomalies for each season, comparing current temperatures against historical averages. This analysis helped identify unusual temperature patterns and extremes.

 2. Observations:
 Geographical Trends:
 Temperatures generally increase as one moves south towards Upper Egypt due to proximity to the equator.

 Industrial Impact:
 There has been a notable rise in average temperatures since the industrial revolution, reflecting the impact of increased industrial activity on global    warming.

Seasonal Patterns:

 * Winter: Generally the coolest season, with temperatures showing significant variation from year to year.
 * Spring: Marked by gradually rising temperatures; however, it can experience abrupt changes.
 * Summer: The hottest season with consistently high temperatures and potential for extreme heat events.
 * Autumn: Temperatures start to drop, showing a decrease towards winter, with some variability depending on the year.

3. Additional Insights
 * Long-Term Trends: Analysis of long-term trends revealed that while average temperatures have been rising, the rate of increase varies by season. For example, summer  temperatures have risen more sharply compared to winter temperatures.

 * Seasonal Extremes: Some seasons, particularly summer, show increased frequency of temperature extremes, which could impact agriculture and water resources.

 * Comparative Analysis: Seasonal temperature anomalies highlighted periods where temperatures deviated significantly from historical norms, potentially indicating extreme weather events or shifts in climate patterns.

4. Recommendations for Securing Crops and Resources
 * Monitor Seasonal Trends: Understanding how different seasons are affected by temperature changes can guide crop planning and resource management.

 * Implement Adaptive Strategies: Develop crop varieties that are resilient to extreme temperatures and adjust planting schedules based on seasonal temperature trends.

 * Climate-Resilient Practices: Incorporate water-saving techniques and soil management practices to mitigate the impact of rising temperatures on agriculture.

 * Early Warning Systems: Use temperature anomalies and seasonal data to set up early warning systems for extreme weather conditions, allowing for timely interventions.

## Lag Features

In [None]:
# Change versus the previous record of the SAME city. Differencing within each
# city keeps the first record of a city as NaN instead of subtracting the last
# record of whichever city precedes it in the frame.
# Assumes rows are ordered chronologically within each city - TODO confirm.
df_featured['Temperature 1 Month Change'] = df_featured.groupby('City')['AverageTemperature'].diff()

## Climate Regions Feature

In [None]:
def classify_climate_region(row):
    """Classify a record into a coarse climate region based on its latitude.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide a 'Latitude' entry. Accepted forms are a string of
        degrees followed by a hemisphere letter (e.g. '31.35N', '12.05S'),
        a plain numeric string, or a number.

    Returns
    -------
    str
        'Mediterranean' for latitudes >= 30.0, 'Desert' for latitudes
        strictly between 23.5 and 30.0, and 'Other' for everything else.
    """
    raw_latitude = row['Latitude']
    if isinstance(raw_latitude, str):
        text = raw_latitude.strip()
        hemisphere = text[-1:].upper()
        if hemisphere in ('N', 'S'):
            lat = float(text[:-1])
            # Southern latitudes are negative; without the sign a value such
            # as '31.35S' would be classified like its northern counterpart.
            if hemisphere == 'S':
                lat = -lat
        else:
            # No hemisphere suffix: do not strip the last digit.
            lat = float(text)
    else:
        lat = float(raw_latitude)

    if lat >= 30.0:
        return 'Mediterranean'
    if 23.5 < lat < 30.0:
        return 'Desert'
    return 'Other'

In [None]:
df_featured['ClimateRegion'] = df_featured.apply(classify_climate_region, axis=1)

In [None]:
numeric_data = df_featured.select_dtypes(include=['number'])
categorical_data = df_featured.select_dtypes(exclude=['number'])

# Handling Missing Values (NULLS)

Several approaches exist, each with distinct strengths:

1. **Removing Missing Records**: The simplest approach, but it risks losing valuable data if too many records are discarded.
2. **Statistical Imputation**: Fills missing values using the mean, median, or mode, offering a quick fix while maintaining dataset integrity.
3. **KNN Imputation**: More advanced, it leverages similar data points to estimate missing values, ensuring better context-driven predictions.
4. **Iterative Imputation**: A machine learning-based method that predicts missing values by modeling relationships with other features, refining estimates iteratively.

Each of these approaches will be tested, allowing us to determine the best-performing combination for our dataset.

## 1. Removing Missing Records

In [None]:
def remove_nulls(df):
    """Return a new DataFrame without the rows of ``df`` that contain any missing value."""
    return df.dropna()

In [None]:
df_remove_nulls = remove_nulls(df_featured)

In [None]:
df_remove_nulls.isnull().sum()

## 2. Statistical Imputation:

In [None]:
def statistical_impute(df, method):
    """Fill missing values with a column-wise statistic and return a new frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is not modified; the imputation is applied to a copy,
        so calling this function several times with different methods yields
        independent results.
    method : str
        'mean' or 'median' imputes the numeric columns; 'most_frequent'
        imputes the non-numeric columns.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the selected group of columns imputed.

    Raises
    ------
    ValueError
        If ``method`` is not one of the three supported strategies.
    """
    if method not in ('mean', 'median', 'most_frequent'):
        raise ValueError("Method should be 'mean', 'median' for numeric columns or 'most_frequent' for categorical columns.")

    # Work on a copy: writing into ``df`` would silently alter the caller's
    # frame and make every returned "variant" the very same object.
    imputed = df.copy()

    if method == 'most_frequent':
        target_columns = imputed.select_dtypes(exclude=['number']).columns
    else:
        target_columns = imputed.select_dtypes(include=['number']).columns

    # Nothing to do when the frame has no column of the targeted kind.
    if len(target_columns) > 0:
        imputer = SimpleImputer(strategy=method)
        imputed[target_columns] = imputer.fit_transform(imputed[target_columns])

    return imputed

In [None]:
df_statistical_impute_mean = statistical_impute(df_featured,'mean')
df_statistical_impute_median = statistical_impute(df_featured,'median')
df_statistical_impute_most_frequent = statistical_impute(df_featured,'most_frequent')

In [None]:
df_statistical_impute_mean.isnull().sum()

In [None]:
df_statistical_impute_median.isnull().sum()

In [None]:
df_statistical_impute_most_frequent.isnull().sum()

## 3. KNN Imputation

In [None]:
from sklearn.impute import KNNImputer
import pandas as pd

def knn_impute(df, n_neighbors, weights, metric):
    """Impute the numeric columns of ``df`` with a KNNImputer.

    ``n_neighbors``, ``weights`` and ``metric`` are passed straight to
    ``KNNImputer``. The result is a new DataFrame holding the imputed numeric
    columns first, followed by the untouched non-numeric columns.
    """
    numeric_part = df.select_dtypes(include=['number'])
    other_part = df.select_dtypes(exclude=['number'])

    filled_values = KNNImputer(
        n_neighbors=n_neighbors,
        weights=weights,
        metric=metric,
    ).fit_transform(numeric_part)

    # Re-attach column labels and the original index to the imputed array
    numeric_filled = pd.DataFrame(filled_values, columns=numeric_part.columns, index=df.index)

    return pd.concat([numeric_filled, other_part], axis=1)

In [None]:
df_knn_impute = knn_impute(df_featured,5,'uniform','nan_euclidean')

In [None]:
df_knn_impute.isnull().sum()

## 4. Iterative Imputation

In [None]:
def iterative_impute(df, max_iter, random_state):
    """Impute the numeric columns of ``df`` with an IterativeImputer.

    ``max_iter`` and ``random_state`` are passed straight to
    ``IterativeImputer``. The result is a new DataFrame holding the imputed
    numeric columns first, followed by the untouched non-numeric columns.
    """
    numeric_part = df.select_dtypes(include=['number'])
    other_part = df.select_dtypes(exclude=['number'])

    filled_values = IterativeImputer(
        max_iter=max_iter,
        random_state=random_state,
    ).fit_transform(numeric_part)

    # Re-attach column labels and the original index to the imputed array
    numeric_filled = pd.DataFrame(filled_values, columns=numeric_part.columns, index=df.index)

    return pd.concat([numeric_filled, other_part], axis=1)

In [None]:
df_iterative_impute = iterative_impute(df_featured , 100,42)

In [None]:
df_iterative_impute.isnull().sum()

# Categorical Encoding
- one-hot encoding : `City`,`season`,`ClimateRegion`
- [cyclic encoding](https://shrmtmt.medium.com/understand-the-capabilities-of-cyclic-encoding-5b68f831387e) : `month`

In [None]:
def encoding(df):
    """One-hot encode the 'City', 'season' and 'ClimateRegion' columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the three categorical columns listed above. It is not
        modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the three source columns removed and the one-hot
        indicator columns appended after the remaining columns.
    """
    # One-Hot Encoding for specific categorical features
    one_hot_encoded_columns = ['City', 'season', 'ClimateRegion']
    encoder = OneHotEncoder(drop='if_binary', sparse_output=False, handle_unknown='ignore')
    encoded_cols = encoder.fit_transform(df[one_hot_encoded_columns])

    encoded_df = pd.DataFrame(
        encoded_cols,
        columns=encoder.get_feature_names_out(),
        index=df.index,
    )

    # Build a new frame rather than dropping columns in place, so the frame
    # passed in by the caller keeps its categorical columns.
    return pd.concat([df.drop(columns=one_hot_encoded_columns), encoded_df], axis=1)

In [None]:
df_dncoded = encoding(df_statistical_impute_median)

In [None]:
df_dncoded.head(2)

## Dropping Columns

In [None]:
df_egy_cleaned = df_dncoded.drop(columns=['Country','Longitude','Latitude'])

In [None]:
df_egy_cleaned.head(2)

In [None]:
numeric_data = df_iterative_impute.select_dtypes(include=['number'])
categorical_data = df_iterative_impute.select_dtypes(exclude=['number'])

# Outliers Detection

In [None]:
def count_outliers(column):
    """Count the values of ``column`` lying outside the 1.5 * IQR fences.

    Non-numeric columns are reported as having 0 outliers. For numeric
    columns a value counts as an outlier when it is below Q1 - 1.5 * IQR or
    above Q3 + 1.5 * IQR.
    """
    if not pd.api.types.is_numeric_dtype(column):
        return 0

    first_quartile, third_quartile = column.quantile(0.25), column.quantile(0.75)
    fence_width = 1.5 * (third_quartile - first_quartile)

    below = column < first_quartile - fence_width
    above = column > third_quartile + fence_width
    return int((below | above).sum())

In [None]:
for index, col in enumerate(numeric_data.columns):
    column_data = numeric_data[col]
    outliers = count_outliers(column_data)
    print(col,  outliers)

In [None]:
# One box plot per numeric feature, laid out two per row. The number of rows is
# derived from the number of features so the grid always has enough cells
# (a fixed 3x2 grid fails as soon as there are more than six features).
boxplot_features = list(numeric_data.columns)
boxplot_cols = 2
boxplot_rows = max(1, -(-len(boxplot_features) // boxplot_cols))  # ceiling division

plt.figure(figsize=(20, 5 * boxplot_rows))
for n, feature in enumerate(boxplot_features):
    plt.subplot(boxplot_rows, boxplot_cols, n + 1)
    sns.boxplot(data=df_featured, x=feature)
    plt.title(f'Box Plot of {feature}')
    plt.xlabel(feature)
plt.tight_layout()
plt.show()

# Handling Outliers
### Different methods to handle outliers include:

1. **Outlier Removal**: Excluding data points that fall outside a defined threshold.

2. **Quantile-based Flooring and Capping**: Limiting extreme values by setting them to the boundaries defined by quantiles.

3. **Mean/Median Imputation**: Replacing outliers with the mean or median of the feature.

4. **Not Handling Outliers**: In some cases, outliers may carry important information or have minimal impact, so it's better to leave them as they are.

## 1. Outlier Removal

In [None]:
def remove_outliers(df):
    """Drop every row holding an IQR outlier in any numeric column.

    A value is an outlier when it lies below Q1 - 1.5 * IQR or above
    Q3 + 1.5 * IQR of its column. The quartiles are computed from the numeric
    columns of ``df`` itself, so the function does not depend on any
    notebook-level variable.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; not modified.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` without outliers, all columns preserved.
    """
    numeric = df.select_dtypes(include=['number'])

    q1 = numeric.quantile(0.25)
    q3 = numeric.quantile(0.75)
    iqr = q3 - q1

    is_outlier = (numeric < (q1 - 1.5 * iqr)) | (numeric > (q3 + 1.5 * iqr))
    return df[~is_outlier.any(axis=1)]

In [None]:
df_remove_outliers = remove_outliers(df_iterative_impute)

In [None]:
for index, col in enumerate(df_remove_outliers.columns):
    column_data = df_remove_outliers[col]
    outliers = count_outliers(column_data)
    print(col,  outliers)

## 2. Quantile-based Flooring and Capping

In [None]:
def quantile_flooring_capping(df, numeric_columns, lower_quantile=0.01, upper_quantile=0.99):
    """Clip the given columns to their lower/upper quantiles.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; not modified.
    numeric_columns : iterable of str
        Names of the columns to clip.
    lower_quantile, upper_quantile : float
        Quantiles used as floor and cap for each column.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which every listed column is limited to the range
        between its two quantiles.
    """
    # Clip a copy so the caller's frame stays available, unmodified, for the
    # other outlier-handling strategies.
    capped = df.copy()
    for col in numeric_columns:
        lower_bound = capped[col].quantile(lower_quantile)
        upper_bound = capped[col].quantile(upper_quantile)
        capped[col] = capped[col].clip(lower=lower_bound, upper=upper_bound)

    return capped

In [None]:
df_quantile_flooring_capping = quantile_flooring_capping(df_iterative_impute,numeric_data.columns,0.01,0.99)

In [None]:
for index, col in enumerate(df_quantile_flooring_capping.columns):
    column_data = df_quantile_flooring_capping[col]
    outliers = count_outliers(column_data)
    print(col,  outliers)

## 3. Mean/Median Imputation

In [None]:
def impute_outliers_with_mean_median(df, numeric_columns, method):
    """Replace IQR outliers in the given columns with the column mean or median.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; not modified.
    numeric_columns : iterable of str
        Columns to process. Non-numeric columns are skipped.
    method : str
        'mean' or 'median': the statistic (computed on the column including
        its outliers) that replaces each outlier.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with outliers replaced.

    Raises
    ------
    ValueError
        If ``method`` is neither 'mean' nor 'median'.
    """
    # Validate up front so a wrong method is reported even when the data
    # happens to contain no outliers.
    if method not in ('mean', 'median'):
        raise ValueError("Method should be either 'mean' or 'median'")

    # Operate on a copy so the caller's frame can be reused for other strategies.
    result = df.copy()
    for col in numeric_columns:
        values = result[col]
        if not pd.api.types.is_numeric_dtype(values):
            continue

        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        is_outlier = (values < q1 - 1.5 * iqr) | (values > q3 + 1.5 * iqr)
        if not is_outlier.any():
            continue

        impute_value = values.mean() if method == 'mean' else values.median()
        # Replace outliers with the imputation value
        result.loc[is_outlier, col] = impute_value
    return result

In [None]:
df_impute_outliers_with_mean = impute_outliers_with_mean_median(df_iterative_impute,numeric_data.columns,'mean')

In [None]:
for index, col in enumerate(df_impute_outliers_with_mean.columns):
    column_data = df_impute_outliers_with_mean[col]
    outliers = count_outliers(column_data)
    print(col,  outliers)

In [None]:
df_impute_outliers_with_median = impute_outliers_with_mean_median(df_iterative_impute,numeric_data.columns,'median')

In [None]:
for index, col in enumerate(df_impute_outliers_with_median.columns):
    column_data = df_impute_outliers_with_median[col]
    outlier_count = count_outliers(column_data)
    print(col, outlier_count)

# Data Transformation

## Upsampling

In [None]:
df_featured['date'] = pd.to_datetime(df_featured['date'], errors='coerce')
df_featured.set_index('date', inplace=True)

columns_to_interpolate = ['AverageTemperature', 'AverageTemperatureUncertainty']

original_index = df_featured.index

In [None]:
upsampled = df_featured[columns_to_interpolate].resample('D').mean()
interpolated_poly = upsampled.interpolate(method='spline', order=2)

combined_index = original_index.union(interpolated_poly.index)

In [None]:
# Start from an empty frame indexed by the union of the original and daily dates
df_combined = pd.DataFrame(index=combined_index)

# Attach the original records to the dates they belong to
df_combined = df_combined.join(df_featured)

# Attach the interpolated daily values; overlapping column names get the
# '_interpolated' suffix
df_combined = df_combined.join(interpolated_poly, how='left', rsuffix='_interpolated')

# Dates that were not in the original index carry no measured values
df_combined.loc[df_combined.index.difference(original_index), columns_to_interpolate] = np.nan

# fillna aligns a DataFrame argument on column labels, so the suffixed columns
# must be renamed back to the original names; otherwise no label matches and
# nothing is filled.
interpolated_columns = df_combined.filter(like='_interpolated')
fill_values = interpolated_columns.rename(columns=lambda name: name.replace('_interpolated', ''))
df_combined[columns_to_interpolate] = df_combined[columns_to_interpolate].fillna(fill_values)

# The helper columns are no longer needed once their values have been merged in
df_combined.drop(columns=interpolated_columns.columns, inplace=True)

# Keep a single row per date (the first one encountered)
df_combined = df_combined[~df_combined.index.duplicated(keep='first')]

In [None]:
plt.figure(figsize=(20,4))
interpolated_poly.plot()
plt.show()

## Check Stationarity

In [None]:
def check_stationarity(series):
    """Plot rolling statistics and print the Augmented Dickey-Fuller test results.

    Parameters
    ----------
    series : pandas.Series
        The time series to examine. Missing values are dropped before the
        analysis because the ADF test cannot handle them.

    Returns
    -------
    None
        The function only plots and prints. It returns early, after printing
        a message, when the series has at most one distinct non-missing value
        (constant, empty or entirely missing).
    """
    clean_series = series.dropna()

    # A series with no variation cannot be tested; '<= 1' also covers an
    # empty or all-missing series.
    if clean_series.nunique() <= 1:
        print("The series is constant and cannot be used for the ADF test.")
        return

    # Rolling statistics over a 12-observation window
    rolling_mean = clean_series.rolling(window=12).mean()
    rolling_std = clean_series.rolling(window=12).std()

    # Plot rolling statistics
    plt.figure(figsize=(10, 6))
    plt.plot(clean_series, color='blue', label='Original')
    plt.plot(rolling_mean, color='red', label='Rolling Mean')
    plt.plot(rolling_std, color='black', label='Rolling Std')
    plt.legend(loc='best')
    plt.title('Rolling Mean & Standard Deviation')
    plt.show()

    # Perform ADF Test; the lag length is selected automatically by AIC
    adf_test = adfuller(clean_series, autolag='AIC')

    print('Results of Dickey-Fuller Test:')
    # The first four entries of the result are the test statistic, p-value,
    # number of lags used and number of observations used
    df_output = pd.Series(adf_test[0:4], index=['Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used'])
    # Entry 4 maps each significance level to its critical value
    for key, value in adf_test[4].items():
        df_output['Critical Value (%s)' % key] = value

    print(df_output)

In [None]:
check_stationarity(df_egy_cleaned['AverageTemperature'])

#### The results of the Dickey-Fuller test indicate that the `AverageTemperature` time series is stationary.

## Moving Average Smoothing : tail-rolling average transform as data Preparation

In [None]:
df = df_egy[['date','AverageTemperature']].set_index('date')
series = df.squeeze()

In [None]:
# tail-rolling average transform
rolling = series.rolling(window=3)
rolling_mean = rolling.mean()
print(rolling_mean.head(10))
# plot original and transformed dataset
series.plot(label='original')
rolling_mean.plot(color='red',label='transformed')
plt.legend()
plt.show()
# zoomed plot original and transformed dataset
series[:100].plot(label='original')
rolling_mean[:100].plot(color='red',label='transformed')
plt.legend()
plt.show()

In [None]:
plt.figure(figsize=(20, 15))
for n, feature in enumerate(numeric_data):
    plt.subplot(4, 2, n + 1)
    sns.kdeplot(data=df_knn_impute, x=feature)
    plt.title(f' Density of {feature}')
    plt.xlabel(feature)
plt.tight_layout()
plt.show()

In [None]:
from sklearn.preprocessing import QuantileTransformer

# Map every numeric feature onto a normal-shaped distribution through its
# empirical quantiles.
quantile = QuantileTransformer(output_distribution='normal')
numeric_frame = df_remove_outliers[numeric_data.columns]
data_trans = quantile.fit_transform(numeric_frame)

# Histogram of the transformed data (one colour per feature column).
plt.hist(data_trans, bins=25)
plt.show()

In [None]:
# Wrap the transformed array in a DataFrame and keep the feature names:
# `data_trans` was produced from `df_remove_outliers[numeric_data.columns]`,
# so its columns are in that same order. Without the names the density
# plots below would be titled 0, 1, 2, ...
# NOTE: this rebinds the notebook-level name `df`.
df = pd.DataFrame(data_trans, columns=list(numeric_data.columns))

In [None]:
# Preview the first rows of the quantile-transformed frame.
df.head()

In [None]:
# Density of every column of the transformed frame on a 4x2 grid.
plt.figure(figsize=(20, 15))
for n, feature in enumerate(df.columns):
    plt.subplot(4, 2, n + 1)
    sns.kdeplot(x=feature, data=df)
    plt.xlabel(feature)
    plt.title(f' Density of {feature}')
plt.tight_layout()
plt.show()

In [None]:
# Line plot of the cleaned average temperature against its date column.
dates = df_egy_cleaned['date']
temperatures = df_egy_cleaned['AverageTemperature']

plt.figure(figsize=(35, 6))
plt.plot(dates, temperatures, color='blue', label='Average Temperature')
plt.title('Time Series of Average Temperature')
plt.xlabel('Date')
plt.ylabel('Average Temperature')
plt.legend()
plt.show()


# Clustering to find cities with similar attributes

In [None]:
# Keep only the columns used for clustering by dropping the identifying
# and location columns.
non_feature_columns = ['Country', 'date', 'Longitude', 'Latitude', 'City']
df_egy_cluster = df_egy.drop(columns=non_feature_columns)

In [None]:
# Remove rows with missing clustering features.
df_egy_cluster.dropna(inplace=True)

# Build the labelled copy from exactly the same rows: drop only on the
# clustering columns. Dropping on every column (as before) could remove
# extra rows and leave this frame shorter than `df_egy_cluster`, so cluster
# labels computed on one could not be assigned to the other.
df_egy_copy = df_egy.dropna(subset=list(df_egy_cluster.columns)).copy()
df_egy_copy.info()

In [None]:
# Summarise dtypes and non-null counts of the clustering features.
df_egy_cluster.info()

In [None]:
# Candidate feature-scaling strategies to compare.
scalers = {
    'StandardScaler': StandardScaler(),
    'MinMaxScaler': MinMaxScaler(),
    'RobustScaler': RobustScaler(),
}

# scaler name -> list of KMeans inertia values for k = 1..10
inertia_dict = {}

for scaler_name, scaler in scalers.items():
    df_scaled = scaler.fit_transform(df_egy_cluster)

    # Elbow method: record the within-cluster sum of squares for each k.
    inertia = []
    for i in range(1, 11):
        kmeans = KMeans(n_clusters=i, random_state=42)
        kmeans.fit(df_scaled)
        inertia.append(kmeans.inertia_)
    inertia_dict[scaler_name] = inertia

    # One elbow curve per scaler on the shared axes.
    plt.plot(range(1, 11), inertia_dict[scaler_name], marker='o', label=scaler_name)

plt.xlabel('Number of Clusters (k)')
plt.ylabel('Inertia')
plt.title('Elbow Method for Optimal k')
plt.legend()
plt.show()

In [None]:
X = df_egy_cluster

scalers = {
    'StandardScaler': StandardScaler(),
    'MinMaxScaler': MinMaxScaler(),
    'RobustScaler': RobustScaler()
}

# Range of k (number of clusters) to try
k_values = range(2, 5)  # You can adjust the range as needed

# (scaler name, k) -> silhouette score
results = {}

for scaler_name, scaler in scalers.items():
    preferred_scaler = scaler

    # Scaling does not depend on k, so do it once per scaler instead of
    # refitting the scaler inside the inner loop.
    try:
        X_scaled = preferred_scaler.fit_transform(X)
    except ValueError as e:
        print(f'Error for Scaler: {scaler_name}: {e}')
        continue

    for k in k_values:
        try:
            model = KMeans(n_clusters=k, random_state=42)
            labels = model.fit_predict(X_scaled)
            score = silhouette_score(X_scaled, labels)
        except ValueError as e:
            print(f'Error for Scaler: {scaler_name}, Clusters: {k}: {e}')
            continue

        results[(scaler_name, k)] = score
        # The compared dimension is the scaler; the earlier "Linkage" wording
        # did not describe anything this search varies.
        print(f'Scaler: {scaler_name}, Clusters: {k}, Silhouette Score: {score:.4f}')

# max() raises on an empty mapping, so only report a winner when at least
# one combination was scored.
if results:
    best_params = max(results, key=results.get)
    best_score = results[best_params]
    print(f'\nBest combination: Scaler={best_params[0]}, Clusters={best_params[1]} with Silhouette Score={best_score:.4f}')
else:
    print('\nNo scaler/cluster combination produced a silhouette score.')

In [None]:
# Final clustering: 3 clusters on min-max scaled features.
model = KMeans(n_clusters=3, random_state=42)
preferred_scaler = MinMaxScaler()
X_scaled = preferred_scaler.fit_transform(df_egy_cluster)

labels = model.fit_predict(X_scaled)
score = silhouette_score(X_scaled, labels)
print(f'Silhouette Score: {score}')

# Attach the cluster id of every record to the full (unscaled) copy.
df_egy_copy['clusterCities'] = model.labels_

# The scatter shows temperature against its uncertainty, coloured by
# cluster, so the title and axis labels name exactly those quantities.
plt.scatter(
    df_egy_cluster['AverageTemperature'],
    df_egy_cluster['AverageTemperatureUncertainty'],
    c=labels,
    cmap='viridis',
)
plt.title('K-means clusters of Egyptian temperature records')
plt.xlabel('AverageTemperature')
plt.ylabel('AverageTemperatureUncertainty')
plt.show()

In [None]:
# Split the labelled records into one frame per cluster id (0, 1, 2).
aCites, bCites, cCites = (
    df_egy_copy[df_egy_copy['clusterCities'] == cluster_id]
    for cluster_id in (0, 1, 2)
)

In [None]:
# Count how many records each city contributes to cluster 0 (`aCites`).
unique_cities = aCites['City'].value_counts()

# Display the cities and their record counts
print(unique_cities)

In [None]:
import matplotlib.pyplot as plt

# Records per city in cluster 0, drawn as a bar chart.
unique_cities = aCites['City'].value_counts()

plt.figure(figsize=(12, 8))
unique_cities.plot(kind='bar', color='skyblue', edgecolor='black')
plt.xlabel('City')
plt.ylabel('Count')
plt.title('Count of Unique Cities')
# Vertical tick labels keep long city names from overlapping.
plt.xticks(rotation=90)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

In [None]:
# Count how many records each city contributes to cluster 1 (`bCites`).
unique_cities = bCites['City'].value_counts()

# Display the cities and their record counts
print(unique_cities)

In [None]:
# Count how many records each city contributes to cluster 2 (`cCites`).
unique_cities = cCites['City'].value_counts()

# Display the cities and their record counts
print(unique_cities)

# Forecasting Cluster 1

In [None]:
# Parse the date column so it can serve as a DatetimeIndex.
df_egy['dt'] = pd.to_datetime(df_egy['dt'])

# Alexandria records indexed by date. The chain builds a new frame rather
# than calling set_index(inplace=True) on a filtered slice, which pandas
# flags with SettingWithCopyWarning.
# NOTE(review): missing values become 0, which the plots and models below
# treat as real readings - confirm this is intended.
c0 = df_egy[df_egy['City'] == 'Alexandria'].set_index('dt').fillna(0)

In [None]:
from datetime import datetime, timedelta

# Analysis window for Alexandria.
start_date = datetime(1810, 5, 1)
end_date = datetime(2013, 8, 1)

# Restrict to the window and keep only the temperature column.
dropped_columns = ['City', 'Country', 'Longitude', 'Latitude', 'AverageTemperatureUncertainty']
lim_df = c0.loc[start_date:end_date].drop(columns=dropped_columns)

In [None]:
# Temperature over time with a faint dashed guide at every January 1st.
plt.figure(figsize=(10, 4))
plt.plot(lim_df)
plt.ylabel('avg temp', fontsize=16)
for year in range(start_date.year, end_date.year):
    year_start = pd.Timestamp(year=year, month=1, day=1)
    plt.axvline(year_start, color='k', linestyle='--', alpha=0.2)

In [None]:
# Autocorrelation of the raw Alexandria series for lags up to 24.
plot_acf(lim_df, lags=24, title='Autocorrelation')

In [None]:
# Partial autocorrelation of the raw Alexandria series for lags up to 24.
plot_pacf(lim_df, lags=24, title='partial correlation')

In [None]:
# Difference each value against the one two rows earlier (periods=2, despite
# the name "first_diff"); the first two rows are NaN and are sliced off.
first_diff = lim_df.diff(periods=2)[2:]

In [None]:
# Display the differenced series.
first_diff

In [None]:
# Differenced series with a faint dashed guide at every January 1st.
plt.figure(figsize=(10, 4))
plt.plot(first_diff)
plt.title('diffrence of temp', fontsize=20)
for year in range(start_date.year, end_date.year):
    year_start = pd.Timestamp(year=year, month=1, day=1)
    plt.axvline(year_start, color='k', linestyle='--', alpha=0.2)

In [None]:
# Autocorrelation of the differenced series for lags up to 30.
plot_acf(first_diff, lags=30, title='Autocorrelation')

In [None]:
# Partial autocorrelation of the differenced series for lags up to 12.
plot_pacf(first_diff, lags=12, title='Partial Autocorrelation ')

In [None]:
# First rows of the differenced series.
first_diff.head()

In [None]:
# Last rows of the differenced series.
first_diff.tail()

In [None]:
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
# Date boundaries of the train/test split.
train1_start=datetime(1999,1,1)  # not used by the slices below
train1_end = datetime(2000,1,1)
test1_end = datetime(2013,7,1)

# Training data: everything up to and including train1_end (label slices are inclusive).
train1_data = first_diff[:train1_end]
# Test data starts one day after train1_end (timedelta, not relativedelta)
# so the boundary row does not appear in both sets.
test1_data = first_diff[train1_end + timedelta(days=1):test1_end]

In [None]:
# First rows of the training slice.
train1_data.head()

In [None]:
# First rows of the test slice.
test1_data.head()

In [None]:
from statsmodels.tsa.statespace.sarimax import SARIMAX

# SARIMA on the training slice: non-seasonal (p, d, q) = (2, 0, 2) and
# seasonal (P, D, Q, S) = (1, 1, 1, 12), i.e. a 12-period season.
sarima_spec1 = SARIMAX(train1_data, order=(2, 0, 2), seasonal_order=(1, 1, 1, 12))
sarima_model1 = sarima_spec1.fit()

In [None]:
# Prediction window = first and last timestamps of the test slice.
test1_index = test1_data.index
pred1_start_date = test1_index[0]
pred1_end_date = test1_index[-1]

In [None]:
# Predict over the test window, then subtract the predictions from the
# observed differenced temperatures to obtain the residuals.
predictions = sarima_model1.predict(start=pred1_start_date, end=pred1_end_date)
observed1 = test1_data['AverageTemperature']
residuals = observed1 - predictions

In [None]:
# Report the RMSE of the residuals, then overlay predictions on the test data.
rmse1 = np.sqrt(np.mean(residuals**2))
print('Root Mean Squared Error:', rmse1)

plt.figure(figsize=(10, 4))
for curve in (test1_data, predictions):
    plt.plot(curve)
plt.legend(('Data', 'Predictions'), fontsize=16)
plt.title('First Difference ', fontsize=20)
plt.ylabel('avg Temp', fontsize=16)

In [None]:
from statsmodels.tsa.statespace.sarimax import SARIMAX

# Refit the same SARIMA(2,0,2)x(1,1,1,12) specification on the test slice;
# this replaces the model previously stored in `sarima_model1`.
sarima_spec1 = SARIMAX(test1_data, order=(2, 0, 2), seasonal_order=(1, 1, 1, 12))
sarima_model1 = sarima_spec1.fit()

In [None]:
import matplotlib.pyplot as plt

# 12-step-ahead forecast with its confidence bounds.
fore_sarima = sarima_model1.get_forecast(steps=12)
predicted_values = fore_sarima.predicted_mean
confidence_intervals = fore_sarima.conf_int()

# 13 month-start dates beginning at the last test timestamp, minus the
# first one, leave the 12 dates used for the forecast axis.
forecast_index = pd.date_range(start=test1_data.index[-1], periods=12+1, freq='MS')[1:]
lower_bound = confidence_intervals.iloc[:, 0]
upper_bound = confidence_intervals.iloc[:, 1]

# Observed data, forecast, and the band between the two bounds.
plt.plot(test1_data.index, test1_data, label='Observed', color='blue')
plt.plot(forecast_index, predicted_values, label='Forecast', color='green')
plt.fill_between(
    forecast_index,
    lower_bound,
    upper_bound,
    color='lightgreen',
    alpha=0.5,
    label='Confidence Interval',
)

plt.title('SARIMA Forecast')
plt.xlabel('Date')
plt.ylabel('Values')
plt.legend()
plt.show()


# Forecasting Cluster 2

In [None]:
# Re-select the Egypt rows from `df`.
# NOTE(review): earlier cells rebind `df` (first to the date-indexed temperature
# frame, later to the quantile-transformed frame) - confirm `df` still holds
# the raw dataset with a 'Country' column when this cell runs.
df_egy=df[df['Country']=='Egypt']

In [None]:
# Aswan records indexed by date. set_index returns a new frame here instead
# of mutating a filtered slice in place, which pandas flags with
# SettingWithCopyWarning.
c1 = df_egy[df_egy['City'] == 'Aswan'].set_index('dt')
c1

In [None]:
# Replace missing values with 0.
# NOTE(review): a 0 in AverageTemperature is treated as a real reading by the
# plots and models below - confirm this is intended.
c1= c1.fillna(0)

In [None]:
# Analysis window for Aswan.
start_date = datetime(1860, 5, 1)
end_date = datetime(2013, 8, 1)

# Restrict to the window and keep only the temperature column.
dropped_columns = ['City', 'Country', 'Longitude', 'Latitude', 'AverageTemperatureUncertainty']
lim1_df = c1.loc[start_date:end_date].drop(columns=dropped_columns)

# Temperature over time with a faint dashed guide at every January 1st.
plt.figure(figsize=(10, 4))
plt.plot(lim1_df)
plt.ylabel('AverageTemperature', fontsize=16)
for year in range(start_date.year, end_date.year):
    year_start = pd.Timestamp(year=year, month=1, day=1)
    plt.axvline(year_start, color='k', linestyle='--', alpha=0.2)

In [None]:
from prophet.plot import plot_components

# Prophet expects a frame with a 'ds' (date) column and a 'y' (value)
# column. `c1` carries the date as its 'dt' index and the value as
# 'AverageTemperature', so reshape it before fitting.
prophet_df = (
    c1.reset_index()[['dt', 'AverageTemperature']]
    .rename(columns={'dt': 'ds', 'AverageTemperature': 'y'})
)

model = Prophet()
model.fit(prophet_df)

# periods=0: no future rows are added, so the prediction covers only the
# historical dates.
future = model.make_future_dataframe(periods=0)
forecast = model.predict(future)

plot_components(model, forecast)
plt.show()

In [None]:
# Last rows of the Aswan frame.
c1.tail()

In [None]:
# Autocorrelation of the raw Aswan series for lags up to 24.
plot_acf(lim1_df, lags=24, title='Autocorrelation')

In [None]:
# Partial autocorrelation of the raw Aswan series for lags up to 24.
plot_pacf(lim1_df, lags=24, title='partialcorrelation')

In [None]:

from numpy import diff

# Difference each value against the one three rows earlier (periods=3); the
# first three rows are NaN and are sliced off.
second_diff = lim1_df.diff(periods=3)[3:]

plt.figure(figsize=(10,4))
plt.plot(second_diff)
plt.title('diffrence of temp', fontsize=20)
# The plotted quantity is a temperature difference, not sales.
plt.ylabel('Temperature difference', fontsize=16)
for year in range(start_date.year,end_date.year):
    plt.axvline(pd.to_datetime(str(year)+'-01-01'), color='k', linestyle='--', alpha=0.2)


In [None]:
# First rows of the windowed Aswan temperature series.
lim1_df.head()

In [None]:
# First rows of the differenced Aswan series.
second_diff.head()

In [None]:
# Last rows of the differenced Aswan series.
second_diff.tail()

In [None]:
# Autocorrelation (30 lags) and partial autocorrelation (24 lags) of the
# differenced Aswan series.
plot_acf(second_diff, lags=30, title='Autocorrelation')
plot_pacf(second_diff, lags=24, title='Partial Autocorrelation ')

In [None]:
from datetime import datetime, timedelta

# Split boundaries: train through train_end, test from the following day
# through test_end.
train_end = datetime(2000, 1, 1)
test_end = datetime(2013, 8, 1)
test_start = train_end + timedelta(days=1)

train_data = second_diff.loc[:train_end]
test_data = second_diff.loc[test_start:test_end]

# The same test window taken from the undifferenced series.
original_test_data = lim1_df.loc[test_start:test_end]

In [None]:
# Prediction window = first and last timestamps of the test slice.
test_index = test_data.index
pred_start_date = test_index[0]
pred_end_date = test_index[-1]

In [None]:
# Autocorrelation and partial autocorrelation of the differenced Aswan
# series, both for lags up to 24.
plot_acf(second_diff, lags=24, title='Autocorrelation')
plot_pacf(second_diff, lags=24, title='Partial Autocorrelation ')

In [None]:
from statsmodels.tsa.statespace.sarimax import SARIMAX # type: ignore

# SARIMA(2,0,2)x(1,1,1,12) on the Aswan training slice. The result is
# stored under the name `sarima_model1`, replacing any earlier model.
sarima_spec2 = SARIMAX(train_data, order=(2, 0, 2), seasonal_order=(1, 1, 1, 12))
sarima_model1 = sarima_spec2.fit()

In [None]:

# Predict over the test window, then subtract the predictions from the
# observed differenced temperatures to obtain the residuals.
predictions = sarima_model1.predict(start=pred_start_date, end=pred_end_date)
observed2 = test_data['AverageTemperature']
residuals = observed2 - predictions

In [None]:
# Overlay predictions on the test data, then report the residual RMSE.
plt.figure(figsize=(10, 4))
for curve in (test_data, predictions):
    plt.plot(curve)
plt.legend(('Data', 'Predictions'), fontsize=16)
plt.ylabel('avg Temp', fontsize=16)
rmse2 = np.sqrt(np.mean(residuals**2))
print('Root Mean Squared Error:', rmse2)

In [None]:
from statsmodels.tsa.statespace.sarimax import SARIMAX # type: ignore

# Refit the same SARIMA(2,0,2)x(1,1,1,12) specification on the Aswan test
# slice; this replaces the model stored in `sarima_model1`.
sarima_spec2 = SARIMAX(test_data, order=(2, 0, 2), seasonal_order=(1, 1, 1, 12))
sarima_model1 = sarima_spec2.fit()

In [None]:
import matplotlib.pyplot as plt

# Forecast the next 12 periods from the model fitted in the previous cell.
# Without this step the plot reuses `predicted_values` and
# `confidence_intervals` left over from the cluster 1 forecast.
fore_sarima = sarima_model1.get_forecast(steps=12)
predicted_values = fore_sarima.predicted_mean
confidence_intervals = fore_sarima.conf_int()

# Get the index for the forecasted period
forecast_index = pd.date_range(start=test_data.index[-1], periods=12+1, freq='MS')[1:]

# Plotting the actual data
plt.plot(test_data.index, test_data, label='Observed', color='blue')

# Plotting the forecasted data
plt.plot(forecast_index, predicted_values, label='Forecast', color='green')

# Plotting the confidence intervals
plt.fill_between(forecast_index,
                 confidence_intervals.iloc[:, 0],
                 confidence_intervals.iloc[:, 1],
                 color='lightgreen', alpha=0.5, label='Confidence Interval')

# Adding labels and legend
plt.xlabel('Date')
plt.ylabel('Values')
plt.title('SARIMA Forecast')
plt.legend()

# Show the plot
plt.show()


# Forecasting Cluster 3

In [None]:
# Cairo records indexed by date, with missing values replaced by 0. The
# chain builds a new frame rather than calling set_index(inplace=True) on a
# filtered slice, which pandas flags with SettingWithCopyWarning.
# NOTE(review): a 0 in AverageTemperature is treated as a real reading by the
# plots and models below - confirm this is intended.
c2 = df_egy[df_egy['City'] == 'Cairo'].set_index('dt').fillna(0)

In [None]:
# First rows of the Cairo frame.
c2.head()

In [None]:
# Analysis window for Cairo.
start_date = datetime(1840, 1, 1)
end_date = datetime(2013, 8, 1)

# Restrict to the window and keep only the temperature column.
dropped_columns = ['City', 'Country', 'Longitude', 'Latitude', 'AverageTemperatureUncertainty']
lim2_df = c2.loc[start_date:end_date].drop(columns=dropped_columns)

# Temperature over time with a faint dashed guide at every January 1st.
plt.figure(figsize=(10, 4))
plt.plot(lim2_df)
plt.ylabel('AverageTemperature', fontsize=16)
for year in range(start_date.year, end_date.year):
    year_start = pd.Timestamp(year=year, month=1, day=1)
    plt.axvline(year_start, color='k', linestyle='--', alpha=0.2)

In [None]:
# Autocorrelation and partial autocorrelation of the raw Cairo series for
# lags up to 24.
plot_acf(lim2_df, lags=24, title='Autocorrelation')
plot_pacf(lim2_df, lags=24, title='partialcorrelation')

In [None]:
from numpy import diff

# Difference each value against the one three rows earlier and discard the
# three leading NaN rows.
third_diff = lim2_df.diff(periods=3).iloc[3:]

# Differenced series with a faint dashed guide at every January 1st.
plt.figure(figsize=(10, 4))
plt.plot(third_diff)
plt.title('diffrence of temp', fontsize=20)
for year in range(start_date.year, end_date.year):
    year_start = pd.Timestamp(year=year, month=1, day=1)
    plt.axvline(year_start, color='k', linestyle='--', alpha=0.2)

In [None]:
# Autocorrelation of the differenced Cairo series for lags up to 24.
plot_acf(third_diff, lags=24, title='Autocorrelation')



# Partial autocorrelation of the differenced Cairo series for lags up to 24.
plot_pacf(third_diff, lags=24, title='partialcorrelation')


In [None]:
from datetime import datetime, timedelta

# Split boundaries: train through train3_end, test from the following day
# through test3_end.
train3_end = datetime(2000, 1, 1)
test3_end = datetime(2013, 8, 1)
test3_start = train3_end + timedelta(days=1)

train3_data = third_diff.loc[:train3_end]
test3_data = third_diff.loc[test3_start:test3_end]

# Prediction window = first and last timestamps of the test slice.
test3_index = test3_data.index
pred3_start_date = test3_index[0]
pred3_end_date = test3_index[-1]

In [None]:
from statsmodels.tsa.statespace.sarimax import SARIMAX # type: ignore

# SARIMA on the Cairo training slice: non-seasonal (p, d, q) = (2, 0, 2)
# and seasonal (P, D, Q, S) = (1, 1, 1, 12).
sarima_spec3 = SARIMAX(train3_data, order=(2, 0, 2), seasonal_order=(1, 1, 1, 12))
sarima_model3 = sarima_spec3.fit()


In [None]:
# Predict over the Cairo test window and compute residuals against the
# Cairo test slice. The residuals were previously taken against `test_data`,
# the Aswan slice from the cluster 2 section, so the reported RMSE compared
# two different cities.
predictions = sarima_model3.predict(start=pred3_start_date, end=pred3_end_date)
residuals = test3_data['AverageTemperature'] - predictions

In [None]:
# Overlay predictions on the Cairo test data, then report the residual RMSE.
plt.figure(figsize=(10, 4))
for curve in (test3_data, predictions):
    plt.plot(curve)
plt.legend(('Data', 'Predictions'), fontsize=16)
plt.ylabel('avg Temp', fontsize=16)
rmse3 = np.sqrt(np.mean(residuals**2))
print('Root Mean Squared Error:', rmse3)

In [None]:
import matplotlib.pyplot as plt

# Forecast the 12 periods that follow the training data, using the model
# fitted on `train3_data`. Without this step the plot reuses
# `predicted_values` and `confidence_intervals` left over from an earlier
# section of the notebook.
fore_sarima = sarima_model3.get_forecast(steps=12)
predicted_values = fore_sarima.predicted_mean
confidence_intervals = fore_sarima.conf_int()

# Get the index for the forecasted period
forecast_index = pd.date_range(start=train3_data.index[-1], periods=12+1, freq='MS')[1:]

# Plotting the actual data
plt.plot(train3_data.index, train3_data, label='Observed', color='blue')

# Plotting the forecasted data
plt.plot(forecast_index, predicted_values, label='Forecast', color='green')

# Plotting the confidence intervals
plt.fill_between(forecast_index,
                 confidence_intervals.iloc[:, 0],
                 confidence_intervals.iloc[:, 1],
                 color='lightgreen', alpha=0.5, label='Confidence Interval')

# Adding labels and legend
plt.xlabel('Date')
plt.ylabel('Values')
plt.title('SARIMA Forecast')
plt.legend()

# Show the plot
plt.show()


In [None]:
from statsmodels.tsa.statespace.sarimax import SARIMAX # type: ignore

# Refit the same SARIMA(2,0,2)x(1,1,1,12) specification on the Cairo test
# slice; this replaces the model stored in `sarima_model3`.
sarima_spec3 = SARIMAX(test3_data, order=(2, 0, 2), seasonal_order=(1, 1, 1, 12))
sarima_model3 = sarima_spec3.fit()


In [None]:
import matplotlib.pyplot as plt

# Forecast the next 12 periods from the model fitted in the previous cell.
# Without this step the plot reuses `predicted_values` and
# `confidence_intervals` left over from an earlier section of the notebook.
fore_sarima = sarima_model3.get_forecast(steps=12)
predicted_values = fore_sarima.predicted_mean
confidence_intervals = fore_sarima.conf_int()

# Get the index for the forecasted period
forecast_index = pd.date_range(start=test3_data.index[-1], periods=12+1, freq='MS')[1:]

# Plotting the actual data
plt.plot(test3_data.index, test3_data, label='Observed', color='blue')

# Plotting the forecasted data
plt.plot(forecast_index, predicted_values, label='Forecast', color='green')

# Plotting the confidence intervals
plt.fill_between(forecast_index,
                 confidence_intervals.iloc[:, 0],
                 confidence_intervals.iloc[:, 1],
                 color='lightgreen', alpha=0.5, label='Confidence Interval')

# Adding labels and legend
plt.xlabel('Date')
plt.ylabel('Values')
plt.title('SARIMA Forecast')
plt.legend()

# Show the plot
plt.show()


# Regression

In [None]:
# Work on a copy so the regression feature engineering does not alter df_egy.
df_regression=df_egy.copy()

In [None]:
# Summarise dtypes and non-null counts before feature engineering.
df_regression.info()

In [None]:
import pandas as pd
import numpy as np


def _signed_coordinate(values):
    """Convert coordinate strings such as '31.35N' or '29.74W' to signed floats.

    The trailing hemisphere letter is removed and the number is negated for
    'S' and 'W', so southern latitudes and western longitudes do not collapse
    onto their northern/eastern counterparts. 'N' and 'E' values are
    returned unchanged.
    """
    magnitude = values.str[:-1].astype(float)
    hemisphere = values.str[-1].str.upper()
    return magnitude.where(~hemisphere.isin(['S', 'W']), -magnitude)


# Convert Latitude and Longitude to signed numeric values
df_regression['Latitude'] = _signed_coordinate(df_regression['Latitude'])
df_regression['Longitude'] = _signed_coordinate(df_regression['Longitude'])

# Parse the date defensively: to_datetime leaves an already-parsed column
# unchanged and makes the .dt accessor valid when 'dt' is still text.
dates = pd.to_datetime(df_regression['dt'])
df_regression['Year'] = dates.dt.year
df_regression['Month'] = dates.dt.month

# Drop rows where AverageTemperature is NaN (or you can impute if necessary)
df_clean = df_regression.dropna(subset=['AverageTemperature'])

# Selecting the features and target variable
X = df_clean[['Longitude', 'Latitude', 'Month']]
y = df_clean['AverageTemperature']

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Hold out 20% of the rows for testing (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit an ordinary least-squares baseline.
model = LinearRegression()
model.fit(X_train, y_train)

# Training-set predictions and metrics.
y_train_pred = model.predict(X_train)
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
train_r2 = r2_score(y_train, y_train_pred)

# Test-set predictions and metrics.
y_test_pred = model.predict(X_test)
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
test_r2 = r2_score(y_test, y_test_pred)

# Report all four figures with four decimals.
for metric_label, metric_value in (
    ("Training Set RMSE", train_rmse),
    ("Training Set R² Score", train_r2),
    ("Testing Set RMSE", test_rmse),
    ("Testing Set R² Score", test_r2),
):
    print(f"{metric_label}: {metric_value:.4f}")

In [None]:
# Random forest with 100 trees (fixed seed) on the same train/test split.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Training-set predictions and metrics.
y_train_pred_rf = rf_model.predict(X_train)
train_rmse_rf = np.sqrt(mean_squared_error(y_train, y_train_pred_rf))
train_r2_rf = r2_score(y_train, y_train_pred_rf)

# Test-set predictions and metrics.
y_test_pred_rf = rf_model.predict(X_test)
test_rmse_rf = np.sqrt(mean_squared_error(y_test, y_test_pred_rf))
test_r2_rf = r2_score(y_test, y_test_pred_rf)

# Report all four figures with four decimals.
for metric_label, metric_value in (
    ("Training Set RMSE", train_rmse_rf),
    ("Training Set R² Score", train_r2_rf),
    ("Testing Set RMSE", test_rmse_rf),
    ("Testing Set R² Score", test_r2_rf),
):
    print(f"Random Forest - {metric_label}: {metric_value:.4f}")

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.pipeline import Pipeline


# Split the data (assuming X and y are defined)
# 80% train / 20% test with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define models and their hyperparameter grids
# Each entry maps a display name to the estimator ('model') and the grid
# of hyperparameters ('params') that GridSearchCV will explore.
models = {
    # No hyperparameters to tune: the empty grid yields a single fit.
    'LinearRegression': {
        'model': LinearRegression(),
        'params': {}
    },
    'Lasso': {
        'model': Lasso(),
        'params': {
            'alpha': [0.01, 0.1, 1, 10]
        }
    },
    # Polynomial feature expansion followed by linear regression; the
    # 'poly__degree' key addresses the 'poly' step of the pipeline.
    'PolynomialRegression': {
        'model': Pipeline([
            ('poly', PolynomialFeatures()),
            ('linear', LinearRegression())
        ]),
        'params': {
            'poly__degree': [2, 3, 4]
        }
    },
    'RandomForest': {
        'model': RandomForestRegressor(random_state=42),
        'params': {
            'n_estimators': [50, 100, 150],
            'max_depth': [10, 20, 30]
        }
    },
    'GradientBoosting': {
        'model': GradientBoostingRegressor(random_state=42),
        'params': {
            'n_estimators': [50, 100, 150],
            'learning_rate': [0.01, 0.05, 0.1],
            'max_depth': [3, 5, 7]
        }
    },
    'AdaBoost': {
        'model': AdaBoostRegressor(random_state=42),
        'params': {
            'n_estimators': [50, 100, 150],
            'learning_rate': [0.01, 0.1, 1.0]
        }
    },
}

# Store the results for each model
# Filled by evaluate_model: name -> best estimator plus train/test R².
results = {}

# Function to train model and calculate R² for train and test sets
def evaluate_model(model_name, model, params):
    """Grid-search *model* and record its R² on the train and test splits.

    Runs a 5-fold, R²-scored grid search over *params* on the notebook-level
    ``X_train``/``y_train``, scores the best estimator on both the training
    and test splits, stores the outcome in the notebook-level ``results``
    dict under *model_name*, and prints a one-line summary.

    Args:
        model_name: Key under which the outcome is stored and printed.
        model: Estimator (or pipeline) passed to ``GridSearchCV``.
        params: Hyperparameter grid for the search.
    """
    search = GridSearchCV(model, params, cv=5, scoring='r2', n_jobs=-1)
    search.fit(X_train, y_train)
    tuned_model = search.best_estimator_

    # Score the refitted best estimator on both splits.
    score_on_train = r2_score(y_train, tuned_model.predict(X_train))
    score_on_test = r2_score(y_test, tuned_model.predict(X_test))

    results[model_name] = dict(
        best_model=tuned_model,
        train_r2=score_on_train,
        test_r2=score_on_test,
    )

    print(f"{model_name} - Best Params: {search.best_params_}, Train R²: {score_on_train:.4f}, Test R²: {score_on_test:.4f}")

# Iterate over each model, tune, and evaluate
for model_name, model_info in models.items():
    evaluate_model(model_name, model_info['model'], model_info['params'])

# Collect the scores in a single, consistent model order.
model_names = list(results.keys())
train_r2_scores = [results[name]['train_r2'] for name in model_names]
test_r2_scores = [results[name]['test_r2'] for name in model_names]

# Draw train and test R² as side-by-side bars. Stacking them end to end
# (test bar starting where the train bar stops) made each bar's length the
# sum of two R² values, so neither score could be read off the axis.
bar_height = 0.4
positions = np.arange(len(model_names))

plt.figure(figsize=(10, 6))
plt.barh(positions - bar_height / 2, train_r2_scores, height=bar_height, color='skyblue', label='Train R²')
plt.barh(positions + bar_height / 2, test_r2_scores, height=bar_height, color='salmon', label='Test R²')
plt.yticks(positions, model_names)
plt.xlabel('R² Score')
plt.title('Train and Test R² Scores for Different Models')
plt.legend()
plt.show()

# Find and print the best model based on test R²
# NOTE(review): choosing the winner on the test split means the reported
# test R² is no longer an unbiased estimate - consider selecting on the
# cross-validation score instead.
best_model_name = max(results, key=lambda name: results[name]['test_r2'])
best_model = results[best_model_name]['best_model']
best_test_r2 = results[best_model_name]['test_r2']

print(f"\nBest Model: {best_model_name} with Test R² Score: {best_test_r2:.4f}")