# Weather Data Analysis 
This project analyzes daily weather temperatures. The dataset contains daily weather data recorded at the capital city (located by its latitude and longitude) of each of 194 countries.


### Data Loading And Overview

In [None]:
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import warnings
warnings.filterwarnings('ignore')

# Read the daily weather CSV into a DataFrame
weather = pd.read_csv(filepath_or_buffer="daily_weather_data.csv")

# Preview the first five records
weather.head(n=5)

In [None]:
# Preview the final five records
weather.tail(n=5)

In [None]:
# Summarize the DataFrame's structure (column names, dtypes, non-null counts)
weather.info()

### Identifying Missing Values and Data Cleaning

In [None]:
# Count the missing entries in each column (isna is the canonical spelling of isnull)
weather.isna().sum()

"The convention of the weather and climate community has been to calculate the observed daily *mean* temperature by summing the maximum and minimum instantaneous temperatures during a 24-hour period and dividing by two"

### Converting Data Types

In [None]:
# Parse the 'date' column into pandas datetime values
weather["date"] = weather["date"].pipe(pd.to_datetime)

In [None]:
# Clean the three temperature columns in one non-mutating chain.
#
# Why not `weather['tavg'].fillna(..., inplace=True)`? That mutates a column
# *selection* (chained assignment). Under pandas Copy-on-Write the selection
# is a separate object, so `weather` is never updated, and the warning that
# would reveal this is hidden by the notebook's global warnings filter.
# Building the result with `.assign` works the same on every pandas version
# and keeps this cell safe to re-run.
weather = (
    weather
    # Keep rows with at least two of the three readings present, i.e. drop
    # rows where more than one temperature value is missing.
    .dropna(subset=['tavg', 'tmin', 'tmax'], thresh=2)
    # Missing daily mean: midpoint of the daily minimum and maximum.
    .assign(tavg=lambda df: df['tavg'].fillna((df['tmin'] + df['tmax']) / 2))
    # Missing minimum / maximum: invert tavg = (tmin + tmax) / 2.
    # Each step sees the columns already filled by the previous step.
    .assign(tmin=lambda df: df['tmin'].fillna(2 * df['tavg'] - df['tmax']))
    .assign(tmax=lambda df: df['tmax'].fillna(2 * df['tavg'] - df['tmin']))
)

# Display the final dataset
weather

### Check For Duplicated Rows

In [None]:
# Number of rows that exactly repeat an earlier row (first occurrence is not counted)
weather.duplicated(keep="first").sum()

# Exploratory Data Analysis (EDA)

### Descriptive Statistics

In [None]:
# Summary statistics of the numeric columns, rounded to whole numbers
weather.describe().round(decimals=0)

In [None]:
# Pairwise correlation between the numeric columns.
# The frame also holds non-numeric columns (e.g. the datetime 'date' column);
# since pandas 2.0 `DataFrame.corr()` no longer drops those silently and
# raises instead, so the numeric columns are selected explicitly first.
cor = weather.select_dtypes(include="number").corr()

# Visualization of the relationship between columns
sns.heatmap(cor, annot=True);

The correlation heatmap shows that the strongest relationships are among the recorded temperature columns: the minimum (tmin), maximum (tmax), and average (tavg) daily temperature.

### Distribution Visualization

In [None]:
# Box plots comparing the distributions of the three temperature columns
plt.figure(figsize=(10, 6))
plt.boxplot(
    [weather[column] for column in ('tmax', 'tmin', 'tavg')],
    labels=['tmax', 'tmin', 'tavg'],
)
# Title and axis labels are applied to the current axes in a single call
plt.gca().set(
    title='Boxplot of Temperature Data',
    xlabel='Temperature Type',
    ylabel='Temperature (°C)',
)
plt.show()

This visualization of the temperature distributions shows that the maximum daily temperature (tmax) suffers the most from outliers. Therefore, it is worth looking into.

### Potential Outliers

In [None]:
#Display temperature outliers

def find_outliers_IQR(weather, k=1.5):
    """Return the entries of ``weather`` that lie outside the IQR fences.

    The fences are ``Q1 - k * IQR`` and ``Q3 + k * IQR``, where Q1 and Q3 are
    the 25th and 75th percentiles and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    weather : pandas.Series
        Values to screen. The notebook passes a single temperature column;
        the name is kept for backward compatibility with existing calls.
    k : float, default 1.5
        Fence multiplier. The default reproduces the original hard-coded
        1.5 x IQR rule; larger values flag fewer points.

    Returns
    -------
    pandas.Series
        The values strictly below the lower fence or strictly above the upper
        fence, keeping their original index labels.
    """
    q1 = weather.quantile(0.25)
    q3 = weather.quantile(0.75)
    iqr = q3 - q1
    lower_fence = q1 - k * iqr
    upper_fence = q3 + k * iqr
    return weather[(weather < lower_fence) | (weather > upper_fence)]

# Flag the average-temperature readings that fall outside the IQR fences
outliers = find_outliers_IQR(weather["tavg"])

# Report how many readings were flagged and the extremes among them
print(
    "number of outliers: " + str(len(outliers)),
    "max outlier value: " + str(outliers.max()),
    "min outlier value: " + str(outliers.min()),
    sep="\n",
)

outliers

In [None]:
# Flag the minimum-temperature readings that fall outside the IQR fences
outliers = find_outliers_IQR(weather["tmin"])

# Report how many readings were flagged and the extremes among them
print(
    "number of outliers: " + str(len(outliers)),
    "max outlier value: " + str(outliers.max()),
    "min outlier value: " + str(outliers.min()),
    sep="\n",
)

outliers

In [None]:
# Flag the maximum-temperature readings that fall outside the IQR fences
outliers = find_outliers_IQR(weather["tmax"])

# Report how many readings were flagged and the extremes among them
print(
    "number of outliers: " + str(len(outliers)),
    "max outlier value: " + str(outliers.max()),
    "min outlier value: " + str(outliers.min()),
    sep="\n",
)

outliers

# Data Visualization

### Forecasting - Prophet 

In [None]:
from prophet import Prophet

# Prophet fits a single time series supplied as two columns: 'ds' (timestamp)
# and 'y' (value). `weather` stacks the daily readings of many capitals, so
# each date appears once per location. Fitting that directly mixes all
# locations into one cloud of points and passes unrelated columns along.
# Collapse to one observation per date first: the mean of 'tavg' across all
# locations, i.e. a global daily average.
# NOTE(review): use a per-country filter instead of the mean if a
# location-specific forecast is wanted.
temperature_data = (
    weather
    .groupby('date', as_index=False)['tavg']
    .mean()
    .rename(columns={'date': 'ds', 'tavg': 'y'})
)

# Initializing the Prophet model
model = Prophet()

# Fitting the model to the aggregated daily series
model.fit(temperature_data)

# Creating a dataframe with future dates for prediction
future = model.make_future_dataframe(periods=365)  # Predicting one year into the future

# Temperature predictions for the historical and future dates
forecast = model.predict(future)

# Visualizing the forecast
fig = model.plot(forecast)

The Prophet forecast shows no significant change in the average daily temperature over the following year (365 days).