## Import

In [486]:
import os
import sys

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Exploring your data

### Read raw data from file

In [487]:
raw_df = pd.read_csv("../data/crawl/raw_data.csv")
raw_df.head()

Unnamed: 0,datetime,temp (K),feels_like,pressure,humidity,temp_min,temp_max,wind_speed,wind_deg,clouds_all,id_weatrher,main_weatrher,description_weatrher,icon_weatrher,rain_1h,wind_gust
0,1670605000.0,299.16,299.16,1010.0,94.0,299.16,299.16,1.03,0.0,40.0,802.0,Clouds,scattered clouds,03n,,
1,1670609000.0,299.16,299.16,1009.0,94.0,299.16,299.16,1.03,0.0,40.0,802.0,Clouds,scattered clouds,03n,,
2,1670612000.0,298.16,299.33,1009.0,100.0,298.16,298.16,1.03,0.0,40.0,802.0,Clouds,scattered clouds,03n,,
3,1670616000.0,298.16,298.88,1008.0,83.0,298.16,298.16,1.03,20.0,40.0,802.0,Clouds,scattered clouds,03n,,
4,1670620000.0,298.16,298.88,1008.0,83.0,298.16,298.16,1.03,50.0,40.0,802.0,Clouds,scattered clouds,03n,,


### Checking shape

In [488]:
shape = raw_df.shape

In [489]:
print(f"Shape: {shape}")

if shape[0] > 1000:
    print(f"Data is qualified.")
else:
    print(f"Data isn't qualified")

Shape: (9720, 16)
Data is qualified.


### Dealing with duplicates

In [490]:
index = raw_df.index
detectDupSeries = index.duplicated(keep='first')
num_duplicated_rows = detectDupSeries.sum()

In [491]:
if num_duplicated_rows == 0:
    print(f"No duplicated line.")
else:
    if num_duplicated_rows > 1:
        ext = "lines"
    else:
        ext = "line"
    print(f"{num_duplicated_rows} duplicated " + ext + ". De-deduplicating the raw data.")

No duplicated line.


In [492]:
# De-deduplicate
raw_df = raw_df[~detectDupSeries]
post_deduplication_count = raw_df.index.duplicated(keep='first').sum()
assert post_deduplication_count == 0, "Still have duplicated rows after de-duplication."

In [493]:
# Converting 'datetime' from Unix timestamp to readable datetime format
raw_df['datetime'] = pd.to_datetime(raw_df['datetime'], unit='s')

# Renaming columns
raw_df = raw_df.rename(columns={
    'id_weatrher': 'id_weather',
    'main_weatrher': 'main_weather',
    'description_weatrher': 'description_weather',
    'icon_weatrher': 'icon_weather'
})

missing_values = raw_df.isnull().sum()

data_types = raw_df.dtypes

missing_values, data_types

(datetime               1008
 temp (K)               1008
 feels_like             1008
 pressure               1008
 humidity               1008
 temp_min               1008
 temp_max               1008
 wind_speed             1008
 wind_deg               1008
 clouds_all             1008
 id_weather              945
 main_weather            945
 description_weather     945
 icon_weather            945
 rain_1h                8794
 wind_gust              9605
 dtype: int64,
 datetime               datetime64[ns]
 temp (K)                      float64
 feels_like                    float64
 pressure                      float64
 humidity                      float64
 temp_min                      float64
 temp_max                      float64
 wind_speed                    float64
 wind_deg                      float64
 clouds_all                    float64
 id_weather                    float64
 main_weather                   object
 description_weather            object
 icon_weather 

### Checking distribution

For columns with numeric data types:
- Percentage (from 0 to 100) of missing values
- The min
- The lower quartile (phân vị 25)
- The median (phân vị 50)
- The upper quartile (phân vị 75)
- The max

For viewing, using `.round(1)`.

In [494]:
def numeric_column_info(df: pd.DataFrame) -> pd.DataFrame:
    numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns

    num_col_info = {}
    for col in numeric_columns:
        num_col_info[col] = [
            df[col].isnull().mean() * 100,  # Missing ratio
            df[col].min(),                  # Min
            df[col].quantile(0.25),         # Lower quartile
            df[col].median(),               # Median
            df[col].quantile(0.75),         # Upper quartile
            df[col].max()                   # Max
        ]

    # Creating the DataFrame
    num_col_info_df = pd.DataFrame(num_col_info, index=["missing_ratio", "min", "lower_quartile", "median", "upper_quartile", "max"])

    # Rounding values to 1 decimal place
    num_col_info_df = num_col_info_df.round(1)

    return num_col_info_df

In [495]:
num_col_info_df = numeric_column_info(raw_df)

num_col_info_df

Unnamed: 0,temp (K),feels_like,pressure,humidity,temp_min,temp_max,wind_speed,wind_deg,clouds_all,id_weather,rain_1h,wind_gust
missing_ratio,10.4,10.4,10.4,10.4,10.4,10.4,10.4,10.4,10.4,9.7,90.5,98.8
min,292.2,292.4,1001.0,14.0,292.2,292.2,0.0,0.0,0.0,200.0,0.1,1.9
lower_quartile,299.2,299.3,1008.0,69.0,299.2,299.2,1.5,10.0,40.0,801.0,0.2,5.3
median,301.2,305.8,1010.0,84.0,301.2,301.2,2.6,160.0,40.0,802.0,0.8,9.8
upper_quartile,304.2,309.6,1011.0,94.0,304.2,304.2,4.1,250.0,75.0,802.0,2.2,12.9
max,311.2,317.2,1018.0,100.0,311.2,311.2,11.3,360.0,100.0,804.0,36.5,20.6


In [497]:
dict(num_col_info_df.iloc[0])

{'temp (K)': 10.4,
 'feels_like': 10.4,
 'pressure': 10.4,
 'humidity': 10.4,
 'temp_min': 10.4,
 'temp_max': 10.4,
 'wind_speed': 10.4,
 'wind_deg': 10.4,
 'clouds_all': 10.4,
 'id_weather': 9.7,
 'rain_1h': 90.5,
 'wind_gust': 98.8}

In [498]:
def drop_missing_features(df: pd.DataFrame, missing_lst=None, threshold: float = 75.0) -> pd.DataFrame:

    if (df is None) or (missing_lst is None) or (threshold is None):
        print(f"Invalid.")
        raise ValueError
    
    df_cp = df.copy()
    
    cols_to_trim = []
    
    for key, value in missing_lst.items():
        if float(value) > threshold:
            cols_to_trim.append(key)
            
    if len(cols_to_trim) > 0:
        df_cp = df_cp.drop(columns=cols_to_trim)
        print("Dropped column(s): " + " ".join(cols_to_trim))
    else:
        print("Have no column(s) to trim.")
        
    return df_cp

In [499]:
raw_df = drop_missing_features(raw_df, dict(num_col_info_df.iloc[0]))
# Drop columns with no dates
raw_df.dropna(subset=['datetime'], inplace=True)

Dropped column(s): rain_1h wind_gust


In [500]:
raw_df.head()

Unnamed: 0,datetime,temp (K),feels_like,pressure,humidity,temp_min,temp_max,wind_speed,wind_deg,clouds_all,id_weather,main_weather,description_weather,icon_weather
0,2022-12-09 17:00:00,299.16,299.16,1010.0,94.0,299.16,299.16,1.03,0.0,40.0,802.0,Clouds,scattered clouds,03n
1,2022-12-09 18:00:00,299.16,299.16,1009.0,94.0,299.16,299.16,1.03,0.0,40.0,802.0,Clouds,scattered clouds,03n
2,2022-12-09 19:00:00,298.16,299.33,1009.0,100.0,298.16,298.16,1.03,0.0,40.0,802.0,Clouds,scattered clouds,03n
3,2022-12-09 20:00:00,298.16,298.88,1008.0,83.0,298.16,298.16,1.03,20.0,40.0,802.0,Clouds,scattered clouds,03n
4,2022-12-09 21:00:00,298.16,298.88,1008.0,83.0,298.16,298.16,1.03,50.0,40.0,802.0,Clouds,scattered clouds,03n


In [501]:
num_col_info_df = numeric_column_info(raw_df)

num_col_info_df

Unnamed: 0,temp (K),feels_like,pressure,humidity,temp_min,temp_max,wind_speed,wind_deg,clouds_all,id_weather
missing_ratio,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
min,292.2,292.4,1001.0,14.0,292.2,292.2,0.0,0.0,0.0,200.0
lower_quartile,299.2,299.3,1008.0,69.0,299.2,299.2,1.5,10.0,40.0,801.0
median,301.2,305.8,1010.0,84.0,301.2,301.2,2.6,160.0,40.0,802.0
upper_quartile,304.2,309.6,1011.0,94.0,304.2,304.2,4.1,250.0,75.0,802.0
max,311.2,317.2,1018.0,100.0,311.2,311.2,11.3,360.0,100.0,804.0


Skipping filling because there is no missing cells.

For columns with non-numeric data types:
- Percentage (from 0 to 100) of missing values
- Number of values (the values here are different values and we do not consider missing values)
- The percentage (from 0 to 100) of each value is sorted by decreasing percentage (we do not consider missing values, the ratio is the ratio compared to the number of non-missing values)

In [502]:
non_numeric_columns = raw_df.select_dtypes(exclude=['float64']).columns

cat_col_info = {}
for col in non_numeric_columns:
    missing_ratio = raw_df[col].isnull().mean() * 100

    num_values = raw_df[col].nunique()

    value_counts = raw_df[col].value_counts(normalize=True) * 100
    value_ratios = value_counts.to_dict()

    cat_col_info[col] = [missing_ratio, num_values, value_ratios]

cat_col_info_df = pd.DataFrame(cat_col_info, index=["missing_ratio", "num_values", "value_ratios"])

cat_col_info_df.loc['missing_ratio'] = cat_col_info_df.loc['missing_ratio'].apply(lambda x: round(x, 1) if isinstance(x, float) else x)

cat_col_info_df

Unnamed: 0,datetime,main_weather,description_weather,icon_weather
missing_ratio,0.0,0.0,0.0,0.0
num_values,8712,7,19,16
value_ratios,"{2022-12-09 17:00:00: 0.01147842056932966, 2023-08-08 19:00:00: 0.01147842056932966, 2023-08-08 ...","{'Clouds': 85.43388429752066, 'Rain': 9.77961432506887, 'Clear': 2.9843893480257115, 'Mist': 1.2...","{'scattered clouds': 41.976584022038566, 'broken clouds': 21.556473829201103, 'few clouds': 21.0...","{'03n': 22.302571166207528, '03d': 19.674012855831037, '04d': 15.495867768595042, '02n': 12.3048..."


### Is the collected data reasonable?

In [503]:
# Range and Value Checks for Numeric Columns
# Example ranges:
# Temperature in Kelvin (0K is absolute zero, upper limit is less defined but should be reasonable)
# Atmospheric pressure in hPa (typical range: 300 to 1100)
# Humidity in % (0 to 100)
# Wind speed in m/s (no upper limit, but excessively high values may be suspect)
# Wind degree (0 to 360)
# Cloudiness in % (0 to 100)

temp_range_check = raw_df['temp (K)'].between(0, 350).all()
pressure_range_check = raw_df['pressure'].between(300, 1100).all()
humidity_range_check = raw_df['humidity'].between(0, 100).all()
wind_speed_range_check = raw_df['wind_speed'] >= 0
wind_deg_range_check = raw_df['wind_deg'].between(0, 360).all()
clouds_all_range_check = raw_df['clouds_all'].between(0, 100).all()

# Results
range_check_results = {
    'Temperature (0K-350K)': temp_range_check,
    'Pressure (300hPa-1100hPa)': pressure_range_check,
    'Humidity (0%-100%)': humidity_range_check,
    'Wind Speed (>=0 m/s)': wind_speed_range_check.all(),
    'Wind Degree (0°-360°)': wind_deg_range_check,
    'Cloudiness (0%-100%)': clouds_all_range_check
}

range_check_results

{'Temperature (0K-350K)': True,
 'Pressure (300hPa-1100hPa)': True,
 'Humidity (0%-100%)': True,
 'Wind Speed (>=0 m/s)': True,
 'Wind Degree (0°-360°)': True,
 'Cloudiness (0%-100%)': True}

### Saving

In [504]:
print(f"Total number of features: {raw_df.shape[1]}")
raw_df.dtypes

Total number of features: 14


datetime               datetime64[ns]
temp (K)                      float64
feels_like                    float64
pressure                      float64
humidity                      float64
temp_min                      float64
temp_max                      float64
wind_speed                    float64
wind_deg                      float64
clouds_all                    float64
id_weather                    float64
main_weather                   object
description_weather            object
icon_weather                   object
dtype: object

In [505]:
raw_df.to_csv("../data/processed/processed_data.csv", index=False)