In [1]:
import os
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler


- pandas: For data ingestion, manipulation, cleaning, and merging.
- numpy: For numerical operations and calculations.
- matplotlib.pyplot: For basic plotting and visualization.
- seaborn: For enhanced statistical visualizations.
- scikit-learn: For data preprocessing utilities (e.g., imputation, scaling train/test split, feature selection).
- datetime: (Built-in) For parsing and handling dates and times.
- os: (Built-in) For file and path operations if needed.

In [12]:
# Load the CSV file
weather_df = pd.read_csv(r"C:\Users\Linds\Repos\East_River\Data\Processed\all_weather_clean.csv")

# Convert the observation timestamp to datetime
weather_df['obs_tms_lcl'] = pd.to_datetime(weather_df['obs_tms_lcl'])



In [13]:
# Check for missing values
print(weather_df.isnull().sum())



location                0
obs_tms_lcl             0
temperature             0
wind_speed              0
wind_degree             0
wind_dir                0
weather_descriptions    0
precip                  0
humidity                0
visibility              0
pressure                0
cloudcover              0
heatindex               0
dewpoint                0
windchill               0
windgust                0
feelslike               0
uv_index                0
time                    0
year                    0
minute                  0
dtype: int64


In [None]:
#checking the DataFrame
print(weather_df.head())


                 location               obs_tms_lcl  temperature  wind_speed  \
0  Aberdeen, South Dakota 2020-12-31 06:00:00+00:00           12           7   
1  Alsville, South Dakota 2020-12-31 06:00:00+00:00            5           5   
2      Ames, South Dakota 2020-12-31 06:00:00+00:00           16           7   
3   Andover, South Dakota 2020-12-31 06:00:00+00:00           12           7   
4         Arco, Minnesota 2020-12-31 06:00:00+00:00            7           6   

   wind_degree wind_dir weather_descriptions  precip  humidity  visibility  \
0          197      SSW               Cloudy     0.0        90           6   
1          206      SSW               Cloudy     0.0        86           6   
2          165      SSE                Clear     0.0        84           6   
3          197      SSW               Cloudy     0.0        90           6   
4          228       SW               Cloudy     0.0        88           6   

   ...  cloudcover  heatindex  dewpoint  windchill

In [15]:
print(weather_df.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 21 columns):
 #   Column                Non-Null Count    Dtype              
---  ------                --------------    -----              
 0   location              1048575 non-null  object             
 1   obs_tms_lcl           1048575 non-null  datetime64[ns, UTC]
 2   temperature           1048575 non-null  int64              
 3   wind_speed            1048575 non-null  int64              
 4   wind_degree           1048575 non-null  int64              
 5   wind_dir              1048575 non-null  object             
 6   weather_descriptions  1048575 non-null  object             
 7   precip                1048575 non-null  float64            
 8   humidity              1048575 non-null  int64              
 9   visibility            1048575 non-null  int64              
 10  pressure              1048575 non-null  float64            
 11  cloudcover            1048575 non-nul

In [16]:
print(weather_df.describe())

        temperature    wind_speed   wind_degree        precip      humidity  \
count  1.048575e+06  1.048575e+06  1.048575e+06  1.048575e+06  1.048575e+06   
mean   3.150522e+01  9.754246e+00  2.004905e+02  1.082612e-03  7.548892e+01   
std    1.811668e+01  4.917239e+00  9.723591e+01  1.092752e-02  1.844254e+01   
min   -3.100000e+01  1.000000e+00  0.000000e+00  0.000000e+00  1.500000e+01   
25%    2.300000e+01  6.000000e+00  1.300000e+02  0.000000e+00  6.400000e+01   
50%    3.400000e+01  9.000000e+00  1.950000e+02  0.000000e+00  8.100000e+01   
75%    4.300000e+01  1.200000e+01  2.910000e+02  0.000000e+00  9.100000e+01   
max    8.400000e+01  3.500000e+01  3.600000e+02  6.000000e-01  1.000000e+02   

         visibility      pressure    cloudcover     heatindex      dewpoint  \
count  1.048575e+06  1.048575e+06  1.048575e+06  1.048575e+06  1.048575e+06   
mean   5.319995e+00  3.007560e+01  5.382865e+01  3.152459e+01  2.301657e+01   
std    1.443490e+00  2.927843e-01  3.758614e+01  1.

In [17]:
# Check the data types
print(weather_df.dtypes)

location                             object
obs_tms_lcl             datetime64[ns, UTC]
temperature                           int64
wind_speed                            int64
wind_degree                           int64
wind_dir                             object
weather_descriptions                 object
precip                              float64
humidity                              int64
visibility                            int64
pressure                            float64
cloudcover                            int64
heatindex                             int64
dewpoint                              int64
windchill                             int64
windgust                              int64
feelslike                             int64
uv_index                              int64
time                                 object
year                                  int64
minute                                int64
dtype: object


In [None]:
# Set the datetime as the index (optional)
#weather_df.set_index('obs_tms_lcl', inplace=True)

Check Format and Consistency:

Verify timestamp column is in datetime format.
Look at the unique timestamp values and compute the time differences to ensure regular intervals.

In [18]:
# Display the first few timestamp values
print("Timestamps:", weather_df['obs_tms_lcl'].head())

# Convert to datetime (if not already done)
weather_df['obs_tms_lcl'] = pd.to_datetime(weather_df['obs_tms_lcl'])

# Check time differences between consecutive observations
time_diff = weather_df['obs_tms_lcl'].diff()
print("Time differences:\n", time_diff.value_counts())

Timestamps: 0   2020-12-31 06:00:00+00:00
1   2020-12-31 06:00:00+00:00
2   2020-12-31 06:00:00+00:00
3   2020-12-31 06:00:00+00:00
4   2020-12-31 06:00:00+00:00
Name: obs_tms_lcl, dtype: datetime64[ns, UTC]
Time differences:
 obs_tms_lcl
0 days 00:00:00    1042550
0 days 00:30:00       6024
Name: count, dtype: int64


In [19]:
unique_timestamps = weather_df['obs_tms_lcl'].nunique()
total_records = len(weather_df)
print("Number of unique timestamps:", unique_timestamps)
print("Total records:", total_records)

Number of unique timestamps: 6025
Total records: 1048575


In [20]:
# First, get the unique datetime stamps from the DataFrame.
unique_times = weather_df['obs_tms_lcl'].drop_duplicates()

# Group timestamps by date and count how many unique timestamps appear each day.
daily_counts = unique_times.groupby(unique_times.dt.date).count()

# Print the total number of unique days (in a complete 4-year dataset this should be 4*365 = 1460)
print("Total unique days:", len(daily_counts))

# Print the distribution of daily counts – ideally, each day should have 24 timestamps.
print("Distribution of timestamp counts per day:")
print(daily_counts.value_counts())

# Optionally, check if every day has 24 timestamps:
if (daily_counts == 24).all():
    print("All days have 24 unique timestamps.")
else:
    print("Not all days have 24 unique timestamps.")

Total unique days: 126
Distribution of timestamp counts per day:
obs_tms_lcl
48    124
36      1
37      1
Name: count, dtype: int64
Not all days have 24 unique timestamps.
