In [1]:
import pandas as pd
import numpy as np
from dataframe_checks import check_time_column, check_col_types, check_missing_values, data_report

# Daily Climate time series data

Load the train and test CSV files and combine them into a single frame sorted by date.

In [2]:
# Explicit dtypes for the four measurement columns; 'date' is parsed as a timestamp by read_csv.
types = {'meantemp': np.float64, 'humidity': np.float64, 'wind_speed': np.float64, 'meanpressure': np.float64}
dcts_test = pd.read_csv('datasets/raw/Daily Climate time series data/DailyDelhiClimateTest.csv', parse_dates=['date'], dtype=types)
dcts_train = pd.read_csv('datasets/raw/Daily Climate time series data/DailyDelhiClimateTrain.csv', parse_dates=['date'], dtype=types)
# Stack train then test and order chronologically. A stable sort (mergesort) keeps rows that
# share the same date in their concatenation order (train before test), so the combined frame
# is reproducible across runs; the default quicksort gives no ordering guarantee for ties.
dcts = pd.concat([dcts_train, dcts_test], ignore_index=True).sort_values(by='date', ascending=True, kind='mergesort')

Data report on current data.

In [3]:
# Baseline quality report for the combined climate frame, checked against a daily ('D') time axis.
data_report(dcts, time_col_name='date', freq='D')

________________________________________________________________
                          DATA REPORT                          
________________________________________________________________
1. Column order is correct!
2. All column types are correct!
3. Number of missing timesteps: 0
4. Number of duplicated timesteps: 1
5. Number of Time Series with missing values : 0
________________________________________________________________


Fix the issue flagged by the data report (one duplicated timestep).

In [4]:
# Re-check the daily time axis with fix=True and keep the frame the helper returns,
# then display it for inspection.
correct4 = check_time_column(
    dcts,
    time_col_name='date',
    freq='D',
    fix=True,
)
correct4

3. Number of missing timesteps: 0
4. Number of duplicated timesteps: 1


Unnamed: 0,date,meantemp,humidity,wind_speed,meanpressure
0,2013-01-01,10.000000,84.500000,0.000000,1015.666667
1,2013-01-02,7.400000,92.000000,2.980000,1017.800000
2,2013-01-03,7.166667,87.000000,4.633333,1018.666667
3,2013-01-04,8.666667,71.333333,1.233333,1017.166667
4,2013-01-05,6.000000,86.833333,3.700000,1016.500000
...,...,...,...,...,...
1571,2017-04-20,34.500000,27.500000,5.562500,998.625000
1572,2017-04-21,34.250000,39.375000,6.962500,999.875000
1573,2017-04-22,32.900000,40.900000,8.890000,1001.600000
1574,2017-04-23,32.875000,27.500000,9.962500,1002.125000


Final report on corrected data.

In [5]:
# Re-run the quality report on the corrected frame to confirm the time axis is now clean.
data_report(correct4, time_col_name='date', freq='D')

________________________________________________________________
                          DATA REPORT                          
________________________________________________________________
1. Column order is correct!
2. All column types are correct!
3. Number of missing timesteps: 0
4. Number of duplicated timesteps: 0
5. Number of Time Series with missing values : 0
________________________________________________________________


In [6]:
# Export of the corrected climate frame; currently commented out, so nothing is written to disk.
# Uncomment to save it as CSV (index=False leaves the row index out of the file).
#correct4.to_csv('datasets/processed_data/DailyDelhiClimate_12052024.csv', index=False)

# Time Series Price Vegetables and Fruits

In [7]:
# Read the long-format price table (one row per commodity per date), indexed by 'SN',
# and keep only the columns needed downstream: Commodity, Date and Average.
types = {'Commodity': str, 'Average': np.float64}
tspvf = (
    pd.read_csv(
        'datasets/raw/Time Series Price Vegetables and Fruits/kalimati_tarkari_dataset.csv',
        index_col='SN',
        parse_dates=['Date'],
        dtype=types,
    )
    .drop(columns=['Minimum', 'Maximum', 'Unit'])
)
tspvf

Unnamed: 0_level_0,Commodity,Date,Average
SN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Tomato Big(Nepali),2013-06-16,37.5
1,Tomato Small(Local),2013-06-16,29.0
2,Potato Red,2013-06-16,20.5
3,Potato White,2013-06-16,15.5
4,Onion Dry (Indian),2013-06-16,29.0
...,...,...,...
197156,Garlic Dry Nepali,2021-05-13,110.0
197157,Fish Fresh(Rahu),2021-05-13,275.0
197158,Fish Fresh(Bachuwa),2021-05-13,230.0
197159,Fish Fresh(Chhadi),2021-05-13,225.0


In [8]:
# Reshape long -> wide: one row per Date, one column per Commodity holding its 'Average' price.
# pivot_table applies its default aggregation (mean) if a Date/Commodity pair occurs more than once.
# 'Date' is moved back from the index into a regular column.
tspvf1 = tspvf.pivot_table(index='Date', columns='Commodity', values='Average').reset_index()
tspvf1

Commodity,Date,Apple(Fuji),Apple(Jholey),Arum,Asparagus,Bakula,Bamboo Shoot,Banana,Barela,Bauhania flower,...,Tomato Big(Nepali),Tomato Small(Indian),Tomato Small(Local),Tomato Small(Terai),Tomato Small(Tunnel),Turnip,Turnip A,Water Melon(Dotted),Water Melon(Green),Yam
0,2013-06-16,,110.0,,125.0,,55.0,75.0,,,...,37.5,,29.0,,,47.5,42.5,,16.5,
1,2013-06-17,,120.0,,125.0,,55.0,75.0,,,...,27.5,,22.5,,,,47.5,,15.0,
2,2013-06-18,,115.0,,110.0,,55.0,75.0,,,...,27.5,,24.0,,,47.5,47.5,,13.5,
3,2013-06-19,,115.0,,112.5,,55.0,75.0,,,...,27.5,,26.0,,,47.5,47.5,,15.0,
4,2013-06-20,,115.0,,110.0,,55.0,75.0,,,...,27.5,,24.0,,,42.5,55.0,,15.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2748,2021-05-09,255.0,,55.0,900.0,,95.0,105.0,75.0,,...,,,12.5,17.5,17.5,,75.0,,42.5,
2749,2021-05-10,270.0,,55.0,950.0,,,95.0,75.0,,...,,17.5,12.5,17.5,17.5,,75.0,,37.5,
2750,2021-05-11,265.0,,,1100.0,,95.0,95.0,75.0,,...,,17.5,12.5,21.5,21.5,,75.0,,40.0,
2751,2021-05-12,,265.0,,1100.0,,95.0,105.0,75.0,,...,47.5,27.5,32.5,32.5,37.5,,75.0,,32.5,


In [9]:
# Quality report for the wide price table on a daily axis, with the business_days option enabled.
data_report(tspvf1, time_col_name='Date', freq='D', business_days=True)

________________________________________________________________
                          DATA REPORT                          
________________________________________________________________
1. Column order is correct!
2. All column types are correct!
3. Number of missing timesteps: 136
4. Number of duplicated timesteps: 0
Number of missing values in column 'Apple(Fuji)' : 2358
Number of missing values in column 'Apple(Jholey)' : 70
Number of missing values in column 'Arum' : 733
Number of missing values in column 'Asparagus' : 1698
Number of missing values in column 'Bakula' : 1622
Number of missing values in column 'Bamboo Shoot' : 9
Number of missing values in column 'Banana' : 9
Number of missing values in column 'Barela' : 1363
Number of missing values in column 'Bauhania flower' : 2467
Number of missing values in column 'Bitter Gourd' : 83
Number of missing values in column 'Bottle Gourd' : 53
Number of missing values in column 'Brd Leaf Mustard' : 11
Number of missing values 

In [10]:
# Repair the daily time axis of the wide price table (fix=True) and keep the returned frame.
# NOTE(review): the data_report calls for this dataset pass business_days=True while this call
# does not — confirm whether check_time_column should receive the same option.
tspvf3 = check_time_column(tspvf1, time_col_name='Date', freq='D', fix=True)

3. Number of missing timesteps: 136
4. Number of duplicated timesteps: 0


In [11]:
# Report again after the time-axis fix; per-column missing-value counts are still expected here.
data_report(tspvf3, time_col_name='Date', freq='D', business_days=True)

________________________________________________________________
                          DATA REPORT                          
________________________________________________________________
1. Column order is correct!
2. All column types are correct!
3. Number of missing timesteps: 0
4. Number of duplicated timesteps: 0
Number of missing values in column 'Apple(Fuji)' : 2494
Number of missing values in column 'Apple(Jholey)' : 206
Number of missing values in column 'Arum' : 869
Number of missing values in column 'Asparagus' : 1834
Number of missing values in column 'Bakula' : 1758
Number of missing values in column 'Bamboo Shoot' : 145
Number of missing values in column 'Banana' : 145
Number of missing values in column 'Barela' : 1499
Number of missing values in column 'Bauhania flower' : 2603
Number of missing values in column 'Bitter Gourd' : 219
Number of missing values in column 'Bottle Gourd' : 189
Number of missing values in column 'Brd Leaf Mustard' : 147
Number of missing v

In [12]:
# Handle missing values in the price table and keep the fixed frame.
# alpha=0.1 is presumably the tolerated share of missing values per column — TODO confirm
# against check_missing_values in dataframe_checks.
tspvf4 = check_missing_values(
    tspvf3,
    alpha=0.1,
    fix=True,
)

Number of missing values in column 'Apple(Jholey)' : 206
Number of missing values in column 'Bamboo Shoot' : 145
Number of missing values in column 'Banana' : 145
Number of missing values in column 'Bitter Gourd' : 219
Number of missing values in column 'Bottle Gourd' : 189
Number of missing values in column 'Brd Leaf Mustard' : 147
Number of missing values in column 'Brinjal Long' : 153
Number of missing values in column 'Cabbage(Local)' : 140
Number of missing values in column 'Capsicum' : 164
Number of missing values in column 'Carrot(Local)' : 153
Number of missing values in column 'Cauli Local' : 139
Number of missing values in column 'Chilli Dry' : 141
Number of missing values in column 'Chilli Green' : 154
Number of missing values in column 'Coriander Green' : 148
Number of missing values in column 'Cress Leaf' : 280
Number of missing values in column 'Cucumber(Local)' : 203
Number of missing values in column 'French Bean(Local)' : 150
Number of missing values in column 'Garlic 

In [13]:
# Final quality report for the price table after both the time-axis and missing-value fixes.
data_report(tspvf4, time_col_name='Date', freq='D', business_days=True)

________________________________________________________________
                          DATA REPORT                          
________________________________________________________________
1. Column order is correct!
2. All column types are correct!
3. Number of missing timesteps: 0
4. Number of duplicated timesteps: 0
5. Number of Time Series with missing values : 0
________________________________________________________________


In [14]:
# Display the cleaned wide price table for a visual check.
tspvf4

Unnamed: 0,Date,Apple(Jholey),Bamboo Shoot,Banana,Bitter Gourd,Bottle Gourd,Brd Leaf Mustard,Brinjal Long,Cabbage(Local),Capsicum,...,Pomegranate,Potato Red,Pumpkin,Raddish White(Local),Spinach Leaf,Sugarbeet,Tamarind,Tofu,Tomato Small(Local),Water Melon(Green)
0,2013-06-16,110.0,55.0,75.0,15.0,17.5,47.5,17.0,8.0,57.5,...,210.0,20.5,19.0,27.5,52.5,37.5,67.5,57.5,29.0,16.5
1,2013-06-17,120.0,55.0,75.0,18.0,19.0,42.5,19.0,8.0,57.5,...,210.0,20.5,19.0,32.5,47.5,37.5,67.5,57.5,22.5,15.0
2,2013-06-18,115.0,55.0,75.0,19.0,23.0,42.5,22.5,9.0,57.5,...,210.0,19.5,19.0,32.5,47.5,37.5,67.5,57.5,24.0,13.5
3,2013-06-19,115.0,55.0,75.0,22.5,19.0,42.5,24.0,12.5,57.5,...,210.0,20.5,19.0,32.5,42.5,47.5,67.5,57.5,26.0,15.0
4,2013-06-20,115.0,55.0,75.0,22.5,22.5,42.5,22.5,12.5,52.5,...,195.0,20.5,19.0,30.0,47.5,37.5,67.5,57.5,24.0,15.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2884,2021-05-09,290.0,95.0,105.0,13.5,12.5,25.0,20.0,25.0,75.0,...,265.0,36.5,32.5,22.5,45.0,75.0,155.0,95.0,12.5,42.5
2885,2021-05-10,290.0,95.0,95.0,17.5,22.5,25.0,27.5,25.0,75.0,...,290.0,36.5,32.5,25.0,45.0,75.0,155.0,95.0,12.5,37.5
2886,2021-05-11,290.0,95.0,95.0,17.5,17.5,25.0,25.0,22.5,75.0,...,290.0,36.5,32.5,25.0,55.0,75.0,155.0,95.0,12.5,40.0
2887,2021-05-12,265.0,95.0,105.0,17.5,17.5,22.5,27.5,22.5,75.0,...,290.0,36.0,32.5,17.5,55.0,75.0,155.0,95.0,32.5,32.5


In [15]:
# Export of the cleaned price table; currently commented out, so nothing is written to disk.
# Uncomment to save it as CSV (index=False leaves the row index out of the file).
#tspvf4.to_csv('datasets/processed_data/kalimati_tarkari_dataset_12052024.csv', index=False)

# Time Series Room Temperature Data

In [16]:
# Read the hourly room-temperature series; the first CSV column is used as the row index
# and 'Datetime' is parsed as a timestamp.
types = {'Hourly_Temp': np.float64}
tsrtd = pd.read_csv(
    'datasets/raw/Time Series Room Temperature Data/MLTempDataset1.csv',
    index_col=0,
    parse_dates=['Datetime'],
    dtype=types,
)
tsrtd

Unnamed: 0,Datetime,Hourly_Temp
0,2022-01-04 00:00:00,20.867
1,2022-01-04 01:00:00,21.000
2,2022-01-04 02:00:00,20.867
3,2022-01-04 03:00:00,20.650
4,2022-01-04 04:00:00,20.400
...,...,...
7051,2022-10-24 19:00:00,25.567
7052,2022-10-24 20:00:00,25.183
7053,2022-10-24 21:00:00,24.600
7054,2022-10-24 22:00:00,23.986


In [17]:
# Quality report for the room-temperature series on an hourly ('H') axis, all days included.
data_report(tsrtd, time_col_name='Datetime', freq='H', business_days=False)

________________________________________________________________
                          DATA REPORT                          
________________________________________________________________
1. Column order is correct!
2. All column types are correct!
3. Number of missing timesteps: 0
4. Number of duplicated timesteps: 0
5. Number of Time Series with missing values : 0
________________________________________________________________


In [18]:
# Export of the room-temperature frame; currently commented out, so nothing is written to disk.
# Uncomment to save it as CSV (index=False leaves the row index out of the file).
#tsrtd.to_csv('datasets/processed_data/MLTempDataset_13052024.csv', index=False)

# Time Series Air Quality Data of India (2010-2023)

Read one of the dataset's files (AP004.csv), which corresponds to the city of Rajamahendravaram, India.

In [19]:
# Load the AP004 air-quality file. Only the two interval-boundary columns need explicit
# handling (parsed as timestamps); every other column is left to pandas' dtype inference.
# No dtype map is passed: a map keyed on 'Commodity'/'Average' describes the vegetables-price
# dataset and does not apply to the columns of this file.
tsaqdi = pd.read_csv(
    'datasets/raw/Time Series Air Quality Data of India (2010-2023)/AP004.csv',
    parse_dates=['From Date', 'To Date'],
)

In [20]:
# Check that every record spans exactly one hour ('From Date' is one hour before 'To Date');
# once verified, 'To Date' carries no extra information and is dropped.
# The guard makes the cell safe to re-run: after the first run 'To Date' is already gone,
# and an unguarded second run would raise a KeyError.
if 'To Date' in tsaqdi.columns:
    time_difference = tsaqdi['From Date'] - tsaqdi['To Date']
    assert (time_difference == pd.Timedelta(hours=-1)).all(), \
        "'From Date' and 'To Date' are not exactly one hour apart on every row"
    tsaqdi = tsaqdi.drop(columns=['To Date'])

In [21]:
# Quality report for the air-quality frame on an hourly ('H') axis keyed on 'From Date'.
data_report(
    tsaqdi,
    time_col_name='From Date',
    freq='H',
)

________________________________________________________________
                          DATA REPORT                          
________________________________________________________________
1. Column order is correct!
2. All column types are correct!
3. Number of missing timesteps: 0
4. Number of duplicated timesteps: 0
Number of missing values in column 'PM2.5 (ug/m3)' : 2458
Number of missing values in column 'PM10 (ug/m3)' : 1885
Number of missing values in column 'NO (ug/m3)' : 1558
Number of missing values in column 'NO2 (ug/m3)' : 1578
Number of missing values in column 'NOx (ppb)' : 2174
Number of missing values in column 'NH3 (ug/m3)' : 1662
Number of missing values in column 'SO2 (ug/m3)' : 2153
Number of missing values in column 'CO (mg/m3)' : 2415
Number of missing values in column 'Ozone (ug/m3)' : 1646
Number of missing values in column 'Benzene (ug/m3)' : 1888
Number of missing values in column 'Toluene (ug/m3)' : 1894
Number of missing values in column 'Eth-Benzene (

In [22]:
# Handle missing values in the air-quality frame and keep the fixed frame.
# alpha=0.1 is presumably the tolerated share of missing values per column — TODO confirm
# against check_missing_values in dataframe_checks.
tsaqdi5 = check_missing_values(
    tsaqdi,
    alpha=0.1,
    fix=True,
)

Number of missing values in column 'PM2.5 (ug/m3)' : 2458
Number of missing values in column 'PM10 (ug/m3)' : 1885
Number of missing values in column 'NO (ug/m3)' : 1558
Number of missing values in column 'NO2 (ug/m3)' : 1578
Number of missing values in column 'NOx (ppb)' : 2174
Number of missing values in column 'NH3 (ug/m3)' : 1662
Number of missing values in column 'SO2 (ug/m3)' : 2153
Number of missing values in column 'CO (mg/m3)' : 2415
Number of missing values in column 'Ozone (ug/m3)' : 1646
Number of missing values in column 'Benzene (ug/m3)' : 1888
Number of missing values in column 'Toluene (ug/m3)' : 1894
Number of missing values in column 'RH (%)' : 1438
Number of missing values in column 'WS (m/s)' : 1427
Number of missing values in column 'WD (degree)' : 1429
Number of missing values in column 'SR (W/mt2)' : 1656
Number of missing values in column 'BP (mmHg)' : 1429
Number of missing values in column 'VWS (m/s)' : 1626
Number of missing values in column 'AT (degree C)' :

In [25]:
# Final quality report for the air-quality frame after the missing-value fix.
data_report(
    tsaqdi5,
    time_col_name='From Date',
    freq='H',
)

________________________________________________________________
                          DATA REPORT                          
________________________________________________________________
1. Column order is correct!
2. All column types are correct!
3. Number of missing timesteps: 0
4. Number of duplicated timesteps: 0
5. Number of Time Series with missing values : 0
________________________________________________________________


In [26]:
# Export of the cleaned air-quality frame; currently commented out, so nothing is written to disk.
# Uncomment to save it as CSV (index=False leaves the row index out of the file).
#tsaqdi5.to_csv('datasets/processed_data/air_quality_Rajamahendravaram_13052024.csv', index=False)