In [1]:
import pandas as pd
import numpy as np

# Daily Climate time series data

In [2]:
types = {'meantemp': np.float64, 'humidity': np.float64, 'wind_speed': np.float64, 'meanpressure': np.float64}
dcts_test = pd.read_csv('datasets/Daily Climate time series data/DailyDelhiClimateTest.csv', parse_dates=['date'], dtype=types)
dcts_train = pd.read_csv('datasets/Daily Climate time series data/DailyDelhiClimateTrain.csv', parse_dates=['date'], dtype=types)
dcts = pd.concat([dcts_train, dcts_test,], ignore_index=True).sort_values(by='date', ascending=True)

In [3]:
print('_'*64)
print('Data Report: \n')
nb_timesteps = dcts['date'].nunique()
print(f'Number of unique timesteps:{nb_timesteps}')

#checking for index completeness
date_range = pd.date_range(start=dcts['date'].min(), end=dcts['date'].max())
nb_na_timesteps = len(date_range) - dcts['date'].nunique()
print(f'Number of missing timesteps:{nb_na_timesteps}')

nb_dup_timesteps = len(dcts['date'][dcts['date'].duplicated()])
print(f'Number of duplicated timesteps:{nb_dup_timesteps}')

#check for missing values
missing_values = len(dcts[dcts.isnull().any(axis=1)])
print(f'Number of missing values:{missing_values}')

print(f'Types: \n{dcts.dtypes}')

print('_'*64)

________________________________________________________________
Data Report: 

Number of unique timesteps:1575
Number of missing timesteps:0
Number of duplicated timesteps:1
Number of missing values:0
Types: 
date            datetime64[ns]
meantemp               float64
humidity               float64
wind_speed             float64
meanpressure           float64
dtype: object
________________________________________________________________


In [4]:
dcts['date'][dcts['date'].duplicated()]

1462   2017-01-01
Name: date, dtype: datetime64[ns]

In [5]:
dcts[dcts.date == '2017-01-01'].iloc[1].name

1462

In [6]:
nb_dup_timesteps_to_drop = dcts[dcts.date == '2017-01-01'].iloc[1].name
dcts_processed = dcts.drop(nb_dup_timesteps_to_drop).reset_index(drop=True)

In [7]:
dcts_processed

Unnamed: 0,date,meantemp,humidity,wind_speed,meanpressure
0,2013-01-01,10.000000,84.500000,0.000000,1015.666667
1,2013-01-02,7.400000,92.000000,2.980000,1017.800000
2,2013-01-03,7.166667,87.000000,4.633333,1018.666667
3,2013-01-04,8.666667,71.333333,1.233333,1017.166667
4,2013-01-05,6.000000,86.833333,3.700000,1016.500000
...,...,...,...,...,...
1570,2017-04-20,34.500000,27.500000,5.562500,998.625000
1571,2017-04-21,34.250000,39.375000,6.962500,999.875000
1572,2017-04-22,32.900000,40.900000,8.890000,1001.600000
1573,2017-04-23,32.875000,27.500000,9.962500,1002.125000


# Time Series Price Vegetables and Fruits

In [8]:
types = {'Commodity': str, 'Average': np.float64}
tspvf = pd.read_csv('datasets/Time Series Price Vegetables and Fruits/kalimati_tarkari_dataset.csv', parse_dates=['Date'], index_col='SN', dtype=types)
tspvf = tspvf.drop(columns=['Minimum',	'Maximum', 'Unit'])
tspvf

Unnamed: 0_level_0,Commodity,Date,Average
SN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Tomato Big(Nepali),2013-06-16,37.5
1,Tomato Small(Local),2013-06-16,29.0
2,Potato Red,2013-06-16,20.5
3,Potato White,2013-06-16,15.5
4,Onion Dry (Indian),2013-06-16,29.0
...,...,...,...
197156,Garlic Dry Nepali,2021-05-13,110.0
197157,Fish Fresh(Rahu),2021-05-13,275.0
197158,Fish Fresh(Bachuwa),2021-05-13,230.0
197159,Fish Fresh(Chhadi),2021-05-13,225.0


In [12]:
len(tspvf['Date'].unique())*len(tspvf['Commodity'].unique())

363396

In [27]:
tspvf_processed = pd.pivot_table(tspvf, index = 'Date', columns='Commodity', values='Average')
tspvf_processed.reset_index(inplace=True)
tspvf_processed

Commodity,Date,Apple(Fuji),Apple(Jholey),Arum,Asparagus,Bakula,Bamboo Shoot,Banana,Barela,Bauhania flower,...,Tomato Big(Nepali),Tomato Small(Indian),Tomato Small(Local),Tomato Small(Terai),Tomato Small(Tunnel),Turnip,Turnip A,Water Melon(Dotted),Water Melon(Green),Yam
0,2013-06-16,,110.0,,125.0,,55.0,75.0,,,...,37.5,,29.0,,,47.5,42.5,,16.5,
1,2013-06-17,,120.0,,125.0,,55.0,75.0,,,...,27.5,,22.5,,,,47.5,,15.0,
2,2013-06-18,,115.0,,110.0,,55.0,75.0,,,...,27.5,,24.0,,,47.5,47.5,,13.5,
3,2013-06-19,,115.0,,112.5,,55.0,75.0,,,...,27.5,,26.0,,,47.5,47.5,,15.0,
4,2013-06-20,,115.0,,110.0,,55.0,75.0,,,...,27.5,,24.0,,,42.5,55.0,,15.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2748,2021-05-09,255.0,,55.0,900.0,,95.0,105.0,75.0,,...,,,12.5,17.5,17.5,,75.0,,42.5,
2749,2021-05-10,270.0,,55.0,950.0,,,95.0,75.0,,...,,17.5,12.5,17.5,17.5,,75.0,,37.5,
2750,2021-05-11,265.0,,,1100.0,,95.0,95.0,75.0,,...,,17.5,12.5,21.5,21.5,,75.0,,40.0,
2751,2021-05-12,,265.0,,1100.0,,95.0,105.0,75.0,,...,47.5,27.5,32.5,32.5,37.5,,75.0,,32.5,


In [28]:
print('_'*64)
print('Data Report: \n')
nb_timesteps = tspvf_processed['Date'].nunique()
print(f'Number of unique timesteps:{nb_timesteps}')

#checking for index completeness
Date_range = pd.date_range(start=tspvf_processed['Date'].min(), end=tspvf_processed['Date'].max())
nb_na_timesteps = len(Date_range) - tspvf_processed['Date'].nunique()
print(f'Number of missing timesteps:{nb_na_timesteps}')

nb_dup_timesteps = len(tspvf_processed['Date'][tspvf_processed['Date'].duplicated()])
print(f'Number of duplicated timesteps:{nb_dup_timesteps}')

#check for missing values
missing_values = len(tspvf_processed[tspvf_processed.isnull().any(axis=1)])
print(f'Number of missing values:{missing_values}')

print(f'Types: \n{tspvf_processed.dtypes}')

print('_'*64)

________________________________________________________________
Data Report: 

Number of unique timesteps:2753
Number of missing timesteps:136
Number of duplicated timesteps:0
Number of missing values:2753
Types: 
Commodity
Date                   datetime64[ns]
Apple(Fuji)                   float64
Apple(Jholey)                 float64
Arum                          float64
Asparagus                     float64
                            ...      
Turnip                        float64
Turnip A                      float64
Water Melon(Dotted)           float64
Water Melon(Green)            float64
Yam                           float64
Length: 133, dtype: object
________________________________________________________________


In [30]:
missing_percentage_per_column = (df.isnull().sum() / len(df)) * 100


Commodity
Date                      0
Apple(Fuji)            2358
Apple(Jholey)            70
Arum                    733
Asparagus              1698
                       ... 
Turnip                 2333
Turnip A                215
Water Melon(Dotted)    2711
Water Melon(Green)       74
Yam                    1648
Length: 133, dtype: int64