In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [2]:
# Load the raw trade records from the CSV file into a DataFrame.
data = pd.read_csv("yearly_trade_data.csv")
# `DataFrame.info()` prints its summary itself and returns None, so wrapping it
# in print() only adds a stray "None" line after the summary.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 902824 entries, 0 to 902823
Data columns (total 12 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   id                  902824 non-null  int64 
 1   state               902824 non-null  object
 2   apmc                902824 non-null  object
 3   commodity           902824 non-null  object
 4   min_price           902824 non-null  int64 
 5   modal_price         902824 non-null  int64 
 6   max_price           902824 non-null  int64 
 7   commodity_arrivals  902824 non-null  int64 
 8   commodity_traded    902824 non-null  int64 
 9   created_at          902824 non-null  object
 10  status              902824 non-null  int64 
 11  Commodity_Uom       902824 non-null  object
dtypes: int64(7), object(5)
memory usage: 82.7+ MB
None


**Handle Missing Values**

In [3]:
# Report, for every column, how many entries are null
print(data.isna().sum())

id                    0
state                 0
apmc                  0
commodity             0
min_price             0
modal_price           0
max_price             0
commodity_arrivals    0
commodity_traded      0
created_at            0
status                0
Commodity_Uom         0
dtype: int64


In [5]:
# Impute missing prices with the median price of the same commodity.
# `Series.fillna` returns a new Series and leaves `data` untouched, so the
# result must be assigned back for the imputation to take effect.
price_columns = ['min_price', 'modal_price', 'max_price']
for col in price_columns:
    commodity_median = data.groupby('commodity')[col].transform('median')
    # A commodity whose prices are all missing has a NaN group median; fall back
    # to the overall column median so no gap is left behind.
    data[col] = data[col].fillna(commodity_median).fillna(data[col].median())

In [6]:
# Fill missing values in the quantity columns with the column median.
# `Series.fillna` returns a new Series and leaves `data` untouched, so the
# result must be assigned back for the imputation to take effect.
quantity_columns = ['commodity_arrivals', 'commodity_traded']
for col in quantity_columns:
    data[col] = data[col].fillna(data[col].median())

**Filter Valid Records**

In [7]:
# Keep only the rows whose `status` equals 1.
# The explicit .copy() makes the filtered frame independent of the original, so
# the column assignments in later cells operate on a real DataFrame instead of
# a selection (avoids SettingWithCopyWarning / chained-assignment ambiguity).
data = data[data['status'] == 1].copy()

**Normalize and Clean Data**

In [8]:
# Bring both quantity columns to a common unit by multiplying each row by the
# factor associated with its `Commodity_Uom` value.
# NOTE(review): the factors are kept exactly as given; confirm that 0.001 is the
# intended conversion for 'Nos'.
unit_map = {'Nos': 0.001, 'Kg': 1, 'Qui': 100}
unit_factor = data['Commodity_Uom'].map(unit_map)

# `.map` yields NaN for any unit that is not a key of `unit_map`, which would
# silently turn the affected quantities into NaN. Report such rows so the gap
# is visible instead of hidden.
unmapped_units = data.loc[unit_factor.isna(), 'Commodity_Uom'].value_counts(dropna=False)
if not unmapped_units.empty:
    print(f"Warning: {int(unmapped_units.sum())} rows have a unit missing from unit_map: "
          f"{unmapped_units.to_dict()}")

# The factor is computed once and reused for both quantity columns.
for col in ['commodity_arrivals', 'commodity_traded']:
    data[col] = data[col] * unit_factor

# The unit column is no longer needed once the quantities are converted.
data = data.drop(columns=['Commodity_Uom'])

**Feature Engineering**

In [9]:
# Derived indicators built from the price and quantity columns:
#   price_spread        -> max_price minus min_price
#   price_ratio         -> max_price divided by (min_price + 1e-5)
#   demand_supply_ratio -> commodity_traded divided by (commodity_arrivals + 1e-5)
# The 1e-5 offset guards against dividing by zero when the denominator column is 0.
data['price_spread'] = data['max_price'].sub(data['min_price'])
data['price_ratio'] = data['max_price'].div(data['min_price'].add(1e-5))
data['demand_supply_ratio'] = data['commodity_traded'].div(data['commodity_arrivals'].add(1e-5))

In [10]:
# Parse `created_at` into datetimes, then expose its calendar parts as columns.
data['created_at'] = pd.to_datetime(data['created_at'])
# Each pair is (new column name, attribute of the `.dt` accessor to read).
# `dayofweek` is the weekday number (Monday=0 ... Sunday=6), not the day of the month.
for new_column, dt_attribute in [('month', 'month'), ('day_of_week', 'dayofweek'), ('year', 'year')]:
    data[new_column] = getattr(data['created_at'].dt, dt_attribute)

In [11]:
# Remove the `id`, `created_at` and `status` columns from the frame
data.drop(['id', 'created_at', 'status'], axis='columns', inplace=True)

**Encode Categorical Variables**

In [12]:
# Replace each categorical column with integer codes from its own LabelEncoder.
# The fitted encoders are kept in `label_encoders`, keyed by column name.
label_encoders = {name: LabelEncoder() for name in ['state', 'apmc', 'commodity']}
for name, encoder in label_encoders.items():
    data[name] = encoder.fit_transform(data[name])

**Normalize/Scale Data**

In [13]:
# Columns rescaled with a single MinMaxScaler: the raw prices, the quantities
# and the derived ratio/spread features.
scaled_columns = [
    'min_price', 'modal_price', 'max_price',
    'commodity_arrivals', 'commodity_traded',
    'price_spread', 'price_ratio', 'demand_supply_ratio',
]
# Fit on the selected columns, then write the transformed values back in place
# of the originals.
scaler = MinMaxScaler().fit(data[scaled_columns])
data[scaled_columns] = scaler.transform(data[scaled_columns])

**Create Time-Series Data for LSTM**

In [14]:
# Order the rows by commodity first, then by the calendar columns.
# NOTE(review): `day_of_week` is the weekday number, not the day of the month,
# so rows within one month end up grouped by weekday rather than in strict date
# order. Confirm this is acceptable for sequence building, or sort on the full
# timestamp before `created_at` is dropped.
data.sort_values(
    by=['commodity', 'year', 'month', 'day_of_week'],
    inplace=True,
)

In [15]:
# Column names handed to the LSTM stage.
# `features` lists the model inputs, grouped here by origin.
features = (
    ['min_price', 'modal_price', 'max_price']                   # prices
    + ['commodity_arrivals', 'commodity_traded']                # quantities
    + ['price_spread', 'price_ratio', 'demand_supply_ratio']    # derived features
    + ['month', 'day_of_week']                                  # calendar parts
)
# Target column for price recommendation
target_price = 'modal_price'
# Target column for trade prediction
target_trade = 'commodity_traded'

In [16]:
# Write the processed frame to disk (row index omitted) and report completion
data.to_csv(path_or_buf='preprocessed_trade_data.csv', index=False)
print("Preprocessing complete. Data saved to 'preprocessed_trade_data.csv'")

Preprocessing complete. Data saved to 'preprocessed_trade_data.csv'
