In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [2]:
# Read the raw price records into a DataFrame, then report its structure.
# DataFrame.info() writes its summary to stdout and returns None, so printing
# the return value adds a trailing "None" line to the cell output.
data = pd.read_csv(filepath_or_buffer="total.csv")
info_result = data.info()
print(info_result)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7418 entries, 0 to 7417
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   timestamp     7418 non-null   int64  
 1   state         7418 non-null   object 
 2   district      7418 non-null   object 
 3   market        7418 non-null   object 
 4   commodity     7418 non-null   object 
 5   variety       7418 non-null   object 
 6   arrival_date  7418 non-null   object 
 7   min_price     7418 non-null   float64
 8   max_price     7418 non-null   float64
 9   modal_price   7418 non-null   float64
dtypes: float64(3), int64(1), object(6)
memory usage: 579.7+ KB
None


**Handle Missing Values**

In [3]:
# Count the null entries in every column to see whether imputation is needed.
missing_per_column = data.isna().sum()
print(missing_per_column)

timestamp       0
state           0
district        0
market          0
commodity       0
variety         0
arrival_date    0
min_price       0
max_price       0
modal_price     0
dtype: int64


In [4]:
# Impute missing prices with the median price of the same commodity.
# Series.fillna returns a new Series instead of modifying the column, so the
# result must be assigned back for the imputation to take effect.
# Rows whose commodity has no observed price at all keep their NaN, because
# the per-commodity median is itself NaN in that case.
price_columns = ['min_price', 'modal_price', 'max_price']
for col in price_columns:
    commodity_median = data.groupby('commodity')[col].transform('median')
    data[col] = data[col].fillna(commodity_median)

**Feature Engineering**

In [6]:
# Derived price features:
#   price_spread - difference between max_price and min_price
#   price_ratio  - max_price divided by min_price
min_price = data['min_price']
max_price = data['max_price']
data['price_spread'] = max_price.sub(min_price)
# The small constant keeps the denominator non-zero when min_price is 0.
data['price_ratio'] = max_price.div(min_price.add(1e-5))

In [9]:

# Parse the arrival dates and derive calendar features from them.
# NOTE(review): no explicit format is passed to the parser - confirm the date
# strings in the CSV are interpreted as intended.
arrival = pd.to_datetime(data['arrival_date'])
data['arrival_date'] = arrival

calendar = arrival.dt
data['month'] = calendar.month
data['day_of_week'] = calendar.dayofweek  # weekday number, not day of month
data['year'] = calendar.year

**Encode Categorical Variables**

In [7]:
# Label-encode 'state', 'district', 'market' and 'commodity'.
# One fitted encoder per column is kept in `label_encoders`, keyed by column
# name, so the integer codes can be mapped back to the original labels.
categorical_columns = ['state', 'district', 'market', 'commodity']
label_encoders = {name: LabelEncoder() for name in categorical_columns}
for name, encoder in label_encoders.items():
    data[name] = encoder.fit_transform(data[name])

**Normalize/Scale Data**

In [8]:
# Rescale the price columns and the derived price features with a
# MinMaxScaler (default settings). The fitted scaler stays in `scaler`.
scaled_columns = [
    'min_price',
    'modal_price',
    'max_price',
    'price_spread',
    'price_ratio',
]
scaler = MinMaxScaler()
unscaled_values = data[scaled_columns]
data[scaled_columns] = scaler.fit_transform(unscaled_values)

**Create Time-Series Data for LSTM**

In [10]:
# Order rows chronologically within each commodity for time-series modeling.
# The sort key is the parsed arrival_date itself: day_of_week holds the
# weekday number, so sorting on year/month/day_of_week would order the rows
# of a month by weekday rather than by calendar date.
data = data.sort_values(['commodity', 'arrival_date'])

In [11]:
# Prepare data for LSTM
# Input columns: the price-based columns first, then the calendar columns.
price_inputs = ['min_price', 'modal_price', 'max_price', 'price_spread', 'price_ratio']
calendar_inputs = ['month', 'day_of_week']
features = price_inputs + calendar_inputs

# Target column for price recommendation.
target_price = 'modal_price'

In [12]:
# Write the preprocessed frame to CSV (row index omitted), then confirm.
data.to_csv(path_or_buf='preprocessed_trade_data.csv', index=False)
print("Preprocessing complete. Data saved to 'preprocessed_trade_data.csv'")

Preprocessing complete. Data saved to 'preprocessed_trade_data.csv'
