In [5]:
import pandas as pd

# Load the Excel file
file_path = 'IMCRound2Data.xlsx'
data = pd.read_excel(file_path, sheet_name=None)

data = pd.concat(data.values())

# Display the first few rows of the dataset to understand its structure
data.head()


Unnamed: 0,timestamp,ORCHIDS,TRANSPORT_FEES,EXPORT_TARIFF,IMPORT_TARIFF,SUNLIGHT,HUMIDITY,DAY
0,0,1200.0,1.5,10.5,-2.0,2500.0,79.0,-1
1,100,1201.75,1.5,9.5,-2.0,2499.4197,79.0041,-1
2,200,1201.75,1.5,9.5,-2.0,2498.8457,79.00821,-1
3,300,1201.75,1.5,9.5,-2.0,2498.278,79.01234,-1
4,400,1201.75,1.5,9.5,-2.0,2497.7166,79.01649,-1


In [6]:
# Check for missing values in the dataset
missing_data = data.isnull().sum()

# Checking unique values for 'DAY' to understand the encoding
unique_days = data['DAY'].unique()

missing_data, unique_days


(timestamp         0
 ORCHIDS           0
 TRANSPORT_FEES    0
 EXPORT_TARIFF     0
 IMPORT_TARIFF     0
 SUNLIGHT          0
 HUMIDITY          0
 DAY               0
 dtype: int64,
 array([-1,  0,  1]))

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Define the features and target variable
X = data[['TRANSPORT_FEES', 'EXPORT_TARIFF', 'SUNLIGHT', 'HUMIDITY', 'IMPORT_TARIFF']]
y = data['ORCHIDS']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Random Forest Regressor
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
rf.fit(X_train, y_train)

# Predict on the testing set
y_pred = rf.predict(X_test)

# Calculate the mean squared error
mse = mean_squared_error(y_test, y_pred)

# Get the feature importances
feature_importances = rf.feature_importances_

mse, feature_importances


(0.9998753503177721,
 array([0.03313317, 0.02734205, 0.19765124, 0.64985001, 0.09202353]))

In [10]:
import pandas as pd
import numpy as np

# Sample data simulation
np.random.seed(48)
dates = pd.date_range(start='2023-01-01', periods=100, freq='D')
sunlight = np.random.normal(loc=2500, scale=200, size=100)  # Normal distribution around 2500
humidity = np.random.normal(loc=80, scale=5, size=100)  # Normal distribution around 80
prices = np.random.normal(loc=1200, scale=50, size=100)  # Prices affected by these

df = pd.DataFrame({'Date': dates, 'Sunlight': sunlight, 'Humidity': humidity, 'Price': prices})
df.set_index('Date', inplace=True)

# Moving averages
df['Sunlight_MA'] = df['Sunlight'].rolling(window=10).mean()
df['Humidity_MA'] = df['Humidity'].rolling(window=10).mean()

# Trading signals
df['Buy'] = (df['Sunlight'] > df['Sunlight_MA']) & (df['Humidity'] > df['Humidity_MA'])
df['Sell'] = (df['Sunlight'] < df['Sunlight_MA']) & (df['Humidity'] < df['Humidity_MA'])

# Simulate trading
cash = 10000000  # Starting cash
shares = 0
position = []

for index, row in df.iterrows():
    if row['Buy'] and cash >= row['Price']:
        shares = cash // row['Price']
        cash -= shares * row['Price']
        position.append((index, 'Buy', shares, row['Price']))
    elif row['Sell'] and shares > 0:
        cash += shares * row['Price']
        shares = 0
        position.append((index, 'Sell', shares, row['Price']))

# Final portfolio value
portfolio_value = cash + shares * df.iloc[-1]['Price']
print("Final Portfolio Value:", portfolio_value)

# Display transactions
transactions = pd.DataFrame(position, columns=['Date', 'Action', 'Shares', 'Price'])
transactions.set_index('Date', inplace=True)
transactions


Final Portfolio Value: 11380965.10107582


Unnamed: 0_level_0,Action,Shares,Price
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2023-01-11,Buy,7631.0,1310.273928
2023-01-12,Sell,0.0,1373.360415
2023-01-17,Buy,8944.0,1171.889509
2023-01-24,Sell,0.0,1136.621414
2023-01-29,Buy,8652.0,1174.947006
2023-01-30,Sell,0.0,1197.085094
2023-02-06,Buy,9217.0,1123.694913
2023-02-10,Sell,0.0,1194.21768
2023-02-12,Buy,9707.0,1133.901385
2023-02-13,Sell,0.0,1199.922438


In [None]:
from sklearn.linear_model import LinearRegression

# Initialize the Linear Regression model
lr = LinearRegression()

# Train the model on the training data
lr.fit(X_train, y_train)

# Predict on the testing set
y_pred_lr = lr.predict(X_test)

# Calculate the mean squared error and R-squared for the Linear Regression model
mse_lr = mean_squared_error(y_test, y_pred_lr)
r_squared_lr = lr.score(X_test, y_test)

# Retrieve the coefficients and the intercept
coefficients = lr.coef_
intercept = lr.intercept_

mse_lr, r_squared_lr, coefficients, intercept


(798.2384616339549,
 0.8811336965299192,
 array([-4.52923176e+01,  4.17751761e+00, -2.84078946e+01,  1.27087837e-02,
         3.05435968e+00, -9.79243420e+01]),
 745.4906187809355)

Linear regression

ORCHIDS=1103.46−62.33×TRANSPORT_FEES+2.37×EXPORT_TARIFF+47.49×IMPORT_TARIFF+283.48×SUNLIGHT+256.19×HUMIDITY