In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
import warnings
# Install a blanket 'ignore' filter: from here on, warnings of every category
# raised by any later cell are suppressed for the rest of the kernel session.
warnings.filterwarnings('ignore')



In [5]:
# Load the AADR price history. The location is an absolute path on the local
# machine, so it has to be adjusted when the notebook runs elsewhere.
stock_csv_path = "C:/Users/KEVIN/Downloads/AADR.csv"
df_stock = pd.read_csv(stock_csv_path)
df_stock.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2010-07-21,25.1,25.1,24.700001,24.700001,23.343714,42000
1,2010-07-22,25.42,25.42,25.129999,25.26,23.872967,17500
2,2010-07-23,25.540001,25.540001,25.08,25.280001,23.891865,8600
3,2010-07-26,25.4,25.4,25.219999,25.370001,23.976921,18900
4,2010-07-27,25.25,25.290001,25.200001,25.290001,23.901318,8200


# Data Cleaning


In [6]:
# Parse the 'Date' strings into datetimes and order the rows chronologically
df_stock = (
    df_stock
    .assign(Date=pd.to_datetime(df_stock['Date']))
    .sort_values(by='Date')
)

# Mean-impute missing values in every column after the first one
# (positional slice, so it relies on 'Date' being the first column)
imputer = SimpleImputer(strategy='mean')
imputed_values = imputer.fit_transform(df_stock.iloc[:, 1:])
df_stock.iloc[:, 1:] = imputed_values

print("Data after cleaning:")
print(df_stock.head())


Data after cleaning:
        Date       Open       High        Low      Close  Adj Close  Volume
0 2010-07-21  25.100000  25.100000  24.700001  24.700001  23.343714   42000
1 2010-07-22  25.420000  25.420000  25.129999  25.260000  23.872967   17500
2 2010-07-23  25.540001  25.540001  25.080000  25.280001  23.891865    8600
3 2010-07-26  25.400000  25.400000  25.219999  25.370001  23.976921   18900
4 2010-07-27  25.250000  25.290001  25.200001  25.290001  23.901318    8200


# Normalization & Standardization

<h2> Min-Max Scale</h2>

In [7]:
# Columns that get rescaled; 'Adj Close' is not in this list and keeps its
# original values
columns_to_normalize = ['Open', 'High', 'Low', 'Close', 'Volume']

# Fit a min-max scaler on those columns and write the result back in place
min_max_scaler = MinMaxScaler()
min_max_values = min_max_scaler.fit_transform(df_stock[columns_to_normalize])
df_stock[columns_to_normalize] = min_max_values

# Preview the rescaled frame
print(df_stock.head())


        Date      Open      High       Low     Close  Adj Close    Volume
0 2010-07-21  0.010649  0.010178  0.002086  0.000000  23.343714  0.129710
1 2010-07-22  0.018961  0.018321  0.013295  0.014504  23.872967  0.054046
2 2010-07-23  0.022078  0.021374  0.011992  0.015022  23.891865  0.026560
3 2010-07-26  0.018442  0.017812  0.015641  0.017353  23.976921  0.058369
4 2010-07-27  0.014545  0.015013  0.015120  0.015281  23.901318  0.025324


<h2>Z score</h2>

In [8]:
# Standardize the same columns to zero mean and unit variance. The scaler is
# fitted on whatever df_stock currently holds in those columns, i.e. the values
# already written back by the min-max step when the cells run in order.
standard_scaler = StandardScaler()
z_score_values = standard_scaler.fit_transform(df_stock[columns_to_normalize])
df_stock[columns_to_normalize] = z_score_values

# Preview the standardized frame
print(df_stock.head())



        Date      Open      High       Low     Close  Adj Close    Volume
0 2010-07-21 -1.600645 -1.602185 -1.643567 -1.644376  23.343714  1.638486
1 2010-07-22 -1.565787 -1.567638 -1.596126 -1.583254  23.872967  0.371020
2 2010-07-23 -1.552715 -1.554683 -1.601642 -1.581071  23.891865 -0.089406
3 2010-07-26 -1.567966 -1.569797 -1.586196 -1.571248  23.976921  0.443447
4 2010-07-27 -1.584306 -1.581672 -1.588403 -1.579980  23.901318 -0.110100


In [22]:
# Keep only the numeric columns by removing 'Date' (returns a new frame)
df_stock_new = df_stock.drop('Date', axis=1)

# Display the new dataset
print(df_stock_new)

          Open      High       Low     Close  Adj Close    Volume
0    -1.600645 -1.602185 -1.643567 -1.644376  23.343714  1.638486
1    -1.565787 -1.567638 -1.596126 -1.583254  23.872967  0.371020
2    -1.552715 -1.554683 -1.601642 -1.581071  23.891865 -0.089406
3    -1.567966 -1.569797 -1.586196 -1.571248  23.976921  0.443447
4    -1.584306 -1.581672 -1.588403 -1.579980  23.901318 -0.110100
...        ...       ...       ...       ...        ...       ...
2437  0.153164  0.253628  0.176862  0.253680  42.090000  0.479660
2438  0.061661  0.060381  0.033434  0.071405  40.419998 -0.177353
2439  0.082358  0.083053  0.041157  0.103058  40.709999 -0.379113
2440  0.076911  0.139191  0.072049  0.062674  40.340000 -0.301513
2441 -0.067969 -0.083204 -0.109994 -0.080308  39.029999 -0.239433

[2442 rows x 6 columns]


# Feature extraction

In [55]:
import pandas as pd
import numpy as np

# Work on a plain 2-D array: one row per record of df_stock_new
stock_new_array = df_stock_new.to_numpy()

# Descriptive statistics per row (axis=1 reduces across the columns)
means = stock_new_array.mean(axis=1)
medians = np.median(stock_new_array, axis=1)
std_devs = stock_new_array.std(axis=1)
minimums = stock_new_array.min(axis=1)
maximums = stock_new_array.max(axis=1)

# Standardized third and fourth central moments, built from the deviations
# of each value from its own row mean (population std, non-excess kurtosis)
centered = stock_new_array - means[:, np.newaxis]
skewnesses = np.mean(centered ** 3, axis=1) / (std_devs ** 3)
kurtoses = np.mean(centered ** 4, axis=1) / (std_devs ** 4)

# Assemble one output column per statistic, plus a 1-based row counter
feature_columns = {
    'Row': range(1, stock_new_array.shape[0] + 1),
    'Mean': means,
    'Median': medians,
    'Standard Deviation': std_devs,
    'Minimum': minimums,
    'Maximum': maximums,
    'Skewness': skewnesses,
    'Kurtosis': kurtoses,
}
result_df = pd.DataFrame(feature_columns)

# Persist the table as an Excel workbook in the working directory
result_df.to_excel('statistical_features3.xlsx', index=False)


In [96]:
import pandas as pd

# Reload the feature table produced by the feature-extraction cell. That cell
# writes 'statistical_features3.xlsx' to the working directory, so read that
# workbook back rather than a CSV copy under an absolute, user-specific path
# that no cell in this notebook creates.
file_path = 'statistical_features3.xlsx'

try:
    # Read the workbook (first sheet; the header row becomes the column names)
    result_df = pd.read_excel(file_path)

    # Display the first few rows of the DataFrame
    print(result_df.head())

except FileNotFoundError:
    # Best effort: report the problem and leave any existing result_df as is
    print(f"File '{file_path}' not found.")
except ValueError as e:
    # read_excel raises ValueError when the workbook format cannot be determined
    print("Error parsing Excel file:", e)





   Row      Mean    Median  Standard Deviation   Minimum    Maximum  Skewness  \
0    1  3.081904 -1.601415            9.139285 -1.644376  23.343714  1.724531   
1    2  2.988530 -1.566712            9.366890 -1.596126  23.872967  1.766376   
2    3  2.918725 -1.553699            9.395109 -1.601642  23.891865  1.775798   
3    4  3.020860 -1.568881            9.400740 -1.586196  23.976921  1.764983   
4    5  2.909476 -1.580826            9.403243 -1.588403  23.901318  1.775999   

   Kurtosis  
0  4.073283  
1  4.157364  
2  4.175569  
3  4.154643  
4  4.175951  


# Feature selection

In [119]:
import numpy as np
import pandas as pd
from scipy.stats import entropy


# Define the objective function
def objective_function(features):
    """Score a candidate position by adding up all of its entries.

    Args:
        features: Array-like of numeric values.

    Returns:
        The sum over every element of ``features``.
    """
    total = np.sum(features)
    return total

# Function to calculate entropy-based gain
def entropy_gain(features, g_best):
    """Entropy of the distance-based weights of ``features`` around ``g_best``.

    Every row of ``features`` receives a weight proportional to ``exp(-d)``,
    where ``d`` is the row's Euclidean distance to ``g_best``. The normalized
    weights are passed to ``scipy.stats.entropy``.

    The distances are shifted by their minimum before exponentiation. The
    shift cancels out in the normalization, so the probabilities are the same,
    but the closest row always gets weight 1. Without the shift, every weight
    underflows to 0 when all rows are far from ``g_best`` and the result
    becomes 0/0 = NaN.

    Args:
        features: 2-D array with one candidate per row.
        g_best: Array that broadcasts against the rows of ``features``.

    Returns:
        The entropy of the normalized weights, as computed by
        ``scipy.stats.entropy`` with its default (natural) logarithm.
    """
    distances = np.linalg.norm(features - g_best, axis=1)
    # Guard the empty case: min() of an empty array would raise
    shift = distances.min() if distances.size else 0.0
    weights = np.exp(-(distances - shift))
    probabilities = weights / np.sum(weights)
    return entropy(probabilities)

# Function to update positions based on Eq. (5)
def update_position_phase1(position, selected_pufferfish):
    """Exploration step (Eq. 5): move ``position`` relative to a chosen member.

    Draws, per coordinate, a uniform random factor in [0, 1) and an integer
    factor that is either 1 or 2, then returns
    ``position + r * (selected_pufferfish - H * position)``.

    Args:
        position: Array holding the current position.
        selected_pufferfish: Array of the member to move relative to; must
            broadcast against ``position``.

    Returns:
        A new array with the updated position; ``position`` is not modified.
    """
    shape = position.shape
    random_factor = np.random.rand(*shape)
    intensity = np.random.choice([1, 2], size=shape)
    pull = selected_pufferfish - intensity * position
    return position + random_factor * pull

# Function to update positions based on Eq. (7)
def update_position_phase2(position, lower_bound, upper_bound, itr):
    """Exploitation step (Eq. 7): perturb ``position`` within a shrinking range.

    Each coordinate is shifted by ``(1 - 2 * rand) * (upper_bound - lower_bound)
    / itr`` with ``rand`` uniform in [0, 1), so the step size decreases as the
    iteration counter grows.

    Args:
        position: Array holding the current position.
        lower_bound: Lower search bound (broadcasts against ``position``).
        upper_bound: Upper search bound (broadcasts against ``position``).
        itr: Iteration counter used as the divisor; must be non-zero.

    Returns:
        A new array with the perturbed position; ``position`` is not modified.
    """
    direction = 1 - 2 * np.random.rand(*position.shape)
    search_span = upper_bound - lower_bound
    return position + direction * search_span / itr

# Function to perform feature selection using POA
def feature_selection(dataset, population_size, max_iterations):
    """Run the POA search loop and return the selected feature indices.

    A population of ``population_size`` random positions (one coordinate per
    dataset column) is evolved for ``max_iterations`` iterations. In every
    iteration each member takes an exploration step (phase 1) followed by an
    exploitation step whose range shrinks with the iteration counter
    (phase 2). A step is accepted only if it raises the member's objective
    value, so ``Z[i]`` always describes the position stored in ``A[i]``.

    NOTE(review): the returned indices are the fixed list ``[1, 3, 4, 5]``.
    They are not derived from the search result (``g_best``). The list is kept
    unchanged so existing callers get the same value. TODO: derive the
    selection from ``g_best`` once the selection rule is confirmed.

    Args:
        dataset: 2-D array; only ``dataset.shape[1]`` (the number of columns)
            is used.
        population_size: Number of members in the population.
        max_iterations: Number of search iterations to run.

    Returns:
        list[int]: The selected feature (column) indices.
    """
    # Reseeds NumPy's global RNG so repeated calls follow the same trajectory
    np.random.seed(42)
    n_features = dataset.shape[1]
    lower_bound = np.zeros(n_features)
    upper_bound = np.ones(n_features)
    selected_features = [1, 3, 4, 5]
    A = np.random.rand(population_size, n_features)
    Z = np.apply_along_axis(objective_function, 1, A)

    g_best = None
    best_objective_value = -np.inf

    for itr in range(1, max_iterations + 1):
        for i in range(population_size):
            # Phase 1: Exploration
            # NOTE(review): candidates are the members with a *lower* objective
            # value, while acceptance below treats higher as better. Confirm
            # this matches the intended candidate rule.
            candidate_indices = np.flatnonzero(Z < Z[i])
            if candidate_indices.size > 0:
                pick = candidate_indices[np.random.randint(candidate_indices.size)]
                trial = update_position_phase1(A[i], A[pick])
                trial_value = objective_function(trial)
                # Keep the move only if it improves this member
                if trial_value > Z[i]:
                    A[i] = trial
                    Z[i] = trial_value

            # Phase 2: Exploitation
            trial = update_position_phase2(A[i], lower_bound, upper_bound, itr)
            trial_value = objective_function(trial)
            if trial_value > Z[i]:
                A[i] = trial
                Z[i] = trial_value

        # Update global best solution; copy it so later in-place updates of A
        # cannot alter the stored best position
        current_best_index = np.argmax(Z)
        if Z[current_best_index] > best_objective_value:
            best_objective_value = Z[current_best_index]
            g_best = A[current_best_index].copy()

    # Return only after all iterations have run (also covers max_iterations == 0)
    return selected_features

# Set parameters for POA
population_size, max_iterations = 50, 100

# Run the selection on the statistical-feature table and report the outcome
selected_features_1 = feature_selection(
    dataset=result_df.values,
    population_size=population_size,
    max_iterations=max_iterations,
)
print("Selected features 1:", selected_features_1)


Selected features 1: [1, 3, 4, 5]


# GRU Model (Unsupervised Reconstruction)

In [120]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense


# Step 1: Take the selected columns (by position) as the model input
# NOTE(review): summary_stats_1 is not defined in the visible cells - confirm
# which cell creates it before running on a fresh kernel.
X = summary_stats_1.iloc[:, selected_features_1].values

# Step 2: Standardize every selected column
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
n_samples, n_features = X_scaled.shape

# The GRU reads each sample as a sequence of n_features steps with one channel
X_sequences = X_scaled.reshape(n_samples, n_features, 1)

# Step 3: GRU encoder followed by dense layers; the output layer has one unit
# per input feature
gru_layers = [
    GRU(units=64, input_shape=(n_features, 1)),  # Adjust units as needed
    Dense(64, activation='relu'),  # Add additional dense layers as needed
    Dense(n_features),
]
model = Sequential(gru_layers)

# Step 4: Mean squared error between the model output and the scaled input
model.compile(optimizer='adam', loss='mse')

# Step 5: Train the model to reproduce its own (scaled) input
model.fit(X_sequences, X_scaled, epochs=10, batch_size=32)

# No evaluation step for unsupervised learning 


Epoch 1/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.9846
Epoch 2/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.9434 
Epoch 3/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.8556 
Epoch 4/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.8239 
Epoch 5/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.7933 
Epoch 6/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.6950 
Epoch 7/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.6537 
Epoch 8/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.6395 
Epoch 9/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.5849 
Epoch 10/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.5571 


<keras.src.callbacks.history.History at 0x1f5cb9b89e0>