# Exploratory look at the data (PCA, pairwise scatterplots, per-feature SVMs) for better understanding

In [14]:
''' Loading in the data ''' 
#Logan Kelsch + JJ

#IMPORT LIBRARIES-------------------------------------------------------

import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC  # Understanding data dimensionality

# Load a portion of the dataset for simplicity (a quick look at its general structure).
# The CSV location can be overridden through the STOCK_NN_CSV environment variable so the
# notebook can run on another machine; the default is the original local path.
csv_file_path = os.environ.get(
    "STOCK_NN_CSV",
    r"C:\Users\jairi\OneDrive\Desktop\Repos\Stock-NN\-ES-NN-test1\CollectedData\DATA_V4.0\catted\catted_2.csv",
)
data = pd.read_csv(csv_file_path)

# Separate features and target: every column except the last is a feature,
# the last column is the target.
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

# Normalizing data.
# Features and target each get their own scaler: refitting a single scaler on y would
# discard the parameters learned from X, making it impossible to scale new feature rows
# (or invert the feature scaling) later.
# NOTE(review): both scalers are fit on the full dataset before the split below, so the
# validation rows influence the scaling — confirm this is acceptable for exploration.
feature_normalizer = MinMaxScaler()
X = feature_normalizer.fit_transform(X)
target_normalizer = MinMaxScaler()
y = target_normalizer.fit_transform(y.reshape(-1, 1))  # y is now an (n_samples, 1) column
# Backward-compatible alias: `normalizer` previously ended up fitted on the target.
normalizer = target_normalizer

# Simply split data into training and testing.
# A fixed seed makes the split identical on every Restart & Run All.
SPLIT_SEED = 42
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=SPLIT_SEED)

In [22]:
# Run PCA to find how many components are needed to keep MINIMUM_VARIANCE of the variance,
# then project the features onto that many components.
MINIMUM_VARIANCE = .999
pca_classifier = PCA()
pca_classifier.fit(X)
cumulative_variance = np.cumsum(pca_classifier.explained_variance_ratio_)

# Index of the first component whose cumulative variance is >= the threshold.
# searchsorted returns len(cumulative_variance) when the threshold is never reached
# (e.g. MINIMUM_VARIANCE = 1.0 with floating-point round-off); clamping then keeps every
# component instead of silently collapsing to a single one, which is what
# `np.argmax(mask) + 1` does when the mask is all False.
threshold_index = np.searchsorted(cumulative_variance, MINIMUM_VARIANCE, side="left")
n_components = int(min(threshold_index + 1, len(cumulative_variance)))
print(f"Number of Components Needed for {MINIMUM_VARIANCE} variance: {n_components}")

# Now, transform dataset with given n_components
pca = PCA(n_components=n_components)
X_new = pca.fit_transform(X)
print(f"New X shape (after transformation): {X_new.shape}")  # Showing reduced dimensionality
selected_components = pca.components_  # one row per retained principal axis

Number of Components Needed for 0.999 variance: 30
New X shape (after transformation): (28716, 30)
[[ 5.71711299e-02 -4.33819264e-04 -8.69864494e-04 ... -3.19228973e-03
  -4.59288318e-03 -8.00636970e-03]
 [ 6.93838407e-01  8.19646708e-04  1.51648771e-03 ...  2.11112566e-03
   3.23926076e-03  5.92615852e-03]
 [-1.14769396e-01 -1.25727518e-03 -2.30915080e-03 ... -4.93646416e-04
  -8.48003762e-04 -1.62872126e-03]
 ...
 [ 1.22141275e-04 -3.93590500e-02 -2.65728168e-03 ... -4.78066478e-01
   2.06100208e-01 -2.35849811e-02]
 [-1.70611473e-04 -1.62884508e-01  2.29021861e-02 ...  8.38974451e-02
  -3.74867419e-02  8.86524175e-03]
 [ 8.30760792e-04 -4.57041397e-02 -9.62768360e-03 ...  6.26197062e-02
  -2.79124203e-02  2.67252227e-04]]


In [25]:
# Scatter every PCA component against every other component (both orderings),
# saving one PNG per ordered pair.

X_new_df = pd.DataFrame(X_new)  # Wrap the projected array so columns can be selected by label

# Folder that collects the generated images
output_folder = "featuregraphing1"
os.makedirs(output_folder, exist_ok=True)

for col1 in X_new_df.columns:
    for col2 in X_new_df.columns:
        if col1 == col2:
            continue  # Skip the diagonal: a component plotted against itself is uninformative

        fig, ax = plt.subplots(figsize=(6, 4))
        ax.scatter(X_new_df[col1], X_new_df[col2], alpha=0.7)
        ax.set_title(f'Scatterplot: {col1} vs {col2}')
        ax.set_xlabel(col1)
        ax.set_ylabel(col2)
        ax.grid(True)

        # Write the figure into the output folder, then release it to keep memory flat
        plot_filename = os.path.join(output_folder, f"{col1}vs{col2}.png")
        fig.savefig(plot_filename)
        plt.close(fig)

In [26]:
# Graph each PCA component against the target, saving one PNG per component.

# Make a directory to save each of these graphs
output_folder = "featuregraphing2"
os.makedirs(output_folder, exist_ok=True)

for col1 in X_new_df.columns:
    plt.figure(figsize=(6, 4))
    plt.scatter(X_new_df[col1], y, alpha=0.7)
    plt.title(f'Scatterplot: {col1} vs target')
    plt.xlabel(col1)
    # The y-axis is the target. The previous code labelled it (and named the file) with a
    # `col2` variable left over from the pairwise-plot cell, which mislabelled every plot
    # and raised NameError when that cell had not been run first.
    plt.ylabel("target")
    plt.grid(True)
    # Save to the output folder
    plot_filename = os.path.join(output_folder, f"{col1}vstarget.png")
    plt.savefig(plot_filename)
    plt.close()  # Close to save memory

In [None]:
# Creating a SVM Model for each feature to target to see what happens
# NOTE: I need to do some more feature transformation (likely mapping outputs to classification) in order to do this lil analysis
# NOTE(review): SVC is a classifier and expects discrete class labels; with the
# MinMax-scaled target used here, fit() will likely reject the labels as continuous —
# confirm, then either bin the target into classes or switch to a regressor.

SVM_SPLIT_SEED = 42  # Fixed seed so the split (and the printed scores) are reproducible

hella_svms = []  # One fitted model per component, in column order

# y is an (n_samples, 1) column after scaling; estimators and metrics expect 1-D labels,
# so flatten it before splitting.
X_train, X_val, y_train, y_val = train_test_split(
    X_new_df, np.ravel(y), random_state=SVM_SPLIT_SEED
)

for col in X_train.columns:
    smodel = SVC()  # Default arguments for now
    # Double brackets keep the single column as a 2-D frame, as fit/predict require
    smodel.fit(X_train[[col]], y_train)
    y_pred = smodel.predict(X_val[[col]])
    print(f"Accuracy Score for column {col}: {accuracy_score(y_pred=y_pred, y_true=y_val)}")
    hella_svms.append(smodel)
    