# Train and Evaluate different models
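This is the main notebook for generating the experimental results for this project. We train several classifiers on each of our datasets, evaluate them on the corresponding test sets, and use the resulting metrics to validate our hypothesis.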

# Install and import dependencies

In [1]:
! pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.7.2-cp313-cp313-win_amd64.whl.metadata (11 kB)
Collecting scipy>=1.8.0 (from scikit-learn)
  Downloading scipy-1.16.3-cp313-cp313-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.5.2-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.7.2-cp313-cp313-win_amd64.whl (8.7 MB)
   ---------------------------------------- 0.0/8.7 MB ? eta -:--:--
   -- ------------------------------------- 0.5/8.7 MB 2.9 MB/s eta 0:00:03
   ---- ----------------------------------- 1.0/8.7 MB 2.9 MB/s eta 0:00:03
   -------- ------------------------------- 1.8/8.7 MB 3.2 MB/s eta 0:00:03
   ------------ --------------------------- 2.6/8.7 MB 3.4 MB/s eta 0:00:02
   ---------------- ----------------------- 3.7/8.7 MB 3.9 MB/s eta 0:00:02
   --------------------- ------------------

In [1]:
import pandas as pd

# Linear Models
from sklearn.linear_model import LogisticRegression

# Support Vector Machines
from sklearn.svm import SVC # "C" stands for Classification

# Decision Trees
from sklearn.tree import DecisionTreeClassifier

# Ensembles (Random Forest)
from sklearn.ensemble import RandomForestClassifier

# Neural Networks
from sklearn.neural_network import MLPClassifier

# Evaluation metrics (accuracy, precision, recall, F1)
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [2]:
# Load datasets
def load_into_df(file_path, target_col='Target'):
    """Read a CSV file and split it into features and labels.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file; handed unchanged to ``pd.read_csv``.
    target_col : str, default 'Target'
        Name of the column that holds the labels.

    Returns
    -------
    x : pandas.DataFrame
        Every column of the file except ``target_col``.
    y : pandas.Series
        The ``target_col`` column.

    Raises
    ------
    KeyError
        If the file has no column named ``target_col``.
    """
    df = pd.read_csv(file_path)
    if target_col not in df.columns:
        # Same exception type a plain df[target_col] lookup would raise,
        # but the message says which file and which columns were found.
        raise KeyError(
            f"{file_path} has no {target_col!r} column; found {list(df.columns)}"
        )
    y = df[target_col]
    x = df.drop(columns=target_col)
    return x, y

# Load the train/test split of each dataset. Paths are relative to the
# notebook's working directory.
X_train_basic, Y_train_basic = load_into_df("../data/dataset1_train.csv")
X_test_basic, Y_test_basic = load_into_df("../data/dataset1_test.csv")

X_train_calculated, Y_train_calculated = load_into_df("../data/dataset2_train.csv")
X_test_calculated, Y_test_calculated = load_into_df("../data/dataset2_test.csv")

# Preview features and labels. The explicit separator keeps the two previews
# apart; with print's default single space the last feature row and the first
# label row end up on the same line.
print(X_train_basic.head(5), Y_train_basic.head(5), sep="\n\n")

# One (name, X_train, Y_train, X_test, Y_test) tuple per dataset; the
# training loop iterates over this list.
datasets = [
    ("basic", X_train_basic, Y_train_basic, X_test_basic, Y_test_basic),
    ("calculated", X_train_calculated, Y_train_calculated, X_test_calculated, Y_test_calculated),
]

# A model fitted on the train split must see the same columns at test time.
for _name, _X_train, _, _X_test, _ in datasets:
    assert list(_X_train.columns) == list(_X_test.columns), (
        f"train/test columns differ for dataset {_name!r}"
    )

      Close      High       Low      Open     Volume
0  6.424606  6.439316  6.375673  6.407194  493729600
1  6.435713  6.472038  6.401790  6.442318  601904800
2  6.333346  6.461232  6.326741  6.435714  552160000
3  6.321635  6.364264  6.275704  6.356759  477131200
4  6.363663  6.364264  6.276005  6.313229  447610800 0    1
1    0
2    0
3    1
4    0
Name: Target, dtype: int64


In [6]:
# Empty list reserved for evaluation results.
results = []

# Display name -> unfitted estimator. Each name appears exactly once: a
# repeated key in a dict literal silently replaces the earlier entry.
models_to_run = {
    # Estimators applied directly to the features as loaded.
    "Logistic Regression": LogisticRegression(solver='liblinear'),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM": SVC(random_state=42),
    "MLP": MLPClassifier(random_state=42, max_iter=1000),

    # "P-" variants: the same estimators behind a StandardScaler, for models
    # that need scaling. The tree-based models have no scaled variant.
    "P-Logistic Regression": Pipeline([
        ('scaler', StandardScaler()),
        ('model', LogisticRegression(solver='liblinear'))
    ]),
    "P-SVM": Pipeline([
        ('scaler', StandardScaler()),
        ('model', SVC(random_state=42))
    ]),
    "P-MLP": Pipeline([
        ('scaler', StandardScaler()),
        ('model', MLPClassifier(random_state=42, max_iter=1000))
    ]),
}

In [7]:
# Train every model on every dataset and score it on that dataset's test split.
# Start from an empty list so re-running this cell does not duplicate rows.
results = []

for model_name, model in models_to_run.items():
    for name, X_train, Y_train, X_test, Y_test in datasets:
        print("Training ", model_name, " on ", name)
        model.fit(X_train, Y_train)
        y_pred = model.predict(X_test)

        # zero_division=0 yields the same score as the default (0.0) when a
        # model never predicts the positive class, without emitting an
        # undefined-metric warning for each such run.
        accuracy = accuracy_score(Y_test, y_pred)
        precision = precision_score(Y_test, y_pred, zero_division=0)
        recall = recall_score(Y_test, y_pred, zero_division=0)
        F1 = f1_score(Y_test, y_pred, zero_division=0)
        print(accuracy, precision, recall, F1)

        # Keep the scores so they can be compared after the loop instead of
        # being read back from printed text.
        results.append({
            "model": model_name,
            "dataset": name,
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall,
            "f1": F1,
        })

# One row per (model, dataset) run.
results_df = pd.DataFrame(results)
results_df
    

Training  Logistic Regression  on  basic
0.523936170212766 0.523936170212766 1.0 0.68760907504363
Training  Logistic Regression  on  calculated
0.523936170212766 0.523936170212766 1.0 0.68760907504363
Training  Decision Tree  on  basic
0.4940159574468085 0.5157159487776485 0.5621827411167513 0.5379477838494232
Training  Decision Tree  on  calculated
0.4867021276595745 0.5132013201320133 0.3946700507614213 0.44619799139167865
Training  Random Forest  on  basic
0.4940159574468085 0.5170239596469105 0.5203045685279187 0.5186590765338394
Training  Random Forest  on  calculated
0.5106382978723404 0.5294117647058824 0.5939086294416244 0.5598086124401914
Training  SVM  on  basic
0.523936170212766 0.523936170212766 1.0 0.68760907504363
Training  SVM  on  calculated
0.523936170212766 0.523936170212766 1.0 0.68760907504363
Training  MLP  on  basic
0.523936170212766 0.523936170212766 1.0 0.68760907504363
Training  MLP  on  calculated
0.523936170212766 0.523936170212766 1.0 0.68760907504363
Traini