# LIBRARIES

In [None]:
import numpy  as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler

In [None]:
import warnings

# Ignore all warnings
# NOTE(review): "ignore" with no category/module filter silences every warning
# raised after this cell runs, including library deprecation and convergence
# warnings from the cells below. Consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

# Data


In [None]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
# The `google.colab` module is imported here, so this cell is expected to run
# inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Location of the dataset CSV on the mounted drive
data_path = '/content/drive/My Drive/infolimpioavanzadoTarget.csv'

# Load the whole file into a DataFrame and preview the first rows
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,date,open,high,low,close,adjclose,volume,ticker,RSIadjclose15,RSIvolume15,...,high-15,K-15,D-15,stochastic-k-15,stochastic-d-15,stochastic-kd-15,volumenrelativo,diff,INCREMENTO,TARGET
0,2022-01-03,17.799999,18.219,17.5,17.76,17.76,106600,ASLE,,,...,,,,,,,0.919758,-1.900001,-9.664295,0
1,2022-01-04,17.700001,18.309999,17.620001,17.66,17.66,128700,ASLE,,,...,,,,,,,1.11044,-1.379999,-7.247895,0
2,2022-01-05,17.58,17.799999,16.91,16.950001,16.950001,103100,ASLE,,,...,,,,,,,0.88956,-0.93,-5.201344,0
3,2022-01-06,16.65,16.879999,16.139999,16.17,16.17,173600,ASLE,,,...,,,,,,,1.497843,-0.36,-2.177856,0
4,2022-01-07,16.219999,16.290001,15.63,15.71,15.71,137800,ASLE,,,...,,,,,,,1.188956,-0.12,-0.758054,0


# Handling Missing values

In [None]:
# Missing values: absolute number of null cells in each column
missing_counts = df.isna().sum()
print("Number of missing values:\n", missing_counts)

# The same counts expressed as a percentage of the total number of rows
missing_percentage = (missing_counts / len(df)) * 100

print("Percentage of Missing Values:\n",missing_percentage)


Number of missing values:
 date                  0
open                  0
high                  0
low                   0
close                 0
                   ... 
stochastic-kd-15    587
volumenrelativo     215
diff                155
INCREMENTO          155
TARGET                0
Length: 1285, dtype: int64
Percentage of Missing Values:
 date                0.000000
open                0.000000
high                0.000000
low                 0.000000
close               0.000000
                      ...   
stochastic-kd-15    7.544017
volumenrelativo     2.763141
diff                1.992032
INCREMENTO          1.992032
TARGET              0.000000
Length: 1285, dtype: float64


In [None]:
# Drop every column, and then every row, whose share of missing cells
# exceeds the corresponding threshold.
column_threshold = 0.25
null_share_per_column = df.isna().mean(axis=0)
df = df.loc[:, null_share_per_column <= column_threshold]

# Row shares are computed after the column filter, i.e. over the surviving columns
row_threshold = 0.25
null_share_per_row = df.isna().mean(axis=1)
df = df.loc[null_share_per_row <= row_threshold]

# Report what is still missing, restricted to the columns that have any gaps left
columns_with_missing_values = df.columns[df.isna().any()]
df_clone = df[columns_with_missing_values].copy()
missing_percentage = (df_clone.isna().sum() / len(df)) * 100

print("Percentage of Missing Values:\n",missing_percentage)

Percentage of Missing Values:
 RSIadjclose25       2.149601
RSIvolume25         2.149601
RSIadjclose50      12.868555
RSIvolume50        12.868555
MACDadjclose15      2.149601
                     ...    
stochastic-k-5      0.014524
stochastic-d-5      0.159768
stochastic-kd-5     0.159768
diff                2.178649
INCREMENTO          2.178649
Length: 654, dtype: float64


In [None]:
# Fill the remaining missing values with a forward fill followed by a backward fill.
# Forward fill is typically more appropriate for stock data as it preserves the
# latest known state of the stock until new data becomes available.
#
# `DataFrame.ffill()` / `DataFrame.bfill()` replace `fillna(method=...)`, which is
# deprecated since pandas 2.1. The result is reassigned instead of filled in place
# because `df` was produced by `.loc` filtering in the previous cell, and in-place
# modification of such a frame can trigger chained-assignment warnings.
#
# NOTE(review): the fill runs over the whole frame in row order. The preview of
# the data shows a `ticker` column and NaN indicator values on the first rows, so
# values may be carried across ticker boundaries (ffill) or pulled back from later
# dates (bfill). Confirm whether a per-ticker fill is wanted.
df = df.ffill().bfill()

# Checking if any missing values left
missing_percentage = (df.isnull().sum() / len(df)) * 100

print("Percentage of Missing Values:\n",missing_percentage)

Percentage of Missing Values:
 date                0.0
open                0.0
high                0.0
low                 0.0
close               0.0
                   ... 
stochastic-kd-15    0.0
volumenrelativo     0.0
diff                0.0
INCREMENTO          0.0
TARGET              0.0
Length: 1271, dtype: float64


In [None]:
# Columns in which every single entry equals zero
zero_columns = df.columns[df.eq(0).all(axis=0)]
# Keep everything except those all-zero columns; `df` itself is left untouched
df1 = df.drop(zero_columns, axis=1)

In [None]:
# Data Separation: keep only the numeric columns
numerical_df = df1.select_dtypes(include='number')
# Columns whose largest absolute value exceeds the threshold
extreme_value_threshold = 10000
column_abs_max = numerical_df.abs().max()
extreme_columns = column_abs_max.index[column_abs_max > extreme_value_threshold].tolist()

print("Number of Columns with extreme values:", len(extreme_columns))



Number of Columns with extreme values: 801


In [None]:
extreme_value_threshold = 10000

# True wherever a cell's magnitude is above the threshold
extreme_values_mask = numerical_df.abs().gt(extreme_value_threshold)

# Per-row flag: does the row contain at least one such cell?
rows_with_extreme_values = extreme_values_mask.any(axis="columns")

# Booleans sum as 0/1, so this is the number of flagged rows
num_rows_with_extreme_values = rows_with_extreme_values.sum()

print("Number of rows with extreme values:", num_rows_with_extreme_values)

Number of rows with extreme values: 6885


In [None]:
# Build the feature matrix X and the label vector y from the numeric columns.
#
# The extreme-valued columns are removed by reassignment with errors="ignore"
# rather than an in-place drop, so re-running this cell does not raise KeyError
# for columns that an earlier run already removed.
numerical_df = numerical_df.drop(columns=extreme_columns, errors="ignore")

# Features: everything except the price column `adjclose` and the label itself
X = numerical_df.drop(columns=['adjclose', 'TARGET'])
# Label: taken from the same frame as X so both are guaranteed to share rows
y = numerical_df["TARGET"]

# NOTE(review): columns such as `diff` and `INCREMENTO` stay in X unless they were
# dropped as extreme. If TARGET is derived from them, this leaks the label into
# the features. Confirm how TARGET is defined.

# Regression Models

In [None]:
# Train and test split
# The split happens before scaling so the scaler never sees the held-out rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing for numerical data: Apply StandardScaler
# The scaler is fitted on the training rows only and the same fitted transform is
# then applied to the test rows. Fitting on all of X would let test-set means and
# variances leak into training. X itself is left unscaled.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [None]:
# Candidate regressors, keyed by the name used when reporting results
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42)
}
# Fit each regressor on the training split and score it on the test split
for name in models:
    model = models[name]

    # fit() returns the estimator itself, so training and prediction can be chained
    y_pred = model.fit(X_train, y_train).predict(X_test)

    # Error and goodness-of-fit on the held-out rows
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"{name} Model Performance:")
    print(f"Mean Squared Error: {mse}")
    print(f"R-squared: {r2}\n")

Linear Regression Model Performance:
Mean Squared Error: 0.23763685143580113
R-squared: -0.6042543417503852

Decision Tree Model Performance:
Mean Squared Error: 0.034858387799564274
R-squared: 0.7646757241732889

Random Forest Model Performance:
Mean Squared Error: 0.017477559912854032
R-squared: 0.8820113496539349



In [None]:
# Logistic Regression Model
logistic_model = LogisticRegression(max_iter=1000, random_state=42)

# Fit on the training split, then predict labels for the test split
y_pred_logistic = logistic_model.fit(X_train, y_train).predict(X_test)

# Score the predictions against the held-out labels with each classification metric
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred_logistic)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print("Logistic Regression Model Performance:")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}\n")

Logistic Regression Model Performance:
Accuracy: 0.9070442992011619
Precision: 0.8805031446540881
Recall: 0.5622489959839357
F1 Score: 0.6862745098039216

