In [None]:
import torch

# Report which compute device PyTorch can see: print the name of CUDA device 0
# when CUDA is available, otherwise print a notice that the CPU will be used.
# This cell only prints; it does not select or store a device.
if torch.cuda.is_available():
    print("GPU trouvé :", torch.cuda.get_device_name(0))
else:
    print("Aucun GPU trouvé, utilisation du CPU.")


In [None]:
# pytorch mlp for binary classification
from numpy import vstack
from pandas import read_csv
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.utils.data import random_split
from torch import Tensor
from torch.nn import Linear
from torch.nn import ReLU
from torch.nn import Sigmoid
from torch.nn import Module
from torch.optim import SGD
from torch.nn import BCELoss
from torch.nn.init import kaiming_uniform_
from torch.nn.init import xavier_uniform_
from tqdm import tqdm

In [None]:
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt # plotting
import numpy as np # linear algebra
import os # accessing directory structure
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [None]:
# List the contents of the ../input/nyse directory.
print(os.listdir('../input/nyse'))

In [None]:
def plotScatterMatrix(df, plotSize, textSize):
    """Draw a scatter-plot matrix (KDE on the diagonal) of ``df``'s numeric columns.

    Columns containing NaN and columns with a single unique value are dropped
    first, and at most the first 10 remaining columns are plotted. Every panel
    above the diagonal is annotated with the pairwise correlation coefficient.

    Args:
        df: pandas DataFrame to visualise. The caller's frame is not modified;
            all filtering is done on derived frames.
        plotSize: Side length of the square figure, passed as ``figsize``.
        textSize: Font size used for the correlation annotations.

    Returns:
        None. The figure is displayed with ``plt.show()``. If no column
        survives the filtering, a message is printed and nothing is drawn.
    """
    df = df.select_dtypes(include=[np.number])  # keep only numerical columns
    # Drop columns (not rows) that contain NaN, then constant columns; these
    # would otherwise make the data singular for the kernel density plots.
    df = df.dropna(axis=1)
    df = df[[col for col in df if df[col].nunique() > 1]]
    if df.shape[1] == 0:
        # Nothing meaningful to plot; mirror plotCorrelationMatrix and bail out.
        print('No scatter matrix shown: there are no non-NaN, non-constant numeric columns')
        return
    # Cap the matrix at 10 columns to keep the kernel density plots tractable.
    df = df[list(df)[:10]]
    ax = pd.plotting.scatter_matrix(df, alpha=0.75, figsize=[plotSize, plotSize], diagonal='kde')
    corrs = df.corr().values
    # Call numpy directly: ``plt.np`` only works because pyplot happens to
    # import numpy internally and is not part of matplotlib's public API.
    for i, j in zip(*np.triu_indices_from(ax, k=1)):
        ax[i, j].annotate('Corr. coef = %.3f' % corrs[i, j], (0.8, 0.2), xycoords='axes fraction', ha='center', va='center', size=textSize)
    plt.suptitle('Scatter and Density Plot')
    plt.show()

In [None]:

# Distribution graphs (histogram/bar graph) of column data
def plotPerColumnDistribution(df, nGraphShown, nGraphPerRow):
    """Plot one distribution graph per eligible column of ``df``.

    A column is eligible when it has more than 1 and fewer than 50 unique
    values. Columns whose first value is not a numpy numeric type get a bar
    chart of value counts; the others get a histogram.

    Args:
        df: pandas DataFrame to visualise. The caller's frame is not modified.
        nGraphShown: Maximum number of columns to plot.
        nGraphPerRow: Number of graphs placed on each row of the figure grid.

    Returns:
        None. The figure is displayed with ``plt.show()``. If there is nothing
        to plot, a message is printed and no figure is created.
    """
    nunique = df.nunique()
    df = df[[col for col in df if nunique[col] > 1 and nunique[col] < 50]] # For displaying purposes, pick columns that have between 1 and 50 unique values
    nCol = df.shape[1]
    columnNames = list(df)
    nShown = min(nCol, nGraphShown)
    if nShown <= 0:
        print(f'No distribution plots shown: {nCol} eligible columns, nGraphShown={nGraphShown}')
        return
    # Ceiling division with ``//``: plt.subplot needs an integer row count,
    # and true division (``/``) would hand it a float. The grid is sized for
    # the graphs actually drawn rather than for every eligible column.
    nGraphRow = (nShown + nGraphPerRow - 1) // nGraphPerRow
    plt.figure(num = None, figsize = (6 * nGraphPerRow, 8 * nGraphRow), dpi = 80, facecolor = 'w', edgecolor = 'k')
    for i in range(nShown):
        plt.subplot(nGraphRow, nGraphPerRow, i + 1)
        columnDf = df.iloc[:, i]
        # The chart type is decided from the type of the column's first value.
        if (not np.issubdtype(type(columnDf.iloc[0]), np.number)):
            valueCounts = columnDf.value_counts()
            valueCounts.plot.bar()
        else:
            columnDf.hist()
        plt.ylabel('counts')
        plt.xticks(rotation = 90)
        plt.title(f'{columnNames[i]} (column {i})')
    plt.tight_layout(pad = 1.0, w_pad = 1.0, h_pad = 1.0)
    plt.show()


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

def plotCorrelationMatrix(df, graphWidth):
    """Show an annotated heatmap of the correlations between ``df``'s numeric columns.

    Columns containing NaN, constant columns and non-numeric columns are
    excluded before the correlation matrix is computed.

    Args:
        df: pandas DataFrame to analyse. If it carries a ``dataframeName``
            attribute, that value is used in the plot title; otherwise the
            title falls back to ``'DataFrame'``.
        graphWidth: Side length of the square figure, passed as ``figsize``.

    Returns:
        None. The heatmap is displayed with ``plt.show()``. If fewer than two
        columns remain after filtering, a message is printed and nothing is
        drawn.
    """
    # Read the label up front, from the frame the caller passed in, and fall
    # back to a default so frames without the ad-hoc attribute do not raise
    # AttributeError.
    filename = getattr(df, 'dataframeName', 'DataFrame')

    # Drop columns with NaN values and keep columns with more than one unique value
    df = df.dropna(axis=1)
    df = df[[col for col in df if df[col].nunique() > 1]]

    # Keep only numeric columns
    df = df.select_dtypes(include=[np.number])

    # If there are fewer than 2 numeric columns, exit
    if df.shape[1] < 2:
        print(f'No correlation plots shown: The number of non-NaN or constant numeric columns ({df.shape[1]}) is less than 2')
        return

    # Calculate correlation matrix
    corr = df.corr()

    # Plot correlation matrix with annotations
    plt.figure(figsize=(graphWidth, graphWidth))
    sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm", square=True, cbar_kws={'shrink': .8})
    plt.xticks(rotation=90)
    plt.yticks(rotation=0)
    plt.title(f'Correlation Matrix for {filename}', fontsize=15)
    plt.show()


Let's check the first file: ../input/nyse/fundamentals.csv


In [None]:

# Load fundamentals.csv in full: no nrows limit is passed to read_csv.
df1 = pd.read_csv('../input/nyse/fundamentals.csv', delimiter=',')
# Tag the frame with its source file name as an ad-hoc attribute.
df1.dataframeName = 'fundamentals.csv'
nRow, nCol = df1.shape
print(f'There are {nRow} rows and {nCol} columns')

In [None]:
# Preview the first rows of the fundamentals table.
df1.head()

In [None]:
# Correlation heatmap for the fundamentals table; 50 is the graphWidth argument.
plotCorrelationMatrix(df1, 50)


In [None]:
# Scatter/density matrix for the fundamentals table (plotSize=20, textSize=10).
plotScatterMatrix(df1, 20, 10)


In [None]:
import pandas as pd

def remove_highly_correlated_columns(df, threshold=0.8):
    """Drop numeric columns that correlate strongly with an earlier numeric column.

    Only the numeric columns of ``df`` are considered, and only numeric
    columns appear in the result. A column is removed when its absolute
    correlation with any column positioned before it exceeds ``threshold``.

    Args:
        df: pandas DataFrame to filter. The caller's frame is not modified.
        threshold: Absolute correlation above which a column is removed.

    Returns:
        A new DataFrame holding the numeric columns that were kept. The
        number of removed columns is printed as a side effect.
    """
    numeric_df = df.select_dtypes(include=[np.number])

    # Work on absolute values so strong negative correlations count as well.
    abs_corr = numeric_df.corr().abs()

    # Mask out the diagonal and lower triangle: each pair is then inspected
    # once, in the column belonging to the later member of the pair.
    strict_upper = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
    upper_only = abs_corr.where(strict_upper)

    # Collect every column with at least one above-threshold entry.
    redundant = []
    for name in upper_only.columns:
        if any(upper_only[name] > threshold):
            redundant.append(name)

    kept = numeric_df.drop(columns=redundant)

    print(f"Removed {len(redundant)} columns with correlation above {threshold}.")
    return kept


In [None]:
# Apply the correlation filter to the fundamentals table with the default threshold.
df_reduced=remove_highly_correlated_columns(df1)

In [None]:
# Show the (rows, columns) shape of the filtered frame.
df_reduced.shape

In [None]:
# Load prices.csv in full: no nrows limit is passed to read_csv.
df2 = pd.read_csv('../input/nyse/prices.csv', delimiter=',')
# Tag the frame with its source file name as an ad-hoc attribute.
df2.dataframeName = 'prices.csv'
nRow, nCol = df2.shape
print(f'There are {nRow} rows and {nCol} columns')

In [None]:
# Preview the first rows of the prices table.
df2.head()

In [None]:
# Correlation heatmap for the prices table; 10 is the graphWidth argument.
plotCorrelationMatrix(df2, 10)


In [None]:
# Scatter/density matrix for the prices table (plotSize=20, textSize=7).
plotScatterMatrix(df2, 20, 7)


In [None]:
# prices-split-adjusted.csv may have more rows in reality; only the first 200000 rows are loaded (nrows = 200000)
df3 = pd.read_csv('../input/nyse/prices-split-adjusted.csv', delimiter=',',nrows = 200000)
# Tag the frame with its source file name as an ad-hoc attribute.
df3.dataframeName = 'prices-split-adjusted.csv'
nRow, nCol = df3.shape
print(f'There are {nRow} rows and {nCol} columns')

In [None]:
# Preview the first rows of the split-adjusted prices table.
df3.head()

In [None]:
# Correlation heatmap for the split-adjusted prices table; 8 is the graphWidth argument.
plotCorrelationMatrix(df3, 8)


In [None]:
# Scatter/density matrix for the split-adjusted prices table (plotSize=15, textSize=10).
plotScatterMatrix(df3, 15, 10)
