In [4]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Load the data and clean it in one chained expression (no in-place mutation):
#   1. +/-infinity -> NaN
#   2. every NaN (original or produced by step 1) -> 0
# Later cells read `data` in this cleaned form.
data = (
    pd.read_csv('crspm_and_predictors.csv')
    .replace([float('inf'), float('-inf')], float('nan'))
    .fillna(0)
)
signal_doc = pd.read_csv('SignalDoc.csv')

# Apply MinMaxScaler with feature_range (-1, 1) to every column.
# NOTE: the scaler is fit on all rows of `data`. When a model is evaluated on a
# held-out split, refit a scaler on the training rows only; otherwise the
# held-out rows' minima/maxima influence the training features.
scaler = MinMaxScaler(feature_range=(-1, 1))
scaled_data = pd.DataFrame(
    scaler.fit_transform(data),
    columns=data.columns,
    index=data.index,  # keep row labels aligned with `data`
)

# Save the preprocessed data if needed
scaled_data.to_csv('preprocessed_data.csv', index=False)

In [5]:
# Keep the first 500 rows (in file order) for quick inspection
subset_data = data.head(500)

In [6]:
# Summary statistics for numerical columns
print(subset_data.describe())

# General information about the DataFrame.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
subset_data.info()

             permno         yyyymm         prc         ret       Price  \
count    500.000000     500.000000  500.000000  500.000000  500.000000   
mean   10659.664000  199794.096000    7.557017    2.318719   -1.316136   
std        0.855903     832.124169   11.551829   25.163394    1.388918   
min    10659.000000  198609.000000   -6.750000  -85.714300   -3.792564   
25%    10659.000000  199108.000000   -0.539062   -8.392825   -2.751094   
50%    10659.000000  199601.000000    1.578125    0.000000   -1.266848   
75%    10661.000000  200503.250000   15.660000    6.857825   -0.071577   
max    10661.000000  201508.000000   44.370000  240.000000    2.212927   

             Size  STreversal      MaxRet      High52  RealizedVol  ...  \
count  500.000000  500.000000  500.000000  500.000000   500.000000  ...   
mean    -9.541182   -2.318719   -0.110619    0.649137    -0.045330  ...   
std      3.477378   25.163394    0.176142    0.396359     0.056623  ...   
min    -14.026541 -240.000000   -

In [7]:
# Preview the first five rows of the subset
subset_data.head(5)

Unnamed: 0,permno,yyyymm,prc,ret,Price,Size,STreversal,MaxRet,High52,RealizedVol,...,MomSeason06YrPlus,MomOffSeason06YrPlus,grcapx,EntMult,Investment,PctTotAcc,EarnSupBig,EarningsSurprise,BetaTailRisk,CBOperProf
0,10659,199102,-0.17188,-15.3846,1.760959,-8.844859,15.3846,0.0,0.407414,-0.035295,...,0.0,0.0,0.799007,0.0,0.0,0.795044,-0.379424,-0.109776,0.0,-0.371791
1,10659,199103,-0.10938,-36.3636,2.212927,-8.39289,36.3636,0.0,0.259268,-0.065733,...,0.0,0.0,0.799007,0.0,0.0,0.795044,-0.483346,-0.109776,0.0,-0.371791
2,10659,199104,-0.21875,100.0,1.519826,-9.085992,-100.0,-0.571429,0.518512,-0.12787,...,0.0,0.0,0.802281,0.0,-0.043613,0.43973,-0.47016,-0.050594,0.0,-0.138481
3,10659,199105,-0.21875,0.0,1.519826,-9.085992,0.0,-0.142857,0.518512,-0.045962,...,0.0,0.0,0.802281,0.0,-0.046697,0.43973,-0.498256,-0.050594,0.0,-0.138481
4,10659,199106,-0.20313,-7.1429,1.593909,-9.011908,7.1429,0.0,0.481488,-0.015972,...,0.0,0.0,0.802281,0.0,-0.050251,0.43973,-0.116901,-0.050594,0.0,-0.138481


In [None]:
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Principal-component regression of `ret` on the remaining columns.

TARGET_COLUMN = 'ret'
# In the rows displayed earlier in this notebook (describe()/head()),
# `STreversal` is exactly `-ret` (e.g. ret = -15.3846 vs STreversal = 15.3846),
# so using it as a predictor hands the target to the model.
LEAKY_COLUMNS = ['STreversal']

# Start from the cleaned but *unscaled* frame. `scaled_data` was min-max scaled
# using every row, which would leak held-out minima/maxima into training.
# NOTE(review): `permno` and `yyyymm` look like identifier/date keys rather than
# signals; they are still kept as predictors here -- TODO confirm.
# NOTE(review): other signals may also be measured in the same month as `ret`;
# confirm whether the intended target is the *next* period's return.
X = data.drop(columns=[TARGET_COLUMN, *LEAKY_COLUMNS])
y = data[TARGET_COLUMN]  # raw returns, so the MSE below is in squared `ret` units

# Split the data into training and testing sets
# NOTE(review): rows appear to form a (permno, yyyymm) panel; a random split
# mixes months across train and test. Consider a date-based split -- TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only, then apply it to both splits.
feature_scaler = MinMaxScaler(feature_range=(-1, 1))
X_train_scaled = feature_scaler.fit_transform(X_train)
X_test_scaled = feature_scaler.transform(X_test)

# Perform PCA (fit on the training rows only)
pca = PCA()
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Select the number of principal components to use (e.g., 10)
n_components = 10
X_train_pca_reduced = X_train_pca[:, :n_components]
X_test_pca_reduced = X_test_pca[:, :n_components]

# Perform regression using the selected principal components
regressor = LinearRegression()
regressor.fit(X_train_pca_reduced, y_train)

# Predict on the test set
y_pred = regressor.predict(X_test_pca_reduced)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")