In [1]:
import pandas as pd

In [96]:
df = pd.read_csv('CleanedData.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,#YYY/MM/DD,MAG,LAT,LON,DEPTH,Q
0,21,1980-01-02,2.35,32.445,-115.162,4.8,C
1,30,1980-01-02,3.15,34.449,-119.68,15.6,D
2,33,1980-01-02,2.83,33.04,-115.499,5.1,A
3,38,1980-01-03,2.49,32.967,-115.542,14.5,A
4,48,1980-01-03,2.52,33.943,-116.304,0.7,A


# Data Transformations

Normalize the data with min-max normalization

In [3]:
# Define the columns to be normalized
cols_to_norm = ["LAT","LON","DEPTH"]

# Get the orignal data
original_data = df[cols_to_norm]

# Get the normalized data
normalized_data = (df[cols_to_norm] - df[cols_to_norm].min()) / (df[cols_to_norm].max() - df[cols_to_norm].min())

print("Original data headings:")
print(original_data.columns.tolist())
print("\nNormalized data headings:")
print(normalized_data.columns.tolist())

# Print the normalized data
print("\nNormalized data:")
print(normalized_data)

Original data headings:
['LAT', 'LON', 'DEPTH']

Normalized data headings:
['LAT', 'LON', 'DEPTH']

Normalized data:
           LAT       LON     DEPTH
0       0.0890  0.880391  0.117550
1       0.4898  0.298700  0.296358
2       0.2080  0.837003  0.122517
3       0.1934  0.831466  0.278146
4       0.3886  0.733359  0.049669
...        ...       ...       ...
415201  0.2586  0.537788  1.000000
415202  0.0926  0.668984  0.057947
415203  0.2790  0.657139  0.268212
415204  0.0796  0.741728  0.370861
415205  0.2962  0.714304  0.259934

[415206 rows x 3 columns]


Normalize the data via Z-score standardization

In [4]:
from scipy import stats

# Define the columns to be standardized
cols_to_std = ["MAG", "DEPTH"]

original_data = df[cols_to_std]

# Apply z-score standardization to the selected columns
df[cols_to_std] = stats.zscore(df[cols_to_std])
standardized_data = df[cols_to_std]

print("Standardized data:")
print(standardized_data)

Standardized data:
             MAG      DEPTH
0       1.496376  -0.604225
1       2.619082   1.699209
2       2.169999  -0.540240
3       1.692850   1.464600
4       1.734951  -1.478676
...          ...        ...
415201  1.454275  10.763649
415202 -0.580630  -1.372036
415203 -0.594663   1.336632
415204 -0.061378   2.658974
415205 -0.763069   1.229991

[415206 rows x 2 columns]


Data Reduction

In [5]:
# SVD Data Reduction
import numpy as np
from sklearn.decomposition import TruncatedSVD

# Select columns to use for SVD
X = df[['MAG','LAT','LON','DEPTH']]

# Fit SVD model to data
svd = TruncatedSVD(n_components=2, random_state=42)
X_svd = svd.fit_transform(X)

# Create new dataframe with SVD results
outcome = pd.DataFrame(data=X_svd, columns=['SVD1','SVD2'])
print(outcome.head())

         SVD1      SVD2
0  119.638725  0.223477
1  124.537867  1.361512
2  120.129412 -0.120575
3  120.150121  1.462051
4  121.155835 -1.191984
