## Import Libraries and Load Data

In [1]:
# Import libraries

import os
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.decomposition import TruncatedSVD
import patsy

# Set seed
np.random.seed(4031)

In [2]:
# Data file locations and names.
# Each fold is read from <project_root_dir>/<project_subdir_prefix><k>/ and holds
# one training file and one test file with the names below.

project_root_dir = "Data"
project_subdir_prefix = "fold_"
train_data_filename = "train.csv"
test_data_filename = "test.csv"


# The number of train/test data folders expected under project_root_dir.
# (The cell that lists the subfolders asserts that exactly this many exist.)

n_datasets = 10

In [3]:
# Get the list of data subfolders, each holding a separate training and test set.
# Only the first level below project_root_dir is needed, so take the first entry
# that os.walk yields instead of traversing (and materialising) the whole tree.
# If the root folder does not exist os.walk yields nothing; fall back to an empty
# listing so the assertion below reports the problem.

os_walk = os.walk(project_root_dir)
data_subdir_list = next(os_walk, (None, [], []))[1]
n_subdirs = len(data_subdir_list)

# Fail early, with a readable message, if the folder layout is not what the
# loading loop expects.
assert n_subdirs == n_datasets, (
    f"Expected {n_datasets} subfolders in '{project_root_dir}', "
    f"found {n_subdirs}: {data_subdir_list}"
)

In [4]:
# Read every fold's training and test CSV into two parallel lists
# (position k holds the data of folder <project_subdir_prefix><k+1>).
# Folder names are built from the fold number rather than taken from the
# directory listing, so that "fold_10" cannot end up right after "fold_1".

fold_dirs = [os.path.join(project_root_dir, project_subdir_prefix + str(fold_num))
             for fold_num in range(1, n_subdirs + 1)]

train_datasets = [pd.read_csv(os.path.join(fold_dir, train_data_filename))
                  for fold_dir in fold_dirs]
test_datasets = [pd.read_csv(os.path.join(fold_dir, test_data_filename))
                 for fold_dir in fold_dirs]

## Define Scoring function

In [5]:
# Define a WMAE function for scoring

def wmae(data_dir='Data', num_folds=10):
    """Compute the weighted mean absolute error (WMAE) of each fold's predictions.

    For every fold k in 1..num_folds, reads ``<data_dir>/fold_<k>/mypred.csv``,
    left-joins it to the labelled test data in ``<data_dir>/test_with_label.csv``
    on (Date, Store, Dept), and averages the absolute errors with weight 5 on
    rows flagged ``IsHoliday`` and weight 1 on all other rows.

    Parameters
    ----------
    data_dir : str, default 'Data'
        Folder holding ``test_with_label.csv`` and the ``fold_<k>`` subfolders.
    num_folds : int, default 10
        Number of folds to score.

    Returns
    -------
    list
        One WMAE value per fold, in fold order. A fold's value is NaN when any
        of its prediction rows has no matching label (or a missing prediction),
        because missing values are deliberately not skipped.
    """
    test = pd.read_csv(f'{data_dir}/test_with_label.csv')
    wae = []

    for i in range(num_folds):
        test_pred = pd.read_csv(f'{data_dir}/fold_{i+1}/mypred.csv')

        # Left join with the test data: keeps every predicted row, scored or not
        new_test = test_pred.merge(test, on=['Date', 'Store', 'Dept'], how='left')

        # Compute the Weighted Absolute Error
        abs_err = (new_test['Weekly_Sales'] - new_test['Weekly_Pred']).abs()
        # Truthy IsHoliday -> weight 5, otherwise weight 1
        weights = np.where(new_test['IsHoliday'].astype(bool), 5, 1)
        # skipna=False so unmatched rows surface as NaN instead of being dropped silently
        wae.append((abs_err * weights).sum(skipna=False) / weights.sum())

    return wae

## Preprocess Data

In [294]:
#Derive year/week features from the Date column and one-hot encode the week
def preprocess(data):
    """Return a copy of ``data`` with a ``Yr`` column and one-hot ``Week_<n>`` columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``Date`` column that ``pd.to_datetime`` can parse.
        The frame passed in is left unmodified.

    Returns
    -------
    pandas.DataFrame
        The input columns plus ``Yr`` (calendar year) and ``Week_1`` .. ``Week_52``
        indicator columns built from the ISO week number. An ISO week outside
        1..52 is not one of the categories, so such a row gets no indicator set.
    """
    #Work on a copy so the caller's frame (e.g. an entry of train_datasets) is not mutated
    data = data.copy()

    #Split date into useful features
    tmp = pd.to_datetime(data['Date'])
    data['Wk'] = tmp.dt.isocalendar().week
    data['Yr'] = tmp.dt.year
    data['Wk'] = pd.Categorical(data['Wk'], categories=[i for i in range(1, 53)])  # 52 weeks

    #One hot encode Wk (the categorical dtype yields all 52 columns, observed or not)
    data = pd.get_dummies(data, columns=['Wk'], prefix='Week')

    return data

## SVD Implementation (not working)

In [None]:
# SVD pipeline, one fold at a time: for each (store, dept, year) slice, reduce the
# predictors with TruncatedSVD, fit OLS on the SVD scores, and predict the matching
# test rows. The predictions of a fold are written to Data/fold_<k>/mypred.csv.
for j in range(n_datasets):
    # preprocess adds the Yr column and the one-hot Week_* columns
    fold_train = preprocess(train_datasets[j])
    fold_test = preprocess(test_datasets[j])

    # Prediction frames of the individual slices; concatenated once per fold below
    # (calling pd.concat inside the innermost loop re-copies everything each time).
    fold_preds = []

    stores = fold_train["Store"].unique()
    depts = fold_train["Dept"].unique()
    years = fold_train["Yr"].unique()

    # NOTE(review): only years that occur in the training data are visited, so test rows
    # from any other year never receive a prediction - confirm this is intended.
    for store in stores:
        for dept in depts:
            for year in years:
                #Find training and test data for this store, department and year
                train = fold_train[(fold_train["Store"] == store) & (fold_train["Dept"] == dept) & (fold_train["Yr"] == year)]
                test = fold_test[(fold_test["Store"] == store) & (fold_test["Dept"] == dept) & (fold_test["Yr"] == year)]

                #Skip slices with nothing to fit on, or nothing to predict
                #(svd.transform cannot be applied to an empty test matrix)
                if len(train) == 0 or len(test) == 0:
                    continue

                Y_train = train["Weekly_Sales"]
                X_train = train.drop(["Weekly_Sales", "Date"], axis=1)
                X_test = test.drop(["Date"], axis=1)

                #Implement SVD (default number of components)
                svd = TruncatedSVD()
                svd_result = svd.fit_transform(X_train)
                #Reuse the training index: statsmodels raises when endog and exog
                #are pandas objects whose indexes differ
                svd_df = pd.DataFrame(svd_result, index=X_train.index)

                #Train model on the SVD scores only (no intercept column is added)
                model = sm.OLS(Y_train, svd_df).fit()
                mycoef = model.params.fillna(0)

                #Project the test predictors onto the components fitted on the training slice
                X_test_svd = svd.transform(X_test)

                #Predict Y, keeping Store, Dept and Date for the later merge with the labels.
                #assign() builds a new frame instead of writing into a slice of `test`.
                tmp_pred = test[['Store', 'Dept', 'Date']].assign(Weekly_Pred=np.dot(X_test_svd, mycoef))
                fold_preds.append(tmp_pred)

    if fold_preds:
        test_pred = pd.concat(fold_preds, ignore_index=True)
        # Plain assignment; a chained fillna(..., inplace=True) may act on a temporary copy
        test_pred['Weekly_Pred'] = test_pred['Weekly_Pred'].fillna(0)
    else:
        # No slice produced predictions: still write a file with the expected header
        test_pred = pd.DataFrame(columns=['Store', 'Dept', 'Date', 'Weekly_Pred'])

    # Save the output to CSV
    file_path = f'Data/fold_{j+1}/mypred.csv'
    print(f'fold_{j+1} processed')
    test_pred.to_csv(file_path, index=False)


## Linear Refactor (not working)

In [None]:
# Linear pipeline, one fold at a time: for each (store, dept, year) slice, fit OLS on
# Yr, the one-hot Week_* columns and an intercept, then predict the matching test rows.
# The predictions of a fold are written to Data/fold_<k>/mypred.csv.
for j in range(n_datasets):
    # preprocess adds the Yr column and the one-hot Week_* columns
    fold_train = preprocess(train_datasets[j])
    fold_test = preprocess(test_datasets[j])

    # Prediction frames of the individual slices; concatenated once per fold below
    # (calling pd.concat inside the innermost loop re-copies everything each time).
    fold_preds = []

    stores = fold_train["Store"].unique()
    depts = fold_train["Dept"].unique()
    years = fold_train["Yr"].unique()

    # NOTE(review): only years that occur in the training data are visited, so test rows
    # from any other year never receive a prediction - confirm this is intended.
    for store in stores:
        for dept in depts:
            for year in years:
                #Find training and test data for this store, department and year
                train = fold_train[(fold_train["Store"] == store) & (fold_train["Dept"] == dept) & (fold_train["Yr"] == year)]
                test = fold_test[(fold_test["Store"] == store) & (fold_test["Dept"] == dept) & (fold_test["Yr"] == year)]

                #Skip slices with nothing to fit on, or nothing to predict
                if len(train) == 0 or len(test) == 0:
                    continue

                Y_train = train["Weekly_Sales"]

                #Predictors: everything except the target, the date and the identifier/flag columns
                X_train = train.drop(["Weekly_Sales", "Date", "Store", "Dept", "IsHoliday"], axis=1)
                X_test = test.drop(["Date", "Store", "Dept", "IsHoliday"], axis=1)

                #Add intercept columns
                X_train["Intercept"] = 1
                X_test["Intercept"] = 1

                #Cast one hot bools to ints
                X_train = X_train.astype(int)
                X_test = X_test.astype(int)

                #Train a plain OLS model on these predictors
                model = sm.OLS(Y_train, X_train).fit()
                mycoef = model.params.fillna(0)

                #Predict Y, keeping Store, Dept and Date for the later merge with the labels.
                #assign() builds a new frame instead of writing into a slice of `test`.
                tmp_pred = test[['Store', 'Dept', 'Date']].assign(Weekly_Pred=np.dot(X_test, mycoef))
                fold_preds.append(tmp_pred)

    if fold_preds:
        test_pred = pd.concat(fold_preds, ignore_index=True)
        # Plain assignment; a chained fillna(..., inplace=True) may act on a temporary copy
        test_pred['Weekly_Pred'] = test_pred['Weekly_Pred'].fillna(0)
    else:
        # No slice produced predictions: still write a file with the expected header
        test_pred = pd.DataFrame(columns=['Store', 'Dept', 'Date', 'Weekly_Pred'])

    # Save the output to CSV
    file_path = f'Data/fold_{j+1}/mypred.csv'
    print(f'fold_{j+1} processed')
    test_pred.to_csv(file_path, index=False)

## Original Code (OLS only)

In [6]:
#OLS method requires a different preprocess function

def preprocess(data):
    """Return a copy of ``data`` with a numeric ``Yr`` and a categorical ``Wk`` column.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``Date`` column that ``pd.to_datetime`` can parse.
        The frame passed in is left unmodified.

    Returns
    -------
    pandas.DataFrame
        The input columns plus ``Wk`` (ISO week number as a categorical with the
        fixed categories 1..52; any other week number becomes missing) and
        ``Yr`` (calendar year).
    """
    #Work on a copy so the caller's frame is not mutated
    data = data.copy()

    #Split date into useful features
    tmp = pd.to_datetime(data['Date'])
    data['Wk'] = tmp.dt.isocalendar().week
    data['Yr'] = tmp.dt.year
    data['Wk'] = pd.Categorical(data['Wk'], categories=[i for i in range(1, 53)])  # 52 weeks

    return data


In [None]:
#Original OLS model: one ordinary least squares fit per (Store, Dept) pair in each fold.
#The predictions of a fold are written to Data/fold_<k>/mypred.csv.

# Loop over folds
for j in range(n_datasets):

    # Get a pair of training and test sets
    train = train_datasets[j]
    test = test_datasets[j]

    # Prediction frames of the individual (store, dept) pairs; concatenated once per
    # fold below (calling pd.concat inside the loop re-copies everything each time).
    fold_preds = []

    # Identify the distinct store/dept pairs shared by the training and test set.
    train_pairs = train[['Store', 'Dept']].drop_duplicates(ignore_index=True)
    test_pairs = test[['Store', 'Dept']].drop_duplicates(ignore_index=True)
    unique_pairs = pd.merge(train_pairs, test_pairs, how = 'inner', on =['Store', 'Dept'])

    # Keep only the training rows of the shared pairs. unique_pairs comes from an inner
    # join of the pairs of both sets, so every pair on the left has rows in train.
    train_split = unique_pairs.merge(train, on=['Store', 'Dept'], how='left')

    # Add numeric column for the year and a categorical column for week # to the training set
    train_split = preprocess(train_split)
    # Get design matrices for training y and X.
    # y is just the target variable, Weekly_Sales.
    # X has pivoted weeks, where individual weeks are separate 0/1 columns; Weekly_Sales,
    # Store and Dept are carried along in X so they survive the groupby below.
    y, X = patsy.dmatrices('Weekly_Sales ~ Weekly_Sales + Store + Dept + Yr  + Wk',
                        data = train_split,
                        return_type='dataframe')
    # Get dictionary where keys are (Store, Dept) tuples, and values are the
    # design-matrix rows corresponding to each key.
    # The design matrices include an Intercept column with value 1.
    train_split = dict(tuple(X.groupby(['Store', 'Dept'])))


    # Now keep only the test rows of the shared pairs.
    test_split = unique_pairs.merge(test, on=['Store', 'Dept'], how='left')
    # Add numeric column for the year and a categorical column for week # to the test set
    test_split = preprocess(test_split)
    # Get design matrices for test y and X.
    # Test sets have no Weekly_Sales column, so Yr stands in as the left-hand side;
    # the resulting y is not used.
    y, X = patsy.dmatrices('Yr ~ Store + Dept + Yr  + Wk',
                        data = test_split,
                        return_type='dataframe')
    # Re-add Date column to the design matrix X (aligned on the row index)
    X['Date'] = test_split['Date']
    # Get dictionary where keys are (Store, Dept) tuples, and values are the
    # design-matrix rows (plus Date) corresponding to each key.
    test_split = dict(tuple(X.groupby(['Store', 'Dept'])))

    # Get the training (store, dept) tuples.
    # These are expected to be keys of test_split too, since both dictionaries
    # are built from unique_pairs.
    keys = list(train_split)

    # Loop over (store, dept) tuples
    for key in keys:
        # Get training and test design matrices corresponding to (store, dept)
        X_train = train_split[key]
        X_test = test_split[key]

        # Target variable for (store, dept)
        Y = X_train['Weekly_Sales']
        # Drop ID and target to get just a table of predictors
        X_train = X_train.drop(['Weekly_Sales','Store', 'Dept'], axis=1)

        # Identify columns that are all zero in training predictors, and drop them
        # from both training and test X.
        # This drops weeks that are not represented in this pair's training data.
        cols_to_drop = X_train.columns[(X_train == 0).all()]
        X_train = X_train.drop(columns=cols_to_drop)
        X_test = X_test.drop(columns=cols_to_drop)


        # Identify X training columns that are (numerically) a linear combination of the
        # columns to their left. The scan stops before index 1, so the first two
        # columns are never tested.
        cols_to_drop = []
        for i in range(len(X_train.columns) - 1, 1, -1):  # Start from the last column and move backward
            col_name = X_train.columns[i]
            # Extract the current column and all previous columns
            tmp_Y = X_train.iloc[:, i].values
            tmp_X = X_train.iloc[:, :i].values

            # np.sum of an empty residual array is 0, so a column is also dropped
            # whenever lstsq reports no residuals at all.
            coefficients, residuals, rank, s = np.linalg.lstsq(tmp_X, tmp_Y, rcond=None)
            if np.sum(residuals) < 1e-10:
                cols_to_drop.append(col_name)

        # Drop those collinear columns from both training and test X.
        X_train = X_train.drop(columns=cols_to_drop)
        X_test = X_test.drop(columns=cols_to_drop)

        # Fit a regular ordinary least squares model on training Weekly_Sales.
        model = sm.OLS(Y, X_train).fit()
        mycoef = model.params.fillna(0)

        # Predict, keeping Store, Dept and Date for the later merge with the labels.
        # assign() builds a new frame instead of writing into a slice of X_test.
        X_test_predictors = X_test.drop(['Store', 'Dept', 'Date'], axis=1)
        tmp_pred = X_test[['Store', 'Dept', 'Date']].assign(Weekly_Pred=np.dot(X_test_predictors, mycoef))
        fold_preds.append(tmp_pred)

    if fold_preds:
        test_pred = pd.concat(fold_preds, ignore_index=True)
        # Plain assignment; a chained fillna(..., inplace=True) may act on a temporary copy
        test_pred['Weekly_Pred'] = test_pred['Weekly_Pred'].fillna(0)
    else:
        # No pair produced predictions: still write a file with the expected header
        test_pred = pd.DataFrame(columns=['Store', 'Dept', 'Date', 'Weekly_Pred'])

    # Save the output to CSV
    file_path = f'Data/fold_{j+1}/mypred.csv'
    print(f'fold_{j+1} processed')
    test_pred.to_csv(file_path, index=False)

## Evaluate Predictions

In [None]:
# Score every fold, then print one tab-indented WMAE per line followed by the
# average over all folds.
wae = wmae()
mean_wae = sum(wae) / len(wae)
report_lines = [f"\t{fold_wae:.3f}" for fold_wae in wae] + [f"{mean_wae:.3f}"]
print("\n".join(report_lines))

Linear errors

	2049.347
	1467.113
	1446.882
	1595.628
	2334.678
	1675.221
	1720.828
	1427.286
	1443.787
	1444.677
1660.545

## Edit original OLS: group stores by year and add SVD/PCA

In [23]:
# Try on a single fold: j = 2 (zero-based list index into train_datasets/test_datasets)
j = 2

# Components to return from SVD. This is from the example in Campuswire post #364:
# https://campuswire.com/c/G06C55090/feed/364
n_components = 8
temp_seed = 4031

# Get a pair of training and test sets
train = train_datasets[j]
test = test_datasets[j]

test_pred = pd.DataFrame()

# Identify the distinct store/dept pairs shared by the training and test set.
# Will only process these.

train_pairs = train[['Store', 'Dept']].drop_duplicates(ignore_index=True)
test_pairs = test[['Store', 'Dept']].drop_duplicates(ignore_index=True)
unique_pairs = pd.merge(train_pairs, test_pairs, how = 'inner', on =['Store', 'Dept'])

# Keep only the training rows of the shared pairs. unique_pairs comes from an inner
# join of the pairs of both sets, so every pair on the left has rows in train.
train_split = unique_pairs.merge(train, on=['Store', 'Dept'], how='left')

# Prep train data for SVD/PCA.
# For each department, construct a dataframe consisting of:
# rows = dates; columns = store numbers; and values = weekly sales.

# Get columns needed for SVD
train_store_dept_date_sales = train_split[["Store", "Dept", "Date", "Weekly_Sales"]]
# Group by Dept. Create dict where key = Dept and value = dataframe of Store/Date/Weekly_Sales.
# Group by the scalar "Dept", not the one-element list ["Dept"]: with a list, pandas >= 2.0
# yields 1-tuple keys such as (1,), and lookups like train_X[1] would raise KeyError.
train_split = dict(tuple(train_store_dept_date_sales.groupby("Dept")))

# Pivot each department's dataframe so that sales dates are rows and stores are columns, with values = Weekly_Sales.
# Fill in missing values with zeroes.
train_split = {dept: df.drop(columns=["Dept"])
                       .pivot(index="Date", columns="Store", values="Weekly_Sales")
                       .reset_index()
                       .fillna(0)
               for dept, df in train_split.items()}
# Split sales dates out from remaining data
train_sales_dates = {dept: df.Date for dept, df in train_split.items()}
# Get just the sales figures for each dept
train_X = {dept: df.drop(columns=["Date"]) for dept, df in train_split.items()}

# Show the (number of dates, number of stores) shape of every department's matrix
#train_X[1]
{dept: df.shape for dept, df in train_X.items()}

# Perform SVD and choose the n_components most influential components.
#svd = TruncatedSVD(n_components=n_components, random_state=temp_seed)
#svd.fit_transform(train_X[1])
# Collect dict of reduced-dimension training data, one entry per department.
#svd_dict = {dept:svd.fit_transform(df) for dept, df in train_X.items()}
#svd_dict[1]

{1: (73, 45),
 2: (73, 45),
 3: (73, 45),
 4: (73, 45),
 5: (73, 45),
 6: (73, 42),
 7: (73, 45),
 8: (73, 45),
 9: (73, 45),
 10: (73, 45),
 11: (73, 45),
 12: (73, 45),
 13: (73, 45),
 14: (73, 45),
 16: (73, 45),
 17: (73, 45),
 18: (73, 37),
 19: (73, 33),
 20: (73, 45),
 21: (73, 45),
 22: (73, 41),
 23: (73, 43),
 24: (73, 41),
 25: (73, 45),
 26: (73, 43),
 27: (73, 43),
 28: (73, 43),
 29: (73, 38),
 30: (73, 37),
 31: (73, 45),
 32: (73, 41),
 33: (73, 39),
 34: (73, 39),
 35: (73, 37),
 36: (73, 37),
 37: (73, 18),
 38: (73, 45),
 40: (73, 45),
 41: (73, 40),
 42: (73, 45),
 43: (1, 1),
 44: (73, 41),
 45: (73, 28),
 46: (73, 45),
 47: (67, 18),
 48: (73, 15),
 49: (73, 33),
 50: (73, 11),
 51: (73, 20),
 52: (73, 44),
 54: (73, 37),
 55: (73, 41),
 56: (73, 43),
 58: (73, 32),
 59: (73, 45),
 60: (73, 45),
 65: (73, 1),
 67: (73, 45),
 71: (73, 38),
 72: (73, 43),
 74: (73, 45),
 78: (22, 11),
 79: (73, 45),
 80: (73, 44),
 81: (73, 45),
 82: (73, 45),
 83: (73, 41),
 85: (7

In [13]:
# Preview the date-by-store pivot of Weekly_Sales for department 2.
# NOTE(review): this expects train_split[2] to still be the un-pivoted frame that has a
# "Dept" column. The single-fold cell above already replaces every entry of train_split
# with its pivot, so on a fresh top-to-bottom run drop(columns=["Dept"]) would presumably
# raise KeyError - confirm and either remove this cell or run it before the pivot step.
train_split[2].drop(columns=["Dept"]).pivot(index="Date", columns="Store", values="Weekly_Sales").reset_index()

Store,Date,1,2,3,4,5,6,7,8,9,...,36,37,38,39,40,41,42,43,44,45
0,2010-02-05,50605.27,74661.16,17566.72,91481.24,11955.45,50810.04,21129.93,37550.10,24917.84,...,15606.15,16121.22,9601.89,52359.34,27089.38,43875.06,14983.84,22720.10,8577.33,38214.01
1,2010-02-12,44682.74,65487.46,15177.30,85763.19,11794.51,46456.01,21635.61,35205.40,22488.29,...,15127.60,14419.56,10201.55,50874.07,25187.64,38415.81,14330.30,19942.01,8169.20,27384.43
2,2010-02-19,47928.89,70853.58,15676.11,86570.21,11707.09,48127.95,21900.24,38435.19,22173.92,...,15244.69,15382.87,9700.21,56270.81,25082.06,39140.52,15024.70,23421.42,7951.32,39601.72
3,2010-02-26,44292.87,64963.90,15112.23,84405.45,11452.33,46128.34,21804.64,34999.91,22311.77,...,15600.86,15354.55,10126.21,52825.12,24338.23,40876.97,14988.04,21550.61,8990.75,34421.20
4,2010-03-05,48397.98,68428.64,16477.28,87922.48,11300.69,52939.75,21228.66,36528.11,23609.13,...,15170.52,15732.79,10863.06,56766.01,25673.66,41030.89,15018.05,22607.30,8581.43,34631.06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68,2011-05-27,45072.11,64755.07,16396.82,93786.26,13005.92,49611.81,17519.49,34057.22,24921.08,...,13473.34,16890.99,10381.16,67898.16,25729.44,47129.05,15458.19,20831.05,9572.17,34991.20
69,2011-06-03,47844.27,65848.98,18860.29,95548.42,13791.92,56378.34,19234.93,36601.08,27252.62,...,12261.35,16783.06,10552.41,74980.46,28477.20,50087.94,16478.75,20756.12,9038.64,32532.02
70,2011-06-10,46363.93,66831.49,17134.57,93516.12,12687.22,53504.63,21561.66,35293.99,25906.82,...,12099.80,17710.99,11151.02,68460.66,25203.79,47687.31,16512.57,21112.52,9182.69,35991.25
71,2011-06-17,42939.63,66648.26,16598.43,96510.39,12213.47,50712.17,21189.99,35512.57,26369.13,...,14355.82,16940.94,10567.89,68181.39,27118.87,47584.55,16380.41,21292.31,8810.27,36629.18


In [15]:
# Pivot each department's dataframe so that sales dates are rows and stores are columns, with values = Weekly_Sales
# (no fillna here, unlike the pivot in the single-fold cell above).
# NOTE(review): this overwrites train_split and expects its values to still be the
# un-pivoted per-department frames with a "Dept" column; after the single-fold cell above
# has run they are already pivoted, so this presumably fails on a fresh top-to-bottom
# run - confirm.
train_split = {dept:df.drop(columns=["Dept"]).pivot(index="Date", columns="Store", values="Weekly_Sales").reset_index() \
               for dept, df in train_split.items()}
train_split[1]

Store,Date,1,2,3,4,5,6,7,8,9,...,36,37,38,39,40,41,42,43,44,45
0,2010-02-05,24924.50,35034.06,6453.58,38724.42,9323.89,25619.00,8970.97,16181.89,12861.40,...,2144.48,11283.23,6732.38,21244.50,18116.85,16971.05,10425.77,6476.76,6871.20,18628.11
1,2010-02-12,46039.49,60483.70,12748.72,69872.44,16861.10,43749.81,14026.65,34262.09,20273.94,...,4091.72,16184.33,9132.55,39584.16,26138.18,30204.01,15725.68,18597.64,12315.65,22416.94
2,2010-02-19,41595.55,58221.52,8918.31,49937.09,11417.67,34750.82,12477.79,22319.25,14819.97,...,3101.57,10722.08,8045.28,23025.91,23172.75,20694.24,13300.99,9939.45,7751.11,28756.53
3,2010-02-26,19403.54,25962.32,4992.00,30107.54,7168.41,19896.08,8602.73,11722.71,10530.98,...,1451.39,9256.03,5951.54,14011.39,14728.82,12816.16,9303.34,6460.56,6014.71,14656.08
4,2010-03-05,21827.90,27372.05,5172.73,31580.69,8344.13,22839.36,9541.12,12979.74,10438.47,...,1510.25,9766.32,6485.19,14875.08,18494.41,15154.51,10244.26,6939.08,6120.60,16608.87
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68,2011-05-27,15741.60,21334.34,5194.21,25035.61,7901.75,16479.47,6616.65,8140.29,8687.90,...,1680.75,8491.25,5549.96,14700.37,13445.91,17880.00,9021.05,5584.23,7716.02,10632.88
69,2011-06-03,16434.15,22174.12,4906.27,25401.86,8175.25,17836.28,7839.66,9182.01,9240.67,...,1810.76,8829.34,5496.54,16575.04,14579.85,16970.53,9004.52,6172.43,8044.30,11142.70
70,2011-06-10,15883.52,21554.22,4790.03,24726.22,8170.13,18069.27,7728.34,8570.93,8963.92,...,1816.06,9983.83,5555.61,14498.75,13945.42,17552.61,9408.68,5450.76,7632.33,10312.84
71,2011-06-17,14978.09,22431.56,5738.30,27152.27,8157.72,17045.25,7781.62,9379.33,8638.83,...,1828.68,9502.31,5022.86,14906.77,14417.44,17648.40,8472.14,5580.68,7580.63,11742.00


In [16]:
# Number of entries in train_split (one per department, since it is keyed by Dept)
len(train_split)

79

In [25]:
# Take a copy of department 65's sales matrix for experimentation. Without .copy(),
# scratch_X would be the very same object as train_X[65], and any column added to
# scratch_X would silently end up in the training matrix as well.
scratch_X = train_X[65].copy()
scratch_X

Store,34
0,41057.25
1,39799.36
2,42908.44
3,40515.35
4,49353.20
...,...
68,46790.83
69,43607.42
70,39256.49
71,38254.73


In [28]:
# Pad the scratch matrix with all-zero columns labelled "35" through "49"
# (the labels are strings).
for padded_label in map(str, range(35, 50)):
    scratch_X[padded_label] = 0

scratch_X

Store,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49
0,41057.25,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,39799.36,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,42908.44,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,40515.35,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,49353.20,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68,46790.83,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
69,43607.42,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
70,39256.49,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
71,38254.73,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [29]:
# Fit a truncated SVD on department 1's date-by-store sales matrix and display the
# reduced representation. The number of components and the seed come from the
# n_components / temp_seed settings of the single-fold cell rather than from
# repeated literals, so the two cannot drift apart.
svd = TruncatedSVD(n_components=n_components, random_state=temp_seed)
svd.fit_transform(train_X[1])

array([[ 1.48807268e+05, -9.99353517e+03, -3.18608963e+01,
        -4.37803668e+03, -2.67268940e+03, -3.67391331e+03,
        -3.71424484e+03, -1.91465438e+03],
       [ 2.41045082e+05, -3.02587043e+04,  1.69425699e+04,
         1.30125058e+04,  7.44887881e+03, -1.43381733e+04,
        -6.22495960e+03,  4.44900402e+03],
       [ 1.84280398e+05, -2.67636306e+04,  1.77219628e+04,
        -1.38735259e+04, -7.91462081e+03, -8.82273176e+03,
        -8.56182600e+03, -1.38090735e+02],
       [ 1.12611339e+05, -1.28745480e+04,  6.13199188e+01,
        -4.78428038e+03, -7.68655354e+03, -1.06196543e+02,
         2.69388836e+03,  1.27912358e+03],
       [ 1.26443571e+05, -9.42570471e+03, -1.85355589e+03,
        -6.87753006e+03, -9.43867658e+03, -6.61670766e+02,
         9.00026200e+02,  1.90892115e+03],
       [ 1.35863547e+05, -2.56713770e+03, -5.22163212e+03,
        -8.46320284e+03, -5.94078955e+03, -5.81386135e+02,
        -7.32659014e+02, -2.34759150e+03],
       [ 1.45204659e+05,  3.420466

In [31]:
# Count exact zeros in department 1's SVD-reduced matrix.
# The previous `svd == 0` compared the fitted estimator object itself with 0, which is
# simply False, so the sum was 0 no matter what the data looked like.
# NOTE(review): assumes the reduced matrix (rather than svd.components_) is what was
# meant to be inspected - confirm.
np.sum(svd.transform(train_X[1]) == 0)

0