In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ppa-wl-sl-data-setup/__results__.html
/kaggle/input/ppa-wl-sl-data-setup/full_sparse_matrix.npz
/kaggle/input/ppa-wl-sl-data-setup/__notebook__.ipynb
/kaggle/input/ppa-wl-sl-data-setup/__output__.json
/kaggle/input/ppa-wl-sl-data-setup/custom.css


In [2]:
import gc
from scipy.sparse import load_npz, hstack, dok_matrix
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Load the prepared sparse matrix.
# load_npz returns the matrix in whatever sparse format it was saved in, and
# some formats (COO in particular) do not support slicing. Converting to CSR
# up front guarantees the column slices below work; for a matrix that is
# already CSR the conversion does not copy.
full_matrix = load_npz('/kaggle/input/ppa-wl-sl-data-setup/full_sparse_matrix.npz').tocsr()

# Separate features and labels: the last column is the label, everything
# before it is a feature.
X = full_matrix[:, :-1]  # Features (kept sparse)
y = full_matrix[:, -1].toarray().ravel()  # Labels as a dense 1-D array

# Sanity check: exactly one label per feature row.
assert X.shape[0] == y.shape[0], "feature/label row count mismatch"

# Free up memory: the combined matrix is no longer needed once X and y exist.
del full_matrix
gc.collect()

# Train-test split (80/20, fixed seed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)




In [25]:
# Show the shape of the full label vector, then of the training labels.
for label_array in (y, y_train):
    print(label_array.shape)

(158100,)
(126480, 37, 2, 2, 2, 2)


In [4]:
# import gc
# from scipy.sparse import load_npz, csr_matrix
# from sklearn.linear_model import SGDClassifier
# from sklearn.metrics import classification_report, accuracy_score
# from sklearn.model_selection import train_test_split
# from tqdm import tqdm
# # Initialize the SGDClassifier
# model = SGDClassifier(max_iter=1000, tol=1e-3)

# # Fit the model in batches to save memory
# batch_size = 1000
# n_samples = X_train.shape[0]

In [5]:
# def fit_batch(start, model):
#     end = min(start + batch_size, n_samples)
#     X_batch = X_train[start:end].toarray()  # Convert to array to ensure it is writable
#     y_batch = y_train[start:end]

#     # Ensure the array is writable
#     X_batch.setflags(write=1)

#     model.partial_fit(X_batch, y_batch, classes=np.unique(y_train))
#     return model

# # Create a pool of parallel workers
# n_jobs = 4
# batch_starts = range(0, n_samples, batch_size)

# # Initialize a model instance for each job to avoid race conditions
# models = [SGDClassifier(max_iter=1000, tol=1e-3) for _ in range(n_jobs)]



In [6]:
# for start in tqdm(range(0, n_samples, batch_size), total=n_samples//batch_size + 1):
#     end = min(start + batch_size, n_samples)
#     X_batch = X_train[start:end].toarray()
#     y_batch = y_train[start:end]
#     model.partial_fit(X_batch, y_batch, classes=np.unique(y_train))

# # Make predictions
# y_pred = model.predict(X_test.toarray())

# # Evaluate the model
# print("Accuracy:", accuracy_score(y_test, y_pred))
# print(classification_report(y_test, y_pred))

In [20]:
import gc
import numpy as np
from scipy.sparse import load_npz
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import classification_report, accuracy_score

In [27]:
# Debug: Print the shape of y before splitting
print("Shape of y before splitting:", y.shape)

# Train-test split.
# Splitting again from the untouched X / y keeps this cell safe to re-run:
# y_train and y_test are overwritten with one-hot matrices further down, so
# without a fresh split a second run would one-hot encode them again.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Debug: Print the shape of y_train and y_test after splitting
print("Shape of y_train after splitting:", y_train.shape)
print("Shape of y_test after splitting:", y_test.shape)

# Ensure that y_train and y_test are flat 1-D label vectors
y_train = y_train.reshape(-1)
y_test = y_test.reshape(-1)

# Debug: Print the shape of y_train and y_test after reshaping
print("Shape of y_train after reshaping:", y_train.shape)
print("Shape of y_test after reshaping:", y_test.shape)

# Standardize the features.
# with_mean=False scales by variance only; centering would destroy sparsity.
# The scaler is fitted on the training split only and then applied to the
# test split, so no test-set statistics leak into training.
scaler = StandardScaler(with_mean=False)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# One-hot encode the labels.
# to_categorical treats each label as a column index, so the width must be
# (largest label + 1). Counting the distinct values seen in y_train is only
# correct when labels are contiguous from 0 AND every class happens to land in
# the training split; deriving the width from the full label vector is correct
# in both of those cases and also when they do not hold.
num_classes = int(y.max()) + 1
y_train = to_categorical(y_train, num_classes)
y_test = to_categorical(y_test, num_classes)

# Debug: Print the shape of y_train and y_test after one-hot encoding
print("Shape of y_train after one-hot encoding:", y_train.shape)
print("Shape of y_test after one-hot encoding:", y_test.shape)


Shape of y before splitting: (158100,)
Shape of y_train after splitting: (126480,)
Shape of y_test after splitting: (31620,)
Shape of y_train after reshaping: (126480,)
Shape of y_test after reshaping: (31620,)
Shape of y_train after one-hot encoding: (126480, 37)
Shape of y_test after one-hot encoding: (31620, 37)


In [42]:
# Seed Python, NumPy and TensorFlow in one call so that weight initialisation
# and shuffling start from the same state on every run.
# NOTE(review): some GPU kernels are still non-deterministic, so results may
# not be bit-identical across runs.
tf.keras.utils.set_random_seed(42)

# Strategy for multi-GPU training: variables created inside the scope are
# mirrored across all visible GPUs.
strategy = tf.distribute.MirroredStrategy()

with strategy.scope():
    # Build a deeper neural network.
    # The input width is declared with an explicit Input object; passing
    # input_shape= to the first Dense layer is deprecated in Keras 3.
    model = Sequential([tf.keras.Input(shape=(X_train.shape[1],))])

    # Three hidden stages of decreasing width, each followed by batch
    # normalisation and 50% dropout.
    for units in (128, 64, 32):
        model.add(Dense(units, activation='relu'))
        model.add(BatchNormalization())
        model.add(Dropout(0.5))

    # Softmax output: one probability per class, matching the one-hot labels.
    model.add(Dense(num_classes, activation='softmax'))

    # Compile the model
    model.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])


In [43]:
# Train the model.
# Stopping a run by hand (KeyboardInterrupt) leaves `history` unassigned and
# keeps whatever weights the model had at that moment. EarlyStopping ends
# training automatically once validation loss has not improved for `patience`
# epochs and restores the weights from the best epoch, so the cell always
# finishes and `history` is always defined.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

# 20% of the training data is held out by Keras as the validation set.
history = model.fit(
    X_train,
    y_train,
    epochs=40,
    batch_size=128,
    validation_split=0.2,
    callbacks=[early_stopping],
    verbose=1,
)


Epoch 1/40
[1m791/791[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 31ms/step - accuracy: 0.0458 - loss: 3.9489 - val_accuracy: 0.0919 - val_loss: 3.3732
Epoch 2/40
[1m791/791[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 30ms/step - accuracy: 0.0953 - loss: 3.3824 - val_accuracy: 0.0957 - val_loss: 3.3563
Epoch 3/40
[1m791/791[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 30ms/step - accuracy: 0.1029 - loss: 3.3448 - val_accuracy: 0.0924 - val_loss: 3.3542
Epoch 4/40
[1m137/791[0m [32m━━━[0m[37m━━━━━━━━━━━━━━━━━[0m [1m17s[0m 26ms/step - accuracy: 0.1084 - loss: 3.3153

KeyboardInterrupt: 

In [None]:
# Score the trained network on the held-out test split. The result unpacks
# into the loss followed by the accuracy metric.
test_scores = model.evaluate(X_test, y_test, verbose=0)
loss, accuracy = test_scores
print(f"Test Accuracy: {accuracy:.4f}")

In [None]:
# Predict per-class scores for the test split, then collapse both the
# predictions and the one-hot ground truth to class indices via arg-max
# along the class axis.
y_pred = model.predict(X_test)
y_pred_classes = y_pred.argmax(axis=1)
y_true = y_test.argmax(axis=1)

In [None]:
# Build the classification report for true vs. predicted class indices, then print it.
report_text = classification_report(y_true, y_pred_classes)
print(report_text)