<a href="https://colab.research.google.com/github/MehrDataPythonist/daily-dev-/blob/main/TabNet2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction import FeatureHasher
from scipy.sparse import hstack, vstack
from keras.models import Model
from keras.layers import Input, Dense

# Load the data in chunks
chunksize = 10000000  # Adjust the chunksize according to your available memory
train_chunks = pd.read_csv("C:\\Users\\mehrd\\Desktop\\avazu-ctr-prediction\\train.csv", chunksize=chunksize)
test_chunks = pd.read_csv("C:\\Users\\mehrd\\Desktop\\avazu-ctr-prediction\\test.csv", chunksize=chunksize)

# Preprocess the data
def preprocess_data(df):
    # Drop unnecessary columns
    df = df.drop(['id'], axis=1)

    # Convert 'hour' column to datetime
    df['hour'] = pd.to_datetime(df['hour'])

    # Extract date features
    df['day'] = df['hour'].dt.day
    df['dayofweek'] = df['hour'].dt.dayofweek
    df['hour'] = df['hour'].dt.hour

    return df

# Process the data in chunks
categorical_features = ['banner_pos', 'site_id', 'site_domain', 'site_category', 'app_id', 'app_domain', 'app_category', 'device_id', 'device_ip', 'device_model', 'device_type', 'device_conn_type', 'C1', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21']

hasher = FeatureHasher(n_features=1048576, input_type='string')

# Define an SSL embedding model using Keras
input_dim = hasher.n_features
embed_dim = 128

ssl_embedding_input = Input(shape=(input_dim,))
x = Dense(embed_dim, activation='relu')(ssl_embedding_input)
x = Dense(embed_dim, activation='relu')(x)
ssl_embedding_output = x

ssl_embedding_model = Model(inputs=ssl_embedding_input, outputs=ssl_embedding_output)

# Initialize an empty sparse matrix to store the processed data
train_processed_data = None
test_processed_data = None

for chunk in train_chunks:
    chunk_processed = preprocess_data(chunk)
    chunk_categorical = chunk_processed[categorical_features].astype(str)
    chunk_hash = hasher.transform(chunk_categorical.values)
    chunk_embedding = ssl_embedding_model.predict(chunk_hash)  # No need to call toarray()
    chunk_processed = hstack([chunk_processed.drop(categorical_features, axis=1), chunk_embedding])

    if train_processed_data is None:
        train_processed_data = chunk_processed
    else:
        train_processed_data = vstack([train_processed_data, chunk_processed])

for chunk in test_chunks:
    chunk_processed = preprocess_data(chunk)
    chunk_categorical = chunk_processed[categorical_features].astype(str)
    chunk_hash = hasher.transform(chunk_categorical.values)
    chunk_embedding = ssl_embedding_model.predict(chunk_hash)  # No need to call toarray()
    chunk_processed = hstack([chunk_processed.drop(categorical_features, axis=1), chunk_embedding])

    if test_processed_data is None:
        test_processed_data = chunk_processed
    else:
        test_processed_data = vstack([test_processed_data, chunk_processed])

# Convert the sparse matrices to CSR format for efficient storage and computation
train_processed_data = train_processed_data.tocsr()
test_processed_data = test_processed_data.tocsr()

MemoryError: Unable to allocate 107. MiB for an array with shape (14, 1000000) and data type int64

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Embedding, Flatten, Concatenate
from tensorflow.keras.models import Model

# Define the SSL model architecture
def build_ssl_model(train_df, categorical_features):
    # Define input layers for each categorical feature
    inputs = []
    embeddings = []

    # Iterate over the categorical feature columns
    for col in categorical_features:
        input_layer = Input(shape=(1,), name=col)
        inputs.append(input_layer)

        # Create an embedding layer for each categorical feature
        embedding = Embedding(train_df[col].nunique() + 1, 10)(input_layer)
        embedding = Flatten()(embedding)
        embeddings.append(embedding)

    # Concatenate the embeddings
    concat_embeddings = Concatenate()(embeddings)

    # Add dense layers for the SSL task
    dense1 = Dense(128, activation='relu')(concat_embeddings)
    dense2 = Dense(64, activation='relu')(dense1)
    output = Dense(128, activation='relu')(dense2)

    # Create the SSL model
    ssl_model = Model(inputs=inputs, outputs=output)

    return ssl_model

# Build the SSL model
categorical_features = ['banner_pos', 'site_id', 'site_domain', 'site_category', 'app_id', 'app_domain', 'app_category', 'device_id', 'device_ip', 'device_model', 'device_type', 'device_conn_type', 'C1', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21']
ssl_model = build_ssl_model(train_df, categorical_features)

# Compile the SSL model
ssl_model.compile(optimizer='adam', loss='mse')

# Train the SSL model
ssl_model.fit([train_df[col] for col in categorical_features], epochs=10, batch_size=1024)

In [None]:
!kaggle competitions download -c avazu-ctr-prediction

In [None]:
# Navigate to the directory where the ZIP file is located
import os
os.chdir('/content')

# Unzip the file
!unzip avazu-ctr-prediction.zip

In [None]:
import os
os.chdir('/content')

# Unzip the GZIP file
!gunzip train.gz
!gunzip test.gz
!gunzip sampleSubmission.gz

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import math
from sklearn.preprocessing import LabelEncoder
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
import random
import gzip
pd.options.display.float_format = '{:.2f}'.format

In [None]:
# Total number of records in the dataset
num_records = 40428967

# Desired sample size
sample_size = 5000000

# Generate a list of random indices to skip for downsampling
skip_values = sorted(random.sample(range(1, num_records), num_records - sample_size))

# Function to parse the 'hour' column as datetime
parse_date = lambda val: pd.datetime.strptime(val, '%y%m%d%H')

# Read the training data, skipping rows based on skip_values and parsing 'hour' column as datetime
train = pd.read_csv("/content/train", parse_dates=['hour'], date_parser=parse_date, skiprows=skip_values)

# Read the test data, parsing 'hour' column as datetime
test = pd.read_csv('/content/test', parse_dates=['hour'], date_parser=parse_date)


In [None]:
print('Train dataset:',train.shape)
print('Test dataset:',test.shape)
print('Submission:',submission.shape)

In [None]:
#from google.colab import drive
#drive.mount('drive')

#train.to_csv('/content/drive/MyDrive/train_azure.csv')
#test.to_csv('/content/drive/MyDrive/test_zure.csv', encoding='utf-8', index=False)

In [None]:
# Read the training data from the specified file path
train_i = pd.read_csv('/content/drive/MyDrive/train_azure.csv')

# Read the test data from the specified file path
test_i = pd.read_csv('/content/drive/MyDrive/test_zure.csv')

In [None]:
# Convert the 'hour' column to datetime format
train_i['hour'] = pd.to_datetime(train_i['hour'])

# List of columns containing date information
datecolumn = ['hour']

# Extract date components and create new columns
for col in datecolumn:
    train_i['month'] = train_i[col].dt.month        # Extract month
    train_i['dayofweek'] = train_i[col].dt.dayofweek  # Extract day of the week (0 = Monday, 6 = Sunday)
    train_i['day'] = train_i[col].dt.day            # Extract day of the month
    train_i['hour_time'] = train_i[col].dt.hour     # Extract hour of the day

# Display the first two rows of the modified DataFrame
train_i.head(2)


Unnamed: 0.1,Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,...,C16,C17,C18,C19,C20,C21,month,dayofweek,day,hour_time
0,0,10000720757801103869,0,2014-10-21,1005,0,d6137915,bb1ef334,f028772b,ecad2386,...,50,1899,0,431,100077,117,10,1,21,0
1,1,10004482643316086592,0,2014-10-21,1005,0,85f751fd,c4e18dd6,50e219e0,66a5f0f3,...,50,2434,3,163,100088,61,10,1,21,0


In [None]:
# hour column contains event date with all the details, extracting the same to create different columns.
test_i['hour'] = pd.to_datetime(test_i['hour'])
datecolumn=['hour']
for col in datecolumn:
    test_i['month'] = test_i[col].dt.month
    test_i['dayofweek'] = test_i[col].dt.dayofweek
    test_i['day'] = test_i[col].dt.day
    test_i['hour_time'] = test_i[col].dt.hour
test_i.head()

Unnamed: 0,id,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,...,C16,C17,C18,C19,C20,C21,month,dayofweek,day,hour_time
0,1.0000174058809264e+19,2014-10-31,1005,0,235ba823,f6ebf28e,f028772b,ecad2386,7801e8d9,07d7df22,...,50,761,3,175,100075,23,10,4,31,0
1,1.0000182526920856e+19,2014-10-31,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,...,50,2616,0,35,100083,51,10,4,31,0
2,1.0000554139829211e+19,2014-10-31,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,...,50,2616,0,35,100083,51,10,4,31,0
3,1.00010946378098e+19,2014-10-31,1005,0,85f751fd,c4e18dd6,50e219e0,51cedd4e,aefc06bd,0f2161f8,...,50,1092,3,809,100156,61,10,4,31,0
4,1.0001377041558671e+19,2014-10-31,1005,0,85f751fd,c4e18dd6,50e219e0,9c13b419,2347f47a,f95efa07,...,50,2667,0,47,-1,221,10,4,31,0


In [None]:
# List of columns for which outliers will be capped
col = ['C15', 'C16', 'C19', 'C21']

# Loop over each column
for col in col:
    # Calculate the 98th percentile of the column
    percentiles = train_i[col].quantile(0.98)

    # Check if the 98th percentile is less than half of the maximum value in the column
    if train_i[col].quantile(0.98) < 0.5 * train_i[col].max():
        # Cap outliers by setting values above the 98th percentile to the 98th percentile
        train_i[col][train_i[col] >= percentiles] = percentiles

# The above loop caps outliers for each column specified in the list 'col'


In [None]:
# Dealing with outliers by capping

col = ['C15', 'C16', 'C19', 'C21']
for col in col:
    percentiles = test_i[col].quantile(0.98)
    if test_i[col].quantile(0.98) < 0.5 * test_i[col].max():
        test_i[col][test_i[col] >= percentiles] = percentiles

In [None]:
# Remove the column 'Unnamed: 0' from the DataFrame
train_i = train_i.drop(['Unnamed: 0'], axis=1)

# Initialize lists to store numerical and categorical column names
numerical = []
categorical = []

# Loop over each column in the DataFrame
for col in (train_i.columns):
    # Check the data type of the column
    if train_i[col].dtype == "object":
        # If the data type is object, it's categorical
        categorical.append(col)
    else:
        # Otherwise, it's numerical
        numerical.append(col)

# Print the lists of numerical and categorical columns
print("numerical columns = ", numerical)
print("\ncategorical columns = ", categorical)


numerical columns =  ['id', 'click', 'hour', 'C1', 'banner_pos', 'device_type', 'device_conn_type', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21', 'month', 'dayofweek', 'day', 'hour_time']

categorical columns =  ['site_id', 'site_domain', 'site_category', 'app_id', 'app_domain', 'app_category', 'device_id', 'device_ip', 'device_model']


In [None]:
# Drop the 'id' column as it doesn't provide any significance
# Also, drop the 'hour' column as it has been derived into different columns
train_i.drop(['hour', 'id'], axis=1, inplace=True)

# Rename the 'click' column to 'y' (output)
# After dropping the 'hour' column, rename 'hour_time' to 'hour'
train_i.rename(columns={'click': 'y', 'hour_time': 'hour'}, inplace=True, errors='raise')

# For the test set, perform similar operations
test_i.drop(['hour', 'id'], axis=1, inplace=True)
test_i.rename(columns={'hour_time': 'hour'}, inplace=True, errors='raise')



In [None]:
# dataset is huge and running multiple algo will take time and resources might exhaust,
# hence taking only x% of the data for analysis.

sampled_data = train_i.sample(frac=0.01, random_state=42)
X = sampled_data.drop([ 'y'], axis = 1)
y = sampled_data['y']

In [None]:
cols = X.columns
# Create a list of all categorical columns in the dataframe
missing = (col for col in cols if col.endswith("_missing"))
categorical_features = categorical

In [None]:
# Calculate the number of unique values for each column
nunique = X.nunique()

# Get the data types of each column
types = X.dtypes

# Initialize lists and dictionaries to store information about categorical variables
categorical_columns = []
categorical_dims = {}

# Loop over each column in the DataFrame
for col in X.columns:
    # Check if the data type of the column is 'object' (indicating a categorical variable)
    if types[col] == 'object':
        # Print the column name and the number of unique values it has
        print(col, X[col].nunique())

        # Initialize a LabelEncoder for the current column
        l_enc = LabelEncoder()

        # Fill missing values with a placeholder (e.g., "VV_likely") and encode the column values
        X[col] = X[col].fillna("VV_likely")
        X[col] = l_enc.fit_transform(X[col].values)

        # Append the column name to the list of categorical columns
        categorical_columns.append(col)

        # Store the number of classes in the LabelEncoder for this column
        categorical_dims[col] = len(l_enc.classes_)

# The above loop encodes categorical variables and stores information about them


site_id 1137
site_domain 996
site_category 19
app_id 964
app_domain 74
app_category 20
device_id 8441
device_ip 41433
device_model 2567


In [None]:
nunique = test_i.nunique()
types = test_i.dtypes

categorical_columns = []
categorical_dims =  {}
for col in test_i.columns:
    if types[col] == 'object' :
        print(col, test_i[col].nunique())
        l_enc = LabelEncoder()
        test_i[col] = test_i[col].fillna("VV_likely")
        test_i[col] = l_enc.fit_transform(test_i[col].values)
        categorical_columns.append(col)
        categorical_dims[col] = len(l_enc.classes_)

In [None]:
# Define the target variable
target = "y"

# Create a list of all features excluding the target variable
features = [col for col in cols if col != target]

# Find the indices of all categorical features in the features list
categorical_idx = [idx for idx, feature in enumerate(features) if feature in categorical_features]

# Get the number of classes of each categorical feature
# The concatenation is required since there are some categorical features
# with some levels present only in either train or test data
categorical_dims = [
    pd.concat([X[col], test_i[col]]).nunique() for col in cols if col in categorical_features
]

# Print the indices of categorical features and the number of classes for each categorical feature
categorical_idx, categorical_dims


([2, 3, 4, 5, 6, 7, 8, 9, 10],
 [2825, 3366, 22, 3952, 201, 28, 291759, 1077199, 5438])

In [None]:
from sklearn.preprocessing import MinMaxScaler
numeric=[ 'hour', 'C1', 'banner_pos', 'device_type',
          'device_conn_type', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21', 'month', 'dayofweek', 'day']
scaler = MinMaxScaler()
X[numeric] = scaler.fit_transform(X[numeric])
X.head()

Unnamed: 0,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,device_id,device_ip,...,C16,C17,C18,C19,C20,C21,month,dayofweek,day,hour
3577888,0.36,0.14,271,875,3,897,32,0,5543,23043,...,0.13,0.91,0.0,0.0,0.0,0.87,0.0,0.17,0.78,0.43
4993932,0.36,0.0,600,779,5,413,56,15,3979,9003,...,0.13,0.96,0.67,0.0,1.0,0.09,0.0,0.5,1.0,1.0
4094900,0.36,0.14,995,195,3,897,32,0,5543,31552,...,0.13,0.8,0.0,0.5,0.0,0.19,0.0,0.33,0.89,0.17
4420497,0.36,0.14,1002,478,17,897,32,0,5543,37491,...,0.13,0.98,1.0,0.01,0.0,0.09,0.0,0.33,0.89,0.83
634465,0.36,0.14,412,327,17,897,32,0,5543,140,...,0.13,0.17,0.67,0.0,0.0,0.12,0.0,0.33,0.11,0.26


In [None]:
test_i[numeric] = scaler.transform(test_i[numeric])
test_i.head()

Unnamed: 0,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,device_id,device_ip,...,C16,C17,C18,C19,C20,C21,month,dayofweek,day,hour
0,-101.02,0.0,393,3234,20,3643,94,0,193125,445746,...,-0.09,-0.04,0.04,-0.02,0.0,-0.0,-30.0,0.0,-2.62,0.0
1,-101.02,0.0,355,3197,1,3643,94,0,193125,979645,...,-0.09,-0.04,0.0,-0.02,0.0,-0.0,-30.0,0.0,-2.62,0.0
2,-101.02,0.0,355,3197,1,3643,94,0,193125,71243,...,-0.09,-0.04,0.0,-0.02,0.0,-0.0,-30.0,0.0,-2.62,0.0
3,-101.02,0.0,1491,2590,5,1224,137,3,193125,278115,...,-0.09,-0.04,0.04,-0.02,0.0,-0.0,-30.0,0.0,-2.62,0.0
4,-101.02,0.0,1491,2590,5,2403,21,26,193125,31686,...,-0.09,-0.04,0.0,-0.02,0.0,-0.0,-30.0,0.0,-2.62,0.0


In [None]:
!pip install optuna
!pip install pytorch-tabnet



In [None]:
# Import libraries for Tabnet
#!pip install pytorch-tabnet
import torch
from torch.optim.lr_scheduler import ReduceLROnPlateau
from pytorch_tabnet.pretraining import TabNetPretrainer
from pytorch_tabnet.tab_model import TabNetClassifier

# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
import os
from copy import deepcopy
from functools import partial
# Import sklearn classes for model selection, cross validation, and performance evaluation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, KFold, GroupKFold, GroupShuffleSplit, RepeatedStratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.preprocessing import StandardScaler
import seaborn as sns

# Import libraries for Hypertuning
import optuna
optuna.logging.set_verbosity(optuna.logging.ERROR)

In [None]:
def seed_everything(seed):
    """
    Set the random seeds to ensure reproducibility across different runs.

    Args:
    - seed (int): The seed value to use for random number generation.
    """
    random.seed(seed)  # Set seed for Python's built-in random module
    os.environ['PYTHONHASHSEED'] = str(seed)  # Set seed for hash seed used by Python
    np.random.seed(seed)  # Set seed for NumPy
    torch.manual_seed(seed)  # Set seed for PyTorch's CPU random generator
    torch.cuda.manual_seed(seed)  # Set seed for PyTorch's CUDA random generator(s)
    torch.backends.cudnn.deterministic = True  # Ensure deterministic behavior for CuDNN

# Set the seed value
seed = 42

# Call the seed_everything function to set the seeds
seed_everything(seed)


In [None]:
pretrainer = TabNetPretrainer(
    cat_idxs=categorical_idx,
    cat_dims=categorical_dims,
    verbose=1,
    cat_emb_dim=1,
    n_d=10,
    n_a=10,
    n_steps=2,
    lambda_sparse=1e-5,
    n_shared=3,
    gamma =1.5,
    mask_type='entmax',
    n_shared_decoder=1,
    n_indep_decoder=1

)

In [None]:
max_epochs = 10 # 500
fit_params = {
    "X_train": X.values,
    "eval_set": [test_i.values],
    "max_epochs": max_epochs,
}

pretrainer.fit(**fit_params)

In [None]:
# Predict embeddings and reconstruction for the training data
reconstructed_X, embedded_X = pretrainer.predict(X.values)

# Predict embeddings and reconstruction for the test data
reconstructed_X1, embedded_X1 = pretrainer.predict(test_i.values)




In [None]:
# Save the pretrainer model
pretrainer.save_model('./model_pretrain')

In [None]:
# Convert the embeddings to DataFrame and save them to a CSV file
embedded_X = pd.DataFrame(embedded_X)
reconstructed_X = pd.DataFrame(reconstructed_X)
embedded_X.to_csv('/embedded_X.csv')
reconstructed_X.to_csv('/reconstructed_X.csv')

# Convert the embeddings for the test data to DataFrame and save them to a CSV file
embedded_X1 = pd.DataFrame(embedded_X1)
reconstructed_X1 = pd.DataFrame(reconstructed_X1)
embedded_X1.to_csv('/embedded_X1.csv')
reconstructed_X1.to_csv('/reconstructed_X1.csv')
