In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.decomposition import PCA
from catboost import CatBoostClassifier
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Target column for this run; every later cell reads the label from this variable.
selected_label = "label_4"

## Loading the data.

In [None]:
# Read the three provided splits in one pass; the reads are independent of each other.
train_data, valid_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/dataset-layer-12/train.csv',
        '/kaggle/input/dataset-layer-12/valid.csv',
        '/kaggle/input/dataset-layer-12/test.csv',
    )
)
train_data.head()

In [None]:
# (rows, columns) of the training frame as loaded, before any rows are dropped.
train_data.shape

### Checking the null values in the train dataset.

In [None]:
# Per-column count of missing values in the training frame.
train_data.isna().sum()

### Dropping the null valued rows.

In [None]:
# Drop only the rows that are unusable for the selected target: rows with a
# missing feature value or a missing `selected_label`. Nulls in the *other*
# label columns are ignored, because those columns are discarded before
# training; a bare dropna() would throw away valid samples for this label.
unused_labels = [
    label for label in ('label_1', 'label_2', 'label_3', 'label_4')
    if label != selected_label
]
required_columns = train_data.columns.difference(unused_labels)
train_data = train_data.dropna(subset=required_columns)

# The unused label columns may still report nulls here; that is expected.
train_data.isnull().sum()

In [None]:
# (rows, columns) of the training frame after the null-dropping step above.
train_data.shape

In [None]:
# The test split carries an 'ID' column that is not a model input; remove it.
test_data = test_data.drop(columns=['ID'])
test_data.head()

### Checking the null values in the valid dataset.

In [None]:
# Per-column count of missing values in the validation frame.
valid_data.isna().sum()

### Dropping the rows with null values in the valid dataset.

In [None]:
# Drop only the validation rows that cannot be scored for the selected target:
# rows with a missing feature value or a missing `selected_label`. Nulls in the
# other label columns are ignored, since those columns are not used when
# evaluating this label; a bare dropna() would shrink the validation set
# for no reason.
unused_labels = [
    label for label in ('label_1', 'label_2', 'label_3', 'label_4')
    if label != selected_label
]
required_columns = valid_data.columns.difference(unused_labels)
valid_data = valid_data.dropna(subset=required_columns)

# The unused label columns may still report nulls here; that is expected.
valid_data.isnull().sum()

In [None]:
# Separate the validation target from the validation inputs.
# NOTE(review): assumes the first 768 columns are exactly the feature columns — confirm.
N_FEATURE_COLUMNS = 768
y_test = valid_data.loc[:, selected_label]
valid_data = valid_data.iloc[:, :N_FEATURE_COLUMNS]
valid_data.head()

## Prepare the training features and labels.

In [None]:
# Model inputs: every column of the training frame except the four label columns.
label_columns = ['label_1', 'label_2', 'label_3', 'label_4']
X_train = train_data.drop(columns=label_columns)
X_train.head()

In [None]:
# Training target: the single label column chosen via `selected_label`.
y_train = train_data.loc[:, selected_label]
y_train

### Grid search to find the best parameters for the SVC model.

In [None]:
# Disabled experiment: successive-halving grid search over the RBF-SVC
# regularisation parameter C (integers 1..100) with 5-fold cross-validation.
# C = [i for i in range(1,101)]
# # Creating the model.
# svc_model = SVC(kernel='rbf')

# param_grid = {
#     'C':C
# }
# grid = HalvingGridSearchCV(svc_model, param_grid, cv=5, verbose=1)
# grid.fit(X_train, y_train)
# print(grid.best_params_)

The grid search did not achieve the needed accuracy.

## Splitting the train data set to generate a valid set.
This is a test for issues in the model, since for some labels the model gives very low accuracy on the provided valid dataset.

In [None]:
# Disabled diagnostic: if enabled, this replaces valid_data / y_test with a 25%
# hold-out carved from the training data (and shrinks X_train / y_train to match).
# X_train , valid_data, y_train, y_test = train_test_split(X_train, y_train, test_size=0.25, random_state=28)

## Scale the data set.

In [None]:
# Standardise the features. The scaler is fitted on the training data ONLY and
# the same training mean/variance is then applied to the validation and test
# sets. Re-fitting on each split (fit_transform) would standardise every split
# with its own statistics, so the model would be evaluated on features that
# live in a different space from the one it was trained on.
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)

# transform (not fit_transform): reuse the statistics learned from X_train.
# NOTE(review): assumes valid_data and test_data have the same feature columns,
# in the same order, as X_train — confirm.
valid_data = scaler.transform(valid_data)
test_data = scaler.transform(test_data)

## Applying PCA for the features.

In [None]:
# PCA configured with a fractional n_components: with svd_solver='full' a value
# in (0, 1) is the share of variance to retain (here 99%), not a component count.
pca = PCA(n_components=0.99, svd_solver='full')

# Learn the projection from the training features, then project those same features.
X_train_pca = pca.fit(X_train).transform(X_train)
X_train_pca.shape

In [None]:
# Disabled experiment: successive-halving grid search over CatBoost
# hyperparameters (learning rate, depth, L2 regularisation, iterations)
# on the PCA-reduced training features with 5-fold cross-validation.
# model = CatBoostClassifier()
# param_grid = {
#     'learning_rate': [0.01, 0.1, 0.2],
#     'depth': [4, 6, 8],
#     'l2_leaf_reg': [1, 3, 5],
#     'iterations': [100, 200, 300]
# }

# grid = HalvingGridSearchCV(model, param_grid, cv=5, verbose=1)
# grid.fit(X_train_pca, y_train)
# print(grid.best_params_)

In [None]:
# Project the validation and test features with the PCA already fitted on X_train.
valid_data_pca, test_data_pca = (
    pca.transform(features) for features in (valid_data, test_data)
)

In [None]:
# Hyperparameters for the final classifier; task_type='GPU' requests GPU training.
catboost_params = dict(
    learning_rate=0.1,
    depth=7,
    l2_leaf_reg=3,
    iterations=200,
    task_type='GPU',
)
model = CatBoostClassifier(**catboost_params)
model.fit(X_train_pca, y_train)

## Make predictions on the valid dataset.

In [None]:
# Predicted labels for the PCA-projected validation features.
y_pred = model.predict(valid_data_pca)

## Evaluate the Model.

In [None]:
# Share of validation rows whose predicted label equals the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy:.2f}')

## Accuracy summary.
This is for pca using n_components=0.95
* label_1 = 0.14
* label_2 = 0.15
* label_3 = 0.83
* label_4 = 0.66

pca using n_components=0.95 and svd_solver=full
* label_1 = 0.12
* label_2 = 0.15
* label_3 = 0.83
* label_4 = 0.66

For the split data
* label_1 = 0.76

Increased the iterations and depth
* label_1 = 0.84
* label_2 = 0.71
* label_3 = 0.98
* label_4 = 0.81

## Testing the model.

In [None]:
# (rows, columns) of the test features; test_data was reassigned by the scaling cell.
test_data.shape

In [None]:
# Disabled: test_data now holds the scaler's output rather than the loaded frame;
# presumably a NumPy array with no .head() — verify before re-enabling.
# test_data.head()

In [None]:
# Predict on the PCA-projected test features and collect the result in a
# single-column frame named after the selected label (0-based integer index).
y_pred_test = model.predict(test_data_pca)
output_df = pd.DataFrame(index=pd.RangeIndex(len(y_pred_test)))
output_df[selected_label] = y_pred_test
output_df

In [None]:
# Persist the predictions as <selected_label>.csv in the Kaggle working directory,
# without the index column.
filename = selected_label + '.csv'
output_df.to_csv(os.path.join('/kaggle/working/', filename), index=False)

## Run the below code after creating the label_i.csv files.

In [None]:
# Disabled post-processing step: reads the four per-label CSV files (by relative
# path), copies each label column as int into one frame, adds a 1-based 'ID'
# index and writes merged_output.csv. Enable only once all label_i.csv files exist.
# labels = ['label_1', 'label_2', 'label_3', 'label_4']
# merged_data = pd.DataFrame(columns = labels)



# for label_i in labels:
#     file_name = label_i + '.csv'
#     df = pd.read_csv(file_name)
    
    
#     merged_data[label_i] = df[label_i].astype(int)
    
#     print(label_i, file_name)

# merged_data.index += 1
# merged_data.index.name = 'ID'
# merged_data.to_csv('/kaggle/working/merged_output.csv', index=True)
# print('created the merged csv')
# merged_data.head()