In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/speech-based-classification-layer-10/valid.csv
/kaggle/input/speech-based-classification-layer-10/train.csv
/kaggle/input/speech-based-classification-layer-10/test.csv


In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold,cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.utils import class_weight
from sklearn.feature_selection import SequentialFeatureSelector

In [3]:
train_df = pd.read_csv("/kaggle/input/speech-based-classification-layer-10/train.csv")
valid_df = pd.read_csv("/kaggle/input/speech-based-classification-layer-10/valid.csv")
test_df = pd.read_csv("/kaggle/input/speech-based-classification-layer-10/test.csv")

In [4]:
LABELS = ["label_1","label_2","label_3","label_4"]
models = {}
test_predictions = {}
random_searches = {}
pcas = {}
k_bests = {}

In [5]:
# Loop through each label for classification
for label in LABELS:
    print(f"Processing label: {label}")

    # Data Pre-processing
    # Separate features and labels
    train_X = train_df.dropna(subset=[label]).drop(LABELS, axis=1)
    train_y = train_df.dropna(subset=[label])[label].astype(int)
    valid_X = valid_df.dropna(subset=[label]).drop(LABELS, axis=1)
    valid_y = valid_df.dropna(subset=[label])[label].astype(int)

    # Compute class weights for handling class imbalance
    class_weights = class_weight.compute_class_weight('balanced', classes=np.unique(train_y), y=train_y)
    class_weight_dict = {cls: weight for cls, weight in zip(np.unique(train_y), class_weights)}

    # Feature Scaling
    scaler = StandardScaler()
    train_X_scaled = scaler.fit_transform(train_X)
    valid_X_scaled = scaler.transform(valid_X)

    # Dimensionality Reduction with PCA
    pca = PCA(n_components=0.99, svd_solver='full')
    train_X_pca = pca.fit_transform(train_X_scaled)
    valid_X_pca = pca.transform(valid_X_scaled)

    k_best = SelectKBest(score_func=f_classif, k=300)
    train_X_selected = k_best.fit_transform(train_X_pca, train_y)
    valid_X_selected = k_best.transform(valid_X_pca)

#     sfs = SequentialFeatureSelector(
#         RandomForestClassifier(class_weight=class_weight_dict),
#         scoring='accuracy',
#         cv=3
#     )

#     # Fit the SequentialFeatureSelector on training data
#     sfs.fit(train_X_pca, train_y)

#     # Get the selected feature indices
#     selected_feature_indices = list(sfs.k_feature_idx_)

#     # Transform the data to include only selected features
#     train_X_selected = train_X_pca[:, selected_feature_indices]
#     valid_X_selected = valid_X_pca[:, selected_feature_indices]

    if label == LABELS[2]:
        model= LogisticRegression(class_weight=class_weight_dict)
        param_grid = {
            'penalty': ['l1', 'l2'],
            'C': [100, 10, 1.0, 0.1, 0.01],
            'solver': ['newton-cg', 'lbfgs', 'liblinear']
        }

    else:
        model= SVC(class_weight=class_weight_dict)
        param_grid = {
                'C':  [50, 10, 1.0, 0.1, 0.01],
                'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
                'gamma': ['scale', 'auto']
            }
    random_search = RandomizedSearchCV(
    estimator=model, param_distributions=param_grid,
    n_iter=20, cv=3, verbose=2, random_state=42, n_jobs=-1)


    random_search.fit(train_X_selected,train_y)
    best_model_estimater = random_search.best_estimator_
    best_params = random_search.best_params_


    print(f"{label} with param : {best_params}")

    # Evaluate the model on the validation set
    valid_pred = best_model_estimater.predict(valid_X_selected)
    accuracy = accuracy_score(valid_y, valid_pred)
    print(f"Validation Accuracy for {label} : {accuracy}")

    # Store the trained model in the dictionary
    models[label] = best_model_estimater
    random_searches[label] = random_search
    pcas[label] = pca
    k_bests[label] = k_best


    test_X = test_df.drop(columns=["ID"])
    test_X_scaled = scaler.transform(test_X)
    test_X_pca = pca.transform(test_X_scaled)
#     test_X_selected = test_X_pca[:, selected_feature_indices]
    test_X_selected = k_best.transform(test_X_pca)

    test_predictions[label] = best_model_estimater.predict(test_X_selected)



Processing label: label_1
Fitting 3 folds for each of 20 candidates, totalling 60 fits
label_1 with param : {'kernel': 'linear', 'gamma': 'scale', 'C': 1.0}
Validation Accuracy for label_1 : 0.9586666666666667
Processing label: label_2
Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] END .................C=1.0, gamma=scale, kernel=sigmoid; total time= 1.9min
[CV] END ..................C=1.0, gamma=scale, kernel=linear; total time= 1.3min
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time= 4.0min
[CV] END ....................C=10, gamma=auto, kernel=linear; total time= 1.3min
[CV] END ....................C=0.01, gamma=auto, kernel=poly; total time= 3.4min
[CV] END .................C=0.1, gamma=scale, kernel=sigmoid; total time= 3.1min
[CV] END .......................C=50, gamma=auto, kernel=rbf; total time= 4.1min
[CV] END ....................C=0.1, gamma=scale, kernel=poly; total time= 3.4min
[CV] END .....................C=10, gamma=scale, kernel

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

label_3 with param : {'solver': 'lbfgs', 'penalty': 'l2', 'C': 0.1}
Validation Accuracy for label_3 : 0.992
Processing label: label_4
Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] END ...............C=0.01, penalty=l2, solver=newton-cg; total time=   4.2s
[CV] END ................C=0.1, penalty=l2, solver=liblinear; total time=   8.2s
[CV] END ................C=1.0, penalty=l2, solver=liblinear; total time=  10.3s
[CV] END .................C=10, penalty=l1, solver=liblinear; total time=  18.9s
[CV] END ...................C=0.01, penalty=l2, solver=lbfgs; total time=   1.6s
[CV] END ...................C=0.01, penalty=l2, solver=lbfgs; total time=   1.6s
[CV] END ....................C=100, penalty=l2, solver=lbfgs; total time=   1.5s
[CV] END ....................C=100, penalty=l2, solver=lbfgs; total time=   1.5s
[CV] END ....................C=1.0, penalty=l2, solver=lbfgs; total time=   1.7s
[CV] END ................C=100, penalty=l2, solver=liblinear; total time=  2

In [6]:
# Create a DataFrame with the predictions
submission_df = pd.DataFrame(test_predictions)

# Save the DataFrame to a CSV file with the modified index
submission_df.index += 1
submission_df.index.name = "ID"
submission_df.to_csv("submission.csv")