In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.utils import resample

In [6]:
# Load the data
train_data = pd.read_csv('train (1).csv')
test_data = pd.read_csv('test (2).csv')

In [7]:
# Save the 'Id' column from the test data
test_ids = test_data['Id']

X = train_data.drop('Class', axis=1)
y = train_data['Class']

In [8]:
# Define numerical and categorical columns
numerical_cols = X.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

In [25]:
# Preprocessing pipeline for numerical columns
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Preprocessing pipeline for categorical columns
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine the preprocessing steps using ColumnTransformer
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols)
])

In [26]:
# Transform the data using the preprocessor
X_processed = preprocessor.fit_transform(X)

In [27]:
# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_processed, y, test_size=0.2, random_state=42)

In [45]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score


In [46]:
# Assuming you already have X_train and y_train
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [48]:
param_distributions = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'min_child_weight': [1, 3, 5],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0]
}


In [49]:
xgb_model = XGBClassifier(random_state=42)

random_search = RandomizedSearchCV(
    xgb_model,
    param_distributions=param_distributions,
    n_iter=10,
    cv=5,
    scoring='accuracy',
    random_state=42,
    n_jobs=-1
)

random_search.fit(X_train, y_train)

print("Best parameters found: ", random_search.best_params_)


ValueError: 
All the 50 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
50 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/home/Library/Python/3.9/lib/python/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/xgboost/sklearn.py", line 814, in fit
    train_dmatrix = DMatrix(X, label=training_labels, weight=sample_weight,
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/xgboost/core.py", line 519, in __init__
    data, feature_names, feature_types = _convert_dataframes(
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/xgboost/core.py", line 416, in _convert_dataframes
    data, feature_names, feature_types = _maybe_pandas_data(data,
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/xgboost/core.py", line 294, in _maybe_pandas_data
    raise ValueError(msg + ', '.join(bad_fields))
ValueError: DataFrame.dtypes for data must be int, float or bool.
                Did not expect the data types in fields Artist Name, Track Name


In [28]:
# Convert sparse matrix to dense arrays for upsampling
X_train_dense = X_train.toarray()

In [29]:
# Upsample the minority class if feasible
unique_classes = y_train.unique()
min_samples_per_class = min(y_train.value_counts())
X_train_resampled = []
y_train_resampled = []

for class_label in unique_classes:
    X_class = X_train_dense[y_train == class_label]
    if len(X_class) < min_samples_per_class:
        X_class_resampled, _ = resample(X_class, n_samples=min_samples_per_class, replace=True, random_state=42)
        X_train_resampled.append(X_class_resampled)
        y_train_resampled.extend([class_label] * min_samples_per_class)
    else:
        X_train_resampled.append(X_class)
        y_train_resampled.extend([class_label] * len(X_class))

X_train_resampled = np.vstack(X_train_resampled)
y_train_resampled = np.array(y_train_resampled)

In [30]:
# Initialize the model (Gradient Boosting Classifier)
model = GradientBoostingClassifier(random_state=42)

In [51]:
# Reduced search space
param_distributions = {
    'n_estimators': [100, 150],
    'max_depth': [3, 5],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

random_search = RandomizedSearchCV(model, param_distributions, n_iter=10, cv=5, scoring='accuracy', random_state=42, n_jobs=-1)
random_search.fit(X_train_resampled, y_train_resampled)

print("Best parameters found: ", random_search.best_params_)

KeyboardInterrupt: 

In [32]:
# Fit the best model on the validation set
best_model = random_search.best_estimator_
best_model.fit(X_train_resampled, y_train_resampled)

AttributeError: 'RandomizedSearchCV' object has no attribute 'best_estimator_'

In [None]:
# Make predictions on the validation set
y_val_pred = best_model.predict(X_val)

In [None]:
# Evaluate the model
accuracy = accuracy_score(y_val, y_val_pred)
print("Cross-validated accuracy:", accuracy)