### Importing Libraries

In [None]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, cross_val_score
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from google.colab import files

### Loading and Reading Data on Colab

In [None]:
# Ask the user for both CSVs through Colab's upload helper; the returned
# mapping is kept in `uploaded` but the files are read back by name below.
print("Upload train.csv and test.csv files")
uploaded = files.upload()

# Read the two splits in one pass, in (train, test) order.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

# Quick sanity check of row/column counts for each split.
print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")

### Feature and Target Extraction

In [None]:
# Positional layout relied on here:
#   train: [id, feature columns..., target]
#   test:  [id, feature columns...]
train_id, test_id = train_df.iloc[:, 0], test_df.iloc[:, 0]

# Target is the last training column; features are everything between id and target.
y_train = train_df.iloc[:, -1].values
X_train = train_df.iloc[:, 1:-1].values
X_test = test_df.iloc[:, 1:].values  # test has no target column, so keep all non-id columns

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
# bincount needs non-negative integers, hence the cast before counting labels.
print(f"Target distribution: {np.bincount(y_train.astype(int))}")

### Median Imputation

In [None]:
# Wrap the feature arrays in DataFrames so missing values can be counted and
# filled column by column.
X_train_df = pd.DataFrame(X_train)
X_test_df = pd.DataFrame(X_test)

# Total number of missing cells in each split.
train_missing = X_train_df.isnull().sum().sum()
test_missing = X_test_df.isnull().sum().sum()

if train_missing > 0 or test_missing > 0:
    # Medians are computed from the training split only and reused for the
    # test split, so no test-set statistics enter the preprocessing.
    train_medians = {col: np.nanmedian(X_train_df[col]) for col in X_train_df.columns}

    # Reassign the result instead of `df[col].fillna(..., inplace=True)`:
    # that chained in-place form operates on a temporary column object and
    # leaves the frame unchanged when pandas Copy-on-Write is active.
    X_train_df = X_train_df.fillna(train_medians)
    X_test_df = X_test_df.fillna(train_medians)

# Hand plain arrays back to the rest of the pipeline.
X_train = X_train_df.values
X_test = X_test_df.values

### Removing NaN and Inf Values and Clipping Outliers

In [None]:
# Make every entry finite: NaN becomes 0.0 and +/-inf become the float32 extremes.
float32_limits = np.finfo(np.float32)
finite_fill = dict(nan=0.0, posinf=float32_limits.max, neginf=float32_limits.min)
X_train = np.nan_to_num(X_train, **finite_fill)
X_test = np.nan_to_num(X_test, **finite_fill)

# Clip each feature to the 0.1-99.9 percentile band of its training column.
# The same training-derived bounds are applied to the test column.
for feature_idx in range(X_train.shape[1]):
    train_column = X_train[:, feature_idx]
    upper_bound = np.percentile(train_column, 99.9)
    lower_bound = np.percentile(train_column, 0.1)

    X_train[:, feature_idx] = np.clip(train_column, lower_bound, upper_bound)
    X_test[:, feature_idx] = np.clip(X_test[:, feature_idx], lower_bound, upper_bound)

### Scaling Features

In [None]:
# Learn per-feature mean/variance from the training matrix only, then apply
# that same fitted transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)

### Feature Engineering

In [None]:
# Per-row summary statistics over the scaled features. keepdims=True keeps each
# result as an (n_rows, 1) column so it can be appended to the feature matrix.
X_train_mean = np.mean(X_train_scaled, axis=1, keepdims=True)
X_train_std = np.std(X_train_scaled, axis=1, keepdims=True)
X_train_max = np.max(X_train_scaled, axis=1, keepdims=True)
X_train_min = np.min(X_train_scaled, axis=1, keepdims=True)

# Same four statistics for the test rows.
X_test_mean = np.mean(X_test_scaled, axis=1, keepdims=True)
X_test_std = np.std(X_test_scaled, axis=1, keepdims=True)
X_test_max = np.max(X_test_scaled, axis=1, keepdims=True)
X_test_min = np.min(X_test_scaled, axis=1, keepdims=True)

# Final matrices: scaled features followed by mean, std, max, min (in that order).
X_train_final = np.concatenate(
    [X_train_scaled, X_train_mean, X_train_std, X_train_max, X_train_min], axis=1
)
X_test_final = np.concatenate(
    [X_test_scaled, X_test_mean, X_test_std, X_test_max, X_test_min], axis=1
)

### Handling Imbalanced Data

In [None]:
# np.unique returns labels in ascending order, so counts[0] belongs to the
# smallest label and counts[1] to the next one.
# Assumes a binary 0/1 target (0 = negative, 1 = positive) - TODO confirm.
unique, counts = np.unique(y_train, return_counts=True)
class_ratio = counts[0] / counts[1] if len(counts) > 1 else 1

print(f"Class distribution: {dict(zip(unique, counts))}")
print(f"Class ratio: {class_ratio:.2f}")

# scale_pos_weight multiplies the weight of the positive class, and the usual
# setting is negatives / positives, i.e. class_ratio as computed above.
# Use it as-is: inverting the ratio when it is below 1 would give extra weight
# to positives exactly when they are already the majority class.
scale_pos_weight = class_ratio

### Choosing XGBoost Model

In [None]:
print("\nTraining XGBoost Model...")

# Hyperparameters gathered in one mapping so they can be inspected or tweaked
# before the classifier is constructed.
xgb_params = {
    'n_estimators': 1000,                  # number of boosted trees
    'max_depth': 6,                        # maximum depth of each tree
    'learning_rate': 0.01,                 # shrinkage applied per boosting step
    'subsample': 0.8,                      # fraction of rows sampled per tree
    'colsample_bytree': 0.8,               # fraction of columns sampled per tree
    'min_child_weight': 1,                 # minimum sum of instance weight in a child
    'gamma': 0,                            # no minimum loss reduction required to split
    'reg_alpha': 0.01,                     # L1 regularization
    'reg_lambda': 1,                       # L2 regularization
    'scale_pos_weight': scale_pos_weight,  # class-imbalance weight computed earlier
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    # NOTE(review): recent XGBoost releases appear to ignore this flag - confirm
    # against the installed version before removing it.
    'use_label_encoder': False,
    'random_state': 42,
    'n_jobs': -1,
    'tree_method': 'auto',
}

xgb_model = XGBClassifier(**xgb_params)

### 5-Fold Cross Validation

In [None]:
print("\nPerforming 5-fold cross-validation...")

# Stratified, shuffled folds with a fixed seed so the split is reproducible.
cv_splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# NOTE: clipping bounds and the scaler were fitted on the full training set
# before this step, so these scores may be slightly optimistic.
cv_scores = cross_val_score(
    xgb_model,
    X_train_final,
    y_train,
    cv=cv_splitter,
    scoring='roc_auc',
    n_jobs=-1,
)

# Report the result; previously the scores were computed and never shown.
print(f"Fold ROC-AUC scores: {np.round(cv_scores, 4)}")
print(f"Mean ROC-AUC: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")

### Training

In [None]:
# Fit the classifier on the full engineered training matrix; this fitted
# instance is the one used for the test-set predictions below.
xgb_model.fit(X_train_final, y_train)

### Making Predictions

In [None]:
# predict_proba returns one column per class; column 1 is the probability of
# the positive class. (`predict_prob` is not an XGBClassifier method and
# raised AttributeError.)
y_pred_prob = xgb_model.predict_proba(X_test_final)[:, 1]

# Convert probabilities to hard 0/1 labels at the decision threshold.
threshold = 0.5
y_pred = (y_pred_prob >= threshold).astype(int)

### Top Feature Importances

In [None]:
# Importance score per column of X_train_final; the last four columns are the
# appended row statistics (mean, std, max, min).
feature_importance = xgb_model.feature_importances_

# Reverse the ascending argsort and keep the first ten: the ten most important
# feature indices, highest first.
top_features_idx = np.argsort(feature_importance)[::-1][:10]

for idx in top_features_idx:
    print(f"Feature {idx}: {feature_importance[idx]:.4f}")

### Output

In [None]:
output_file = 'song_popularity_predictions.csv'

# Two-column submission: integer id alongside the thresholded 0/1 prediction.
submission_columns = {'id': test_id.astype(int), 'song_popularity': y_pred}
submission = pd.DataFrame(submission_columns)

# Write without the DataFrame index, then hand the file to the browser via Colab.
submission.to_csv(output_file, index=False)
files.download(output_file)

### Probability Output

In [None]:
submission_probability = 'submission_probability.csv'

# Same layout as the label submission, but with the raw positive-class probability.
probability_columns = {'id': test_id.astype(int), 'song_popularity': y_pred_prob}
output_probability = pd.DataFrame(probability_columns)

# Write without the DataFrame index, then trigger the Colab download.
output_probability.to_csv(submission_probability, index=False)
files.download(submission_probability)