<a href="https://colab.research.google.com/github/Kusumash28/my-project/blob/main/Kaggle_project(steelplates).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#  Step 1: Import necessary libraries
import numpy as np
import pandas as pd
from google.colab import files
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import StandardScaler

#  Step 2: Load datasets
# Read the three CSV files under /content in one pass; the order of the path
# tuple decides which name each frame is bound to.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/train.csv",
        "/content/test.csv",
        "/content/sample_submission.csv",
    )
)

#  Step 3: Define features and targets
# Label columns to predict. NOTE(review): "K_Scatch" presumably mirrors the CSV
# header spelling (the recorded validation report resolves it) - verify before renaming.
target_cols = ["Pastry", "Z_Scratch", "K_Scatch", "Stains", "Dirtiness", "Bumps", "Other_Faults"]

# Every training column that is neither the row id nor a label is a model input.
non_feature_cols = {'id', *target_cols}
feature_cols = [col for col in train.columns if col not in non_feature_cols]

X, y = train[feature_cols], train[target_cols]
# Slice the test frame with the same list so its columns line up with X.
X_test = test[feature_cols]

#  Step 4: Split for validation
# Split BEFORE scaling: fitting the scaler on all rows would let the mean/std of the
# validation rows leak into preprocessing and make the validation scores optimistic.
X_train_raw, X_val_raw, y_train_split, y_val_split = train_test_split(
    X, y, test_size=0.2, random_state=42
)

#  Step 5: Preprocessing (Scaling)
# Statistics are learned from the training rows only and then re-used, unchanged,
# for the validation rows and the test set.
scaler = StandardScaler()
X_train_split = scaler.fit_transform(X_train_raw)
X_val_split = scaler.transform(X_val_raw)
X_test_scaled = scaler.transform(X_test)

# Full training matrix under the same transform, kept so that any code expecting
# `X_scaled` to exist after this step continues to work.
X_scaled = scaler.transform(X)

#  Step 6: Train Multi-label Classifier
# One random forest (100 trees, fixed seed) is used as the template estimator;
# MultiOutputClassifier wraps it so a single fit() handles every column of y_train_split.
forest_template = RandomForestClassifier(n_estimators=100, random_state=42)
clf = MultiOutputClassifier(forest_template)
clf.fit(X_train_split, y_train_split)

#  Step 7: Evaluate on validation set
# clf.predict yields one column of hard labels per entry of target_cols, in that order.
y_val_pred = clf.predict(X_val_split)
val_accuracy = np.mean([accuracy_score(y_val_split[col], y_val_pred[:, idx])
                        for idx, col in enumerate(target_cols)])
print(f"\n🔍 Validation Accuracy (Average across labels): {val_accuracy:.4f}\n")

# Accuracy is dominated by the majority class when positives are rare, and the submission
# is built from probabilities rather than hard labels. Score the positive-class
# probabilities with ROC AUC too, so that what gets submitted is also what gets validated.
y_val_proba = clf.predict_proba(X_val_split)  # list: one (n_rows, n_classes) array per label
val_auc_scores = {}
for idx, col in enumerate(target_cols):
    seen_classes = list(clf.estimators_[idx].classes_)
    if 1 not in seen_classes or y_val_split[col].nunique() < 2:
        # No positive-class column to score, or ROC AUC is undefined because the
        # validation labels contain a single class: skip instead of crashing.
        continue
    positive_column = seen_classes.index(1)
    val_auc_scores[col] = roc_auc_score(y_val_split[col], y_val_proba[idx][:, positive_column])

if val_auc_scores:
    mean_val_auc = np.mean(list(val_auc_scores.values()))
    print(f"Validation ROC AUC (Average across labels): {mean_val_auc:.4f}\n")

# Optional: Print classification report for each label
for idx, col in enumerate(target_cols):
    print(f"--- {col} ---")
    if col in val_auc_scores:
        print(f"ROC AUC: {val_auc_scores[col]:.4f}")
    print(classification_report(y_val_split[col], y_val_pred[:, idx]))

#  Step 8: Predict probabilities on test set
# predict_proba returns one array per label (indexed in target_cols order below).
y_test_proba = clf.predict_proba(X_test_scaled)

# Keep only column 1 of each array (probability of class 1) and assemble the
# submission frame in one go: 'id' first, then the labels in target_cols order.
class_one_proba = {col: y_test_proba[i][:, 1] for i, col in enumerate(target_cols)}
submission = pd.DataFrame({'id': test['id'], **class_one_proba})

#  Step 9: Save submission file
submission_path = "submission.csv"
# index=False keeps the DataFrame's row index out of the written CSV.
submission.to_csv(submission_path, index=False)
print("\n✅ Submission file 'submission.csv' is ready.")

#  Step 10: Download submission file
files.download(submission_path)



🔍 Validation Accuracy (Average across labels): 0.8924

--- Pastry ---
              precision    recall  f1-score   support

           0       0.92      1.00      0.96      3543
           1       0.61      0.05      0.09       301

    accuracy                           0.92      3844
   macro avg       0.77      0.52      0.52      3844
weighted avg       0.90      0.92      0.89      3844

--- Z_Scratch ---
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      3624
           1       0.69      0.44      0.54       220

    accuracy                           0.96      3844
   macro avg       0.83      0.71      0.76      3844
weighted avg       0.95      0.96      0.95      3844

--- K_Scatch ---
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      3162
           1       0.91      0.90      0.90       682

    accuracy                           0.97      3844
   macro avg       0.9

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Install LightGBM if not already installed
!pip install lightgbm --quiet

# Step 1: Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
import lightgbm as lgb

In [None]:
from google.colab import files  # Upload train.csv and test.csv manually in runtime

# Step 3: Read the datasets
# Both CSVs are parsed with default read_csv options; tuple order fixes the binding.
train_df, test_df = map(pd.read_csv, ('/content/train.csv', '/content/test.csv'))

In [None]:
# Step 4: Set target columns and features
target_columns = ['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']

# Inputs are all training columns except the labels and the row identifier.
non_feature_columns = set(target_columns) | {'id'}
feature_columns = [col for col in train_df.columns if col not in non_feature_columns]

X, y = train_df[feature_columns], train_df[target_columns]
# The test ids are kept aside so they can head the prediction table later.
X_test, ids = test_df[feature_columns], test_df['id']

# Optional: Scale features
# The scaler is fitted on the training features and the same fitted transform is
# then applied to both the training and the test features.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
X_test_scaled = scaler.transform(X_test)

In [None]:
# `predictions` gains one column of test-set scores per defect;
# `auc_scores` maps each defect to the AUC measured on its 20% hold-out.
predictions = pd.DataFrame({'id': ids})
auc_scores = {}

# The hyper-parameters do not depend on the defect, so the dict is built once.
params = dict(
    objective='binary',
    metric='auc',
    verbosity=-1,
    boosting_type='gbdt',
    learning_rate=0.05,
    num_leaves=31,
    random_state=42,
)

for defect in target_columns:
    print(f"Training for defect: {defect}")

    # 80/20 hold-out with a fixed seed and no stratification.
    X_train, X_val, y_train, y_val = train_test_split(
        X_scaled, y[defect], test_size=0.2, random_state=42
    )
    train_data = lgb.Dataset(X_train, label=y_train)
    val_data = lgb.Dataset(X_val, label=y_val)

    # At most 1000 boosting rounds; training stops early once the validation
    # AUC has gone 50 rounds without improving.
    model = lgb.train(
        params,
        train_data,
        num_boost_round=1000,
        valid_sets=[val_data],
        callbacks=[lgb.early_stopping(stopping_rounds=50)],
    )

    # Score the hold-out rows and remember the AUC for the summary cell.
    val_pred = model.predict(X_val)
    auc = roc_auc_score(y_val, val_pred)
    auc_scores[defect] = auc

    # Test-set scores become this defect's column in the prediction table.
    test_pred = model.predict(X_test_scaled)
    predictions[defect] = test_pred

Training for defect: Pastry
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[23]	valid_0's auc: 0.875113
Training for defect: Z_Scratch
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[266]	valid_0's auc: 0.959764
Training for defect: K_Scatch
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[88]	valid_0's auc: 0.986918
Training for defect: Stains
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[7]	valid_0's auc: 0.990592
Training for defect: Dirtiness
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[121]	valid_0's auc: 0.878813
Training for defect: Bumps
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[129]	valid_0's auc: 0.80435
Training for defect: Other_Faults
Training until validation scores don't i

In [None]:
# Step 6: Save submission file
submission_file = 'submission.csv'
predictions.to_csv(submission_file, index=False)  # index=False: no row-index column
files.download(submission_file)
print("✅ Submission file saved as 'submission.csv'")

# Step 7: Print AUC scores
print("\n🔍 AUC Scores per defect category:")
for defect in auc_scores:
    print(f"{defect}: {auc_scores[defect]:.4f}")

# Unweighted mean over the per-defect AUCs collected during training.
mean_auc = np.mean(list(auc_scores.values()))
print(f"\n🎯 Mean AUC Score: {mean_auc:.4f}")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ Submission file saved as 'submission.csv'

🔍 AUC Scores per defect category:
Pastry: 0.8751
Z_Scratch: 0.9598
K_Scatch: 0.9869
Stains: 0.9906
Dirtiness: 0.8788
Bumps: 0.8043
Other_Faults: 0.7114

🎯 Mean AUC Score: 0.8867


In [None]:
!pip install lightgbm --quiet

# Step 1: Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
import lightgbm as lgb

In [None]:
# Step 2: Upload datasets (train.csv and test.csv)
from google.colab import files  # upload train.csv and test.csv

# Step 3: Read datasets
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ("/content/train.csv", "/content/test.csv")
)

# Step 4: Define features and targets
target_columns = ['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']
# Anything that is not a label and not the row id is used as a feature.
excluded_columns = frozenset(target_columns + ['id'])
feature_columns = [name for name in train_df.columns if name not in excluded_columns]

ids = test_df['id']
X_test = test_df[feature_columns]
X = train_df[feature_columns]
y = train_df[target_columns]

# Optional: Scaling features
# Fit once on the training features, then push both matrices through the same transform.
scaler = StandardScaler()
scaler.fit(X)
X_scaled, X_test_scaled = scaler.transform(X), scaler.transform(X_test)

In [None]:
# Step 5: Train models
# `predictions` gains one column of test-set scores per defect;
# `auc_scores` maps each defect to the AUC measured on its 20% hold-out.
predictions = pd.DataFrame({'id': ids})
auc_scores = {}

for defect in target_columns:
    print(f"\n🚀 Training model for: {defect}")

    # Split with stratification on the current defect label
    X_train, X_val, y_train, y_val = train_test_split(
        X_scaled, y[defect], test_size=0.2, stratify=y[defect], random_state=42
    )

    # Handle class imbalance: weight positives by the negative/positive ratio of the
    # TRAINING rows only, so held-out labels never influence a training parameter.
    # Assumes the labels are 0/1 so that sum() counts the positives - TODO confirm.
    n_pos = int(y_train.sum())
    n_neg = len(y_train) - n_pos
    # The ratio is undefined without positives; fall back to 1.0 (no re-weighting)
    # instead of passing inf/nan to LightGBM.
    pos_weight = n_neg / n_pos if n_pos > 0 else 1.0

    train_data = lgb.Dataset(X_train, label=y_train)
    val_data = lgb.Dataset(X_val, label=y_val)

    params = {
        'objective': 'binary',
        'metric': 'auc',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'learning_rate': 0.03,
        'num_leaves': 40,
        'max_depth': 8,
        'scale_pos_weight': pos_weight,
        'random_state': 42
    }

    # At most 1500 boosting rounds; training stops early once the validation
    # AUC has gone 50 rounds without improving.
    model = lgb.train(
        params,
        train_data,
        valid_sets=[val_data],
        num_boost_round=1500,
        callbacks=[lgb.early_stopping(stopping_rounds=50)]
    )

    # Score the hold-out rows with the best iteration found by early stopping.
    val_pred = model.predict(X_val, num_iteration=model.best_iteration)
    auc = roc_auc_score(y_val, val_pred)
    auc_scores[defect] = auc

    # Predict for test
    test_pred = model.predict(X_test_scaled, num_iteration=model.best_iteration)
    predictions[defect] = test_pred


🚀 Training model for: Pastry
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[145]	valid_0's auc: 0.859689

🚀 Training model for: Z_Scratch
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[108]	valid_0's auc: 0.959229

🚀 Training model for: K_Scatch
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[140]	valid_0's auc: 0.986271

🚀 Training model for: Stains
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[155]	valid_0's auc: 0.993742

🚀 Training model for: Dirtiness
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[5]	valid_0's auc: 0.858321

🚀 Training model for: Bumps
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[158]	valid_0's auc: 0.807706

🚀 Training model for: Other_Faults
Training until validati

In [None]:
# Step 6: Save predictions
submission_file = "submission.csv"
predictions.to_csv(submission_file, index=False)  # index=False: no row-index column
print("✅ Submission file saved as 'submission.csv'")

# Step 7: AUC Results
print("\n🎯 AUC Scores:")
for defect in auc_scores:
    print(f"{defect}: {auc_scores[defect]:.4f}")

# Unweighted mean over the per-defect AUCs collected during training.
average_auc = np.mean(list(auc_scores.values()))
print(f"\n🔥 Average AUC Score: {average_auc:.4f}")

✅ Submission file saved as 'submission.csv'

🎯 AUC Scores:
Pastry: 0.8597
Z_Scratch: 0.9592
K_Scatch: 0.9863
Stains: 0.9937
Dirtiness: 0.8583
Bumps: 0.8077
Other_Faults: 0.7066

🔥 Average AUC Score: 0.8817
