In [57]:
#Import Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score,accuracy_score,classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier,AdaBoostClassifier, StackingClassifier,VotingClassifier


from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

import warnings
warnings.filterwarnings('ignore')

In [2]:
from xgboost import XGBClassifier


In [3]:
#Load Data
train = pd.read_csv(r"C:\Users\infan\OneDrive\Desktop\Gayathri\dataset\mini_hackathon\train.csv")
test=pd.read_csv(r"C:\Users\infan\OneDrive\Desktop\Gayathri\dataset\mini_hackathon\test.csv")
submission_data=pd.read_csv(r"C:\Users\infan\OneDrive\Desktop\Gayathri\dataset\mini_hackathon\sample_submission.csv")

In [4]:
#columns based on datatypes seperated
ignore_col=['employee_id']
target_col=['is_promoted']
num_col=[]
cat_col=[]
for col in train.columns:
    if col not in ignore_col + target_col:
        if train[col].dtypes == 'object':
            cat_col.append(col)
        else:
            num_col.append(col)

In [5]:
#categorical and numerical columns imputed and encoded
#categorical columns imputed by mode and encoded by onehotencoder
#numerical columns imputed by median and encoded by standardscalar
cat_pipe_encode=Pipeline(
    steps=[
    ('impute',SimpleImputer(strategy='most_frequent')),
    ('ohe',OneHotEncoder(handle_unknown='ignore'))
])
num_pip_encode=Pipeline(
    steps=[
    ('impute',SimpleImputer(strategy='median')),
    ('standardscalar',StandardScaler())
])
## map tranformation to features
preprocess=ColumnTransformer(
    transformers=[
        ('cat_encode',cat_pipe_encode,cat_col),
        ('num_encode',num_pip_encode,num_col)
    ]
)

In [6]:
X=train.drop(columns=target_col+ignore_col)
y=train[target_col]


In [15]:
model_pipeline = Pipeline(
    steps=[
        ('preprocess', preprocess),
        ('model', DecisionTreeClassifier())
    ]
)

In [19]:
train_X, val_X, train_y, val_y = train_test_split(X,y,random_state=42, test_size=0.3)
model_pipeline.fit(train_X, train_y)

In [378]:
def evalution(train_X,train_y,val_X,val_y,model):
    predicted_train_tgt = model.predict(train_X)
    predicted_val_tgt = model.predict(val_X)
    
    print("accuracy score train",accuracy_score(train_y,predicted_train_tgt))
    print("accuracy score test",accuracy_score(val_y,predicted_val_tgt))
    
    print("f1score train",f1_score(train_y,predicted_train_tgt))
    print("f1score test",f1_score(val_y,predicted_val_tgt))
    
    # print("classification train \n",classification_report(train_y,predicted_train_tgt))
    # print("classification test \n",classification_report(val_y,predicted_val_tgt))

In [118]:
evalution(train_X,train_y,val_X,val_y)

TypeError: evalution() missing 1 required positional argument: 'model'

In [28]:
#1.oversampling
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=42)
train_X_res, train_y_res = ros.fit_resample(train_X, train_y)


In [36]:
model_pipeline.fit(train_X_res, train_y_res)
evalution(train_X_res,train_y_res,val_X,val_y)

accuracy score train 0.8232372252355125
accuracy score train 0.7195159034239494
f1score train 0.8429263590472084
f1score test 0.3526108927568782


In [40]:
data=pd.concat([train_X_res,train_y_res])
data['is_promoted'].value_counts()

is_promoted
0.0    35030
1.0    35030
Name: count, dtype: int64

In [44]:
#1.undersampling
from imblearn.under_sampling import RandomUnderSampler
uos = RandomUnderSampler(random_state=42)
train_X_ues, train_y_ues = uos.fit_resample(train_X, train_y)
model_pipeline.fit(train_X_ues, train_y_ues)
evalution(train_X_ues,train_y_ues,val_X,val_y)

accuracy score train 0.8278860569715143
accuracy score train 0.711062458188895
f1score train 0.8468925046679114
f1score test 0.34962354551676933


In [71]:
data=pd.concat([train_X_ues,train_y_ues])
data['is_promoted'].value_counts()

is_promoted
0.0    3335
1.0    3335
Name: count, dtype: int64

In [49]:
model_pipeline2 = Pipeline(
    steps=[
        ('preprocess', preprocess),
        ('model', DecisionTreeClassifier(
            criterion='entropy',
            max_depth=None,
            max_leaf_nodes=20,
            min_samples_leaf=1,
            min_samples_split=2
        ))
    ]
)

In [65]:
#1.oversampling
model_pipeline2.fit(train_X_res, train_y_res)
evalution(train_X_res,train_y_res,val_X,val_y,model_pipeline2)

accuracy score train 0.806894090779332
accuracy score train 0.6822964179286019
f1score train 0.8327915858165145
f1score test 0.33214011761697776
classification train 
               precision    recall  f1-score   support

           0       0.94      0.65      0.77     35030
           1       0.73      0.96      0.83     35030

    accuracy                           0.81     70060
   macro avg       0.84      0.81      0.80     70060
weighted avg       0.84      0.81      0.80     70060

classification test 
               precision    recall  f1-score   support

           0       1.00      0.66      0.79     15110
           1       0.20      0.97      0.33      1333

    accuracy                           0.68     16443
   macro avg       0.60      0.82      0.56     16443
weighted avg       0.93      0.68      0.75     16443



In [67]:

model_pipeline2.fit(train_X, train_y)
evalution(train_X,train_y,val_X,val_y,model_pipeline2)

accuracy score train 0.9335331682523134
accuracy score train 0.9372985464939488
f1score train 0.40780306549001394
f1score test 0.39813193228254523
classification train 
               precision    recall  f1-score   support

           0       0.93      1.00      0.96     35030
           1       0.90      0.26      0.41      3335

    accuracy                           0.93     38365
   macro avg       0.92      0.63      0.69     38365
weighted avg       0.93      0.93      0.92     38365

classification test 
               precision    recall  f1-score   support

           0       0.94      1.00      0.97     15110
           1       0.90      0.26      0.40      1333

    accuracy                           0.94     16443
   macro avg       0.92      0.63      0.68     16443
weighted avg       0.93      0.94      0.92     16443



In [75]:
model_pipeline3 = Pipeline(
    steps=[
        ('preprocess', preprocess),
        ('model', GradientBoostingClassifier(
            criterion='friedman_mse',
            max_depth=20,
            max_leaf_nodes=20,
            min_samples_leaf=2,
            min_samples_split=2
        ))
    ]
)

In [77]:
#1.oversampling
model_pipeline3.fit(train_X_res, train_y_res)
evalution(train_X_res,train_y_res,val_X,val_y,model_pipeline3)

accuracy score train 0.8558521267485013
accuracy score test 0.7617831295992216
f1score train 0.8689309678005477
f1score test 0.37498005425243336
classification train 
               precision    recall  f1-score   support

           0       0.94      0.76      0.84     35030
           1       0.80      0.96      0.87     35030

    accuracy                           0.86     70060
   macro avg       0.87      0.86      0.85     70060
weighted avg       0.87      0.86      0.85     70060

classification test 
               precision    recall  f1-score   support

           0       0.99      0.75      0.85     15110
           1       0.24      0.88      0.37      1333

    accuracy                           0.76     16443
   macro avg       0.61      0.82      0.61     16443
weighted avg       0.93      0.76      0.81     16443



In [78]:
#1.undersampling
model_pipeline3.fit(train_X_ues, train_y_ues)
evalution(train_X_ues,train_y_ues,val_X,val_y,model_pipeline3)

accuracy score train 0.8730134932533733
accuracy score test 0.732165663200146
f1score train 0.8832529290144728
f1score test 0.3580174927113703
classification train 
               precision    recall  f1-score   support

           0       0.95      0.79      0.86      3335
           1       0.82      0.96      0.88      3335

    accuracy                           0.87      6670
   macro avg       0.88      0.87      0.87      6670
weighted avg       0.88      0.87      0.87      6670

classification test 
               precision    recall  f1-score   support

           0       0.99      0.72      0.83     15110
           1       0.22      0.92      0.36      1333

    accuracy                           0.73     16443
   macro avg       0.61      0.82      0.59     16443
weighted avg       0.93      0.73      0.79     16443



In [85]:
model_pipeline3.fit(train_X,train_y)
evalution(train_X,train_y,val_X,val_y,model_pipeline3)

accuracy score train 0.9441678613319432
accuracy score test 0.9450830140485313
f1score train 0.530469092503288
f1score test 0.5030269675288938
classification train 
               precision    recall  f1-score   support

           0       0.94      1.00      0.97     35030
           1       0.99      0.36      0.53      3335

    accuracy                           0.94     38365
   macro avg       0.96      0.68      0.75     38365
weighted avg       0.95      0.94      0.93     38365

classification test 
               precision    recall  f1-score   support

           0       0.95      1.00      0.97     15110
           1       0.94      0.34      0.50      1333

    accuracy                           0.95     16443
   macro avg       0.94      0.67      0.74     16443
weighted avg       0.95      0.95      0.93     16443



In [87]:
model_pipeline4 = Pipeline(
    steps=[
        ('preprocess', preprocess),
        ('model', XGBClassifier())
    ]
)

In [89]:
model_pipeline4.fit(train_X,train_y)
evalution(train_X,train_y,val_X,val_y,model_pipeline4)

accuracy score train 0.9484686563273818
accuracy score test 0.9444748525208295
f1score train 0.5840521775720597
f1score test 0.5094035464803869
classification train 
               precision    recall  f1-score   support

           0       0.95      1.00      0.97     35030
           1       0.98      0.42      0.58      3335

    accuracy                           0.95     38365
   macro avg       0.96      0.71      0.78     38365
weighted avg       0.95      0.95      0.94     38365

classification test 
               precision    recall  f1-score   support

           0       0.95      1.00      0.97     15110
           1       0.90      0.36      0.51      1333

    accuracy                           0.94     16443
   macro avg       0.92      0.68      0.74     16443
weighted avg       0.94      0.94      0.93     16443



In [91]:
#1.oversampling
model_pipeline4.fit(train_X_res, train_y_res)
evalution(train_X_res,train_y_res,val_X,val_y,model_pipeline4)

accuracy score train 0.90870682272338
accuracy score test 0.8225384662166272
f1score train 0.9141107589837245
f1score test 0.40763296792529435
classification train 
               precision    recall  f1-score   support

           0       0.97      0.85      0.90     35030
           1       0.86      0.97      0.91     35030

    accuracy                           0.91     70060
   macro avg       0.92      0.91      0.91     70060
weighted avg       0.92      0.91      0.91     70060

classification test 
               precision    recall  f1-score   support

           0       0.97      0.83      0.90     15110
           1       0.28      0.75      0.41      1333

    accuracy                           0.82     16443
   macro avg       0.63      0.79      0.65     16443
weighted avg       0.92      0.82      0.86     16443



In [101]:
model_pipeline4 = Pipeline(
    steps=[
        ('preprocess', preprocess),
        ('model', XGBClassifier())
    ]
)
param_grid = {
    'model__n_estimators': [50, 100, 200],  # Number of trees in the forest
    'model__max_depth': [3, 5, 7],          # Maximum tree depth
    'model__learning_rate': [0.01, 0.1, 0.2],  # Step size shrinkage
    'model__subsample': [0.6, 0.8, 1.0],    # Fraction of samples used per tree
    'model__colsample_bytree': [0.6, 0.8, 1.0],  # Fraction of features per tree
    'model__gamma': [0, 0.1, 0.2],          # Minimum loss reduction to split a node
    'model__scale_pos_weight': [1, 2, 5]    # Balances positive and negative classes
}
grid_search= GridSearchCV(estimator=model_pipeline4,param_grid=param_grid,scoring=f1_score,cv=7,n_jobs=-1)
grid_search.fit(train_X,train_y)

In [105]:
grid_search.best_params_

{'model__colsample_bytree': 0.6,
 'model__gamma': 0,
 'model__learning_rate': 0.01,
 'model__max_depth': 3,
 'model__n_estimators': 50,
 'model__scale_pos_weight': 1,
 'model__subsample': 0.6}

In [109]:
model_pipeline5=grid_search.best_estimator_

In [124]:
evalution(train_X,train_y,val_X,val_y,model_pipeline5)

accuracy score train 0.9130718102437118
accuracy score test 0.9189320683573557
f1score train 0.0
f1score test 0.0
classification train 
               precision    recall  f1-score   support

           0       0.91      1.00      0.95     35030
           1       0.00      0.00      0.00      3335

    accuracy                           0.91     38365
   macro avg       0.46      0.50      0.48     38365
weighted avg       0.83      0.91      0.87     38365

classification test 
               precision    recall  f1-score   support

           0       0.92      1.00      0.96     15110
           1       0.00      0.00      0.00      1333

    accuracy                           0.92     16443
   macro avg       0.46      0.50      0.48     16443
weighted avg       0.84      0.92      0.88     16443



In [137]:
model_pipeline6 = Pipeline(
    steps=[
        ('preprocess', preprocess),
        ('model', XGBClassifier(
            colsample_bytree=0.6,
            gamma=0,
            learning_rate=0.01,
            max_depth=3,
            n_estimators=50,
            subsample=0.6
        ))
    ]
)

model_pipeline6.fit(train_X, train_y)
evalution(train_X, train_y, val_X, val_y, model_pipeline6)


accuracy score train 0.9130718102437118
accuracy score test 0.9189320683573557
f1score train 0.0
f1score test 0.0
classification train 
               precision    recall  f1-score   support

           0       0.91      1.00      0.95     35030
           1       0.00      0.00      0.00      3335

    accuracy                           0.91     38365
   macro avg       0.46      0.50      0.48     38365
weighted avg       0.83      0.91      0.87     38365

classification test 
               precision    recall  f1-score   support

           0       0.92      1.00      0.96     15110
           1       0.00      0.00      0.00      1333

    accuracy                           0.92     16443
   macro avg       0.46      0.50      0.48     16443
weighted avg       0.84      0.92      0.88     16443



In [151]:
model_pipeline7 = Pipeline(
    steps=[
        ('preprocess', preprocess),
        ('model', XGBClassifier(
            colsample_bytree=0.6,
            gamma=0,
            learning_rate=0.5,
            max_depth=2,
            
            n_estimators=150,
            subsample=0.6
        ))
    ]
)

model_pipeline7.fit(train_X, train_y)
evalution(train_X, train_y, val_X, val_y, model_pipeline7)


accuracy score train 0.941352795516747
accuracy score test 0.944292404062519
f1score train 0.5104438642297651
f1score test 0.5032537960954447
classification train 
               precision    recall  f1-score   support

           0       0.94      1.00      0.97     35030
           1       0.93      0.35      0.51      3335

    accuracy                           0.94     38365
   macro avg       0.94      0.67      0.74     38365
weighted avg       0.94      0.94      0.93     38365

classification test 
               precision    recall  f1-score   support

           0       0.95      1.00      0.97     15110
           1       0.91      0.35      0.50      1333

    accuracy                           0.94     16443
   macro avg       0.93      0.67      0.74     16443
weighted avg       0.94      0.94      0.93     16443



In [177]:
model_pipeline8 = Pipeline(
    steps=[
        ('preprocess', preprocess),
        ('model', XGBClassifier(
            colsample_bytree=0.6,
            gamma=0,
            learning_rate=0.5,
            max_depth=2,
            n_estimators=1000,
            subsample=0.6
        ))
    ]
)

model_pipeline8.fit(train_X, train_y)
evalution(train_X, train_y, val_X, val_y, model_pipeline8)


accuracy score train 0.944063599635084
accuracy score test 0.9416773094934014
f1score train 0.5521702838063439
f1score test 0.5049044914816727
classification train 
               precision    recall  f1-score   support

           0       0.95      1.00      0.97     35030
           1       0.91      0.40      0.55      3335

    accuracy                           0.94     38365
   macro avg       0.93      0.70      0.76     38365
weighted avg       0.94      0.94      0.93     38365

classification test 
               precision    recall  f1-score   support

           0       0.95      0.99      0.97     15110
           1       0.81      0.37      0.50      1333

    accuracy                           0.94     16443
   macro avg       0.88      0.68      0.74     16443
weighted avg       0.94      0.94      0.93     16443



In [187]:
submission_data['is_promoted']=model_pipeline8.predict(test)
submission_data.to_csv('model8.csv', index=False)

In [183]:
model_pipeline9 = Pipeline(
    steps=[
        ('preprocess', preprocess),
        ('model', XGBClassifier(
            colsample_bytree=0.6,
            gamma=0,
            learning_rate=0.5,
            max_depth=2,
            n_estimators=1000,
            subsample=0.6
        ))
    ]
)

model_pipeline9.fit(train_X, train_y)
evalution(train_X_ues, train_y_ues, val_X, val_y, model_pipeline9)


accuracy score train 0.6970014992503748
accuracy score test 0.9416773094934014
f1score train 0.5669595028926505
f1score test 0.5049044914816727
classification train 
               precision    recall  f1-score   support

           0       0.62      1.00      0.77      3335
           1       0.99      0.40      0.57      3335

    accuracy                           0.70      6670
   macro avg       0.81      0.70      0.67      6670
weighted avg       0.81      0.70      0.67      6670

classification test 
               precision    recall  f1-score   support

           0       0.95      0.99      0.97     15110
           1       0.81      0.37      0.50      1333

    accuracy                           0.94     16443
   macro avg       0.88      0.68      0.74     16443
weighted avg       0.94      0.94      0.93     16443



In [394]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'model__learning_rate': [0.01, 0.1, 0.2],
    'model__max_depth': [3, 5, 7],
    'model__n_estimators': [500, 1000],
    'model__subsample': [0.6, 0.8, 1.0],
    'model__colsample_bytree': [0.6, 0.8, 1.0],
    'model__gamma': [0, 1, 5],
}

grid_search = GridSearchCV(
    estimator=model_pipeline9,
    param_grid=param_grid,
    cv=3,
    scoring='accuracy',  # Adjust based on your metric
    verbose=1,
    n_jobs=-1
)

grid_search.fit(train_X, train_y)
print("Best Parameters:", grid_search.best_params_)


Fitting 3 folds for each of 486 candidates, totalling 1458 fits
Best Parameters: {'model__colsample_bytree': 0.6, 'model__gamma': 0, 'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 500, 'model__subsample': 1.0}


In [396]:
model13=grid_search.best_estimator_

In [403]:
model13.fit(train_X, train_y)
evalution(train_X, train_y, val_X, val_y, model13)

accuracy score train 0.9424736087579826
accuracy score test 0.9455087271179226
f1score train 0.5152646606632989
f1score test 0.5055187637969095


In [401]:
submission_data['is_promoted']=model13.predict(test)
submission_data.to_csv('model13.csv', index=False)

In [501]:
model_pipeline21 = Pipeline(
    steps=[
        ('preprocess', preprocess),
        ('model', XGBClassifier(
            colsample_bytree=0.6,
            gamma=0.3,
            learning_rate=0.5,
            max_depth=2,
            n_estimators=1000,
            subsample=0.6,
            reg_alpha=0.02,   # Add L1 regularization
            reg_lambda=1.0   # Add L2 regularization
        ))
    ]
)

model_pipeline21.fit(train_X, train_y)
evalution(train_X, train_y, val_X, val_y, model_pipeline21)


accuracy score train 0.9444545809983057
accuracy score test 0.9421638387155629
f1score train 0.554649947753396
f1score test 0.5090345895715023


In [507]:
submission_data['is_promoted']=model_pipeline21.predict(test)
submission_data.to_csv('model21.csv', index=False)

In [525]:
model_pipeline2 = Pipeline(
    steps=[
        ('preprocess', preprocess),
        ('model', DecisionTreeClassifier(
            criterion='entropy',
            max_depth=None,
            max_leaf_nodes=20,
            min_samples_leaf=1,
            min_samples_split=3,
            random_state =42
        ))
    ]
)

In [527]:
model_pipeline2.fit(train_X, train_y)
evalution(train_X, train_y, val_X, val_y, model_pipeline2)

accuracy score train 0.9335331682523134
accuracy score test 0.9372985464939488
f1score train 0.40780306549001394
f1score test 0.39813193228254523


In [None]:
DecisionTreeClassifier

In [503]:
model_pipeline21 = Pipeline(
    steps=[
        ('preprocess', preprocess),
        ('model', XGBClassifier(
            colsample_bytree=0.6,
            gamma=0.3,
            learning_rate=0.5,
            max_depth=2,
            n_estimators=1000,
            subsample=0.6,
            reg_alpha=0.02,   # Add L1 regularization
            reg_lambda=1.0   # Add L2 regularization
        ))
    ]
)

model_pipeline21.fit(train_X_res, train_y_res)
evalution(train_X, train_y, val_X, val_y, model_pipeline21)

accuracy score train 0.8280985273035318
accuracy score test 0.8070303472602324
f1score train 0.4839996870354432
f1score test 0.4014336917562724


In [505]:
model_pipeline21 = Pipeline(
    steps=[
        ('preprocess', preprocess),
        ('model', XGBClassifier(
            colsample_bytree=0.6,
            gamma=0.3,
            learning_rate=0.5,
            max_depth=2,
            n_estimators=1000,
            subsample=0.6,
            reg_alpha=0.02,   # Add L1 regularization
            reg_lambda=1.0   # Add L2 regularization
        ))
    ]
)

model_pipeline21.fit(train_X_ues, train_y_ues)
evalution(train_X, train_y, val_X, val_y, model_pipeline21)

accuracy score train 0.7882184282549198
accuracy score test 0.7721826917229216
f1score train 0.428661838126714
f1score test 0.36594448205822616


AttributeError: 'DataFrame' object has no attribute 'unique'

Parameters
----------

    n_estimators : Optional[int]
        Number of boosting rounds.

    max_depth :  typing.Optional[int]

        Maximum tree depth for base learners.

    max_leaves : typing.Optional[int]

        Maximum number of leaves; 0 indicates no limit.

    max_bin : typing.Optional[int]

        If using histogram-based algorithm, maximum number of bins per feature

    grow_policy : typing.Optional[str]

        Tree growing policy.

        - depthwise: Favors splitting at nodes closest to the node,
        - lossguide: Favors splitting at nodes with highest loss change.

    learning_rate : typing.Optional[float]

        Boosting learning rate (xgb's "eta")

    verbosity : typing.Optional[int]

        The degree of verbosity. Valid values are 0 (silent) - 3 (debug).

    objective : typing.Union[str, xgboost.sklearn._SklObjWProto, typing.Callable[[typing.Any, typing.Any], typing.Tuple[numpy.ndarray, numpy.ndarray]], NoneType]

        Specify the learning task and the corresponding learning objective or a custom
        objective function to be used.

        For custom objective, see :doc:`/tutorials/custom_metric_obj` and
        :ref:`custom-obj-metric` for more information, along with the end note for
        function signatures.

    booster: typing.Optional[str]

        Specify which booster to use: ``gbtree``, ``gblinear`` or ``dart``.

    tree_method : typing.Optional[str]

        Specify which tree method to use.  Default to auto.  If this parameter is set to
        default, XGBoost will choose the most conservative option available.  It's
        recommended to study this option from the parameters document :doc:`tree method
        </treemethod>`

    n_jobs : typing.Optional[int]

        Number of parallel threads used to run xgboost.  When used with other
        Scikit-Learn algorithms like grid search, you may choose which algorithm to
        parallelize and balance the threads.  Creating thread contention will
        significantly slow down both algorithms.

    gamma : typing.Optional[float]

        (min_split_loss) Minimum loss reduction required to make a further partition on
        a leaf node of the tree.

    min_child_weight : typing.Optional[float]

        Minimum sum of instance weight(hessian) needed in a child.

    max_delta_step : typing.Optional[float]

        Maximum delta step we allow each tree's weight estimation to be.

    subsample : typing.Optional[float]

        Subsample ratio of the training instance.

    sampling_method : typing.Optional[str]

        Sampling method. Used only by the GPU version of ``hist`` tree method.

        - ``uniform``: Select random training instances uniformly.
        - ``gradient_based``: Select random training instances with higher probability
            when the gradient and hessian are larger. (cf. CatBoost)

    colsample_bytree : typing.Optional[float]

        Subsample ratio of columns when constructing each tree.

    colsample_bylevel : typing.Optional[float]

        Subsample ratio of columns for each level.

    colsample_bynode : typing.Optional[float]

        Subsample ratio of columns for each split.

    reg_alpha : typing.Optional[float]

        L1 regularization term on weights (xgb's alpha).

    reg_lambda : typing.Optional[float]

        L2 regularization term on weights (xgb's lambda).

    scale_pos_weight : typing.Optional[float]
        Balancing of positive and negative weights.

    base_score : typing.Optional[float]

        The initial prediction score of all instances, global bias.

    random_state : typing.Union[numpy.random.mtrand.RandomState, numpy.random._generator.Generator, int, NoneType]

        Random number seed.

        .. note::

           Using gblinear booster with shotgun updater is nondeterministic as
           it uses Hogwild algorithm.

    missing : float

        Value in the data which needs to be present as a missing value. Default to
        :py:data:`numpy.nan`.

    num_parallel_tree: typing.Optional[int]

        Used for boosting random forest.

    monotone_constraints : typing.Union[typing.Dict[str, int], str, NoneType]

        Constraint of variable monotonicity.  See :doc:`tutorial </tutorials/monotonic>`
        for more information.

    interaction_constraints : typing.Union[str, typing.List[typing.Tuple[str]], NoneType]

        Constraints for interaction representing permitted interactions.  The
        constraints must be specified in the form of a nested list, e.g. ``[[0, 1], [2,
        3, 4]]``, where each inner list is a group of indices of features that are
        allowed to interact with each other.  See :doc:`tutorial
        </tutorials/feature_interaction_constraint>` for more information

    importance_type: typing.Optional[str]

        The feature importance type for the feature_importances\_ property:

        * For tree model, it's either "gain", "weight", "cover", "total_gain" or
          "total_cover".
        * For linear model, only "weight" is defined and it's the normalized
          coefficients without bias.

    device : typing.Optional[str]

        .. versionadded:: 2.0.0

        Device ordinal, available options are `cpu`, `cuda`, and `gpu`.

    validate_parameters : typing.Optional[bool]

        Give warnings for unknown parameter.

    enable_categorical : bool

        See the same parameter of :py:class:`DMatrix` for details.

    feature_types : typing.Optional[typing.Sequence[str]]

        .. versionadded:: 1.7.0

        Used for specifying feature types without constructing a dataframe. See
        :py:class:`DMatrix` for details.

    max_cat_to_onehot : typing.Optional[int]

        .. versionadded:: 1.6.0

        .. note:: This parameter is experimental

        A threshold for deciding whether XGBoost should use one-hot encoding based split
        for categorical data.  When number of categories is lesser than the threshold
        then one-hot encoding is chosen, otherwise the categories will be partitioned
        into children nodes. Also, `enable_categorical` needs to be set to have
        categorical feature support. See :doc:`Categorical Data
        </tutorials/categorical>` and :ref:`cat-param` for details.

    max_cat_threshold : typing.Optional[int]

        .. versionadded:: 1.7.0

        .. note:: This parameter is experimental

        Maximum number of categories considered for each split. Used only by
        partition-based splits for preventing over-fitting. Also, `enable_categorical`
        needs to be set to have categorical feature support. See :doc:`Categorical Data
        </tutorials/categorical>` and :ref:`cat-param` for details.

    multi_strategy : typing.Optional[str]

        .. versionadded:: 2.0.0

        .. note:: This parameter is working-in-progress.

        The strategy used for training multi-target models, including multi-target
        regression and multi-class classification. See :doc:`/tutorials/multioutput` for
        more information.

        - ``one_output_per_tree``: One model for each target.
        - ``multi_output_tree``:  Use multi-target trees.

    eval_metric : typing.Union[str, typing.List[str], typing.Callable, NoneType]

        .. versionadded:: 1.6.0

        Metric used for monitoring the training result and early stopping.  It can be a
        string or list of strings as names of predefined metric in XGBoost (See
        doc/parameter.rst), one of the metrics in :py:mod:`sklearn.metrics`, or any
        other user defined metric that looks like `sklearn.metrics`.

        If custom objective is also provided, then custom metric should implement the
        corresponding reverse link function.

        Unlike the `scoring` parameter commonly used in scikit-learn, when a callable
        object is provided, it's assumed to be a cost function and by default XGBoost
        will minimize the result during early stopping.

        For advanced usage on Early stopping like directly choosing to maximize instead
        of minimize, see :py:obj:`xgboost.callback.EarlyStopping`.

        See :doc:`/tutorials/custom_metric_obj` and :ref:`custom-obj-metric` for more
        information.

        .. code-block:: python

            from sklearn.datasets import load_diabetes
            from sklearn.metrics import mean_absolute_error
            X, y = load_diabetes(return_X_y=True)
            reg = xgb.XGBRegressor(
                tree_method="hist",
                eval_metric=mean_absolute_error,
            )
            reg.fit(X, y, eval_set=[(X, y)])

    early_stopping_rounds : typing.Optional[int]

        .. versionadded:: 1.6.0

        - Activates early stopping. Validation metric needs to improve at least once in
          every **early_stopping_rounds** round(s) to continue training.  Requires at
          least one item in **eval_set** in :py:meth:`fit`.

        - If early stopping occurs, the model will have two additional attributes:
          :py:attr:`best_score` and :py:attr:`best_iteration`. These are used by the
          :py:meth:`predict` and :py:meth:`apply` methods to determine the optimal
          number of trees during inference. If users want to access the full model
          (including trees built after early stopping), they can specify the
          `iteration_range` in these inference methods. In addition, other utilities
          like model plotting can also use the entire model.

        - If you prefer to discard the trees after `best_iteration`, consider using the
          callback function :py:class:`xgboost.callback.EarlyStopping`.

        - If there's more than one item in **eval_set**, the last entry will be used for
          early stopping.  If there's more than one metric in **eval_metric**, the last
          metric will be used for early stopping.

    callbacks : typing.Optional[typing.List[xgboost.callback.TrainingCallback]]

        List of callback functions that are applied at end of each iteration.
        It is possible to use predefined callbacks by using
        :ref:`Callback API <callback_api>`.

        .. note::

           States in callback are not preserved during training, which means callback
           objects can not be reused for multiple training sessions without
           reinitialization or deepcopy.

        .. code-block:: python

            for params in parameters_grid:
                # be sure to (re)initialize the callbacks before each run
                callbacks = [xgb.callback.LearningRateScheduler(custom_rates)]
                reg = xgboost.XGBRegressor(**params, callbacks=callbacks)
                reg.fit(X, y)

    kwargs : typing.Optional[typing.Any]

        Keyword arguments for XGBoost Booster object.  Full documentation of parameters
        can be found :doc:`here </parameter>`.
        Attempting to set a parameter via the constructor args and \*\*kwargs
        dict simultaneously will result in a TypeError.

        .. note:: \*\*kwargs unsupported by scikit-learn

            \*\*kwargs is unsupported by scikit-learn.  We do not guarantee
            that parameters passed via this argument will interact properly
            with scikit-learn.

        .. note::  Custom objective function

            A custom objective function can be provided for the ``objective``
            parameter. In this case, it should have the signature ``objective(y_true,
            y_pred) -> [grad, hess]`` or ``objective(y_true, y_pred, *, sample_weight)
            -> [grad, hess]``:

            y_true: array_like of shape [n_samples]
                The target values
            y_pred: array_like of shape [n_samples]
                The predicted values
            sample_weight :
                Optional sample weights.

            grad: array_like of shape [n_samples]
                The value of the gradient for each sample point.
            hess: array_like of shape [n_samples]
                The value of the second derivative for each sample point
File:           c:\users\infan\anaconda3\lib\site-packages\xgboost\sklearn.py
Type:           type
Subclasses:     XGBRFClassifier

In [189]:
submission_data['is_promoted']=model_pipeline9.predict(test)
submission_data.to_csv('model9.csv', index=False)

In [215]:
estimators = [
    ('m9', model_pipeline9),
    ('m8', model_pipeline8),
    ('m7', model_pipeline7)
]
stacking_model3 = StackingClassifier(estimators=estimators, final_estimator= GradientBoostingClassifier(criterion='friedman_mse',max_depth=20, max_leaf_nodes=20, min_samples_leaf=2, min_samples_split=2 ))


In [217]:
stacking_model3.fit(train_X, train_y)
evalution(train_X, train_y, val_X, val_y, stacking_model3)

accuracy score train 0.9405968982145184
accuracy score test 0.9425287356321839
f1score train 0.5007667031763418
f1score test 0.4827586206896552
classification train 
               precision    recall  f1-score   support

           0       0.94      1.00      0.97     35030
           1       0.93      0.34      0.50      3335

    accuracy                           0.94     38365
   macro avg       0.94      0.67      0.73     38365
weighted avg       0.94      0.94      0.93     38365

classification test 
               precision    recall  f1-score   support

           0       0.94      1.00      0.97     15110
           1       0.89      0.33      0.48      1333

    accuracy                           0.94     16443
   macro avg       0.92      0.66      0.73     16443
weighted avg       0.94      0.94      0.93     16443



In [195]:
stacking_model3

In [219]:
# voting_model3
clf1 = model_pipeline7
clf2 = model_pipeline8
clf3 = model_pipeline9


voting_model3 = VotingClassifier(estimators=[('dt', clf1), ('LR', clf2), ('LR-E', clf3)], voting='soft')


In [221]:
voting_model3.fit(train_X, train_y)
evalution(train_X, train_y, val_X, val_y, voting_model3)

accuracy score train 0.944063599635084
accuracy score test 0.9416773094934014
f1score train 0.5521702838063439
f1score test 0.5049044914816727
classification train 
               precision    recall  f1-score   support

           0       0.95      1.00      0.97     35030
           1       0.91      0.40      0.55      3335

    accuracy                           0.94     38365
   macro avg       0.93      0.70      0.76     38365
weighted avg       0.94      0.94      0.93     38365

classification test 
               precision    recall  f1-score   support

           0       0.95      0.99      0.97     15110
           1       0.81      0.37      0.50      1333

    accuracy                           0.94     16443
   macro avg       0.88      0.68      0.74     16443
weighted avg       0.94      0.94      0.93     16443



In [223]:
voting_model3.fit(train_X_res, train_y_res)
evalution(train_X_res, train_y_res, val_X, val_y, voting_model3)

accuracy score train 0.8744361975449615
accuracy score test 0.8064221857325305
f1score train 0.8811264408199667
f1score test 0.3997737129926457
classification train 
               precision    recall  f1-score   support

           0       0.92      0.82      0.87     35030
           1       0.84      0.93      0.88     35030

    accuracy                           0.87     70060
   macro avg       0.88      0.87      0.87     70060
weighted avg       0.88      0.87      0.87     70060

classification test 
               precision    recall  f1-score   support

           0       0.98      0.81      0.88     15110
           1       0.27      0.80      0.40      1333

    accuracy                           0.81     16443
   macro avg       0.62      0.80      0.64     16443
weighted avg       0.92      0.81      0.85     16443



In [225]:
voting_model3.fit(train_X_ues, train_y_ues)
evalution(train_X_ues, train_y_ues, val_X, val_y, voting_model3)

accuracy score train 0.8760119940029985
accuracy score test 0.7705406555981269
f1score train 0.8798838053740015
f1score test 0.3644938521138622
classification train 
               precision    recall  f1-score   support

           0       0.90      0.84      0.87      3335
           1       0.85      0.91      0.88      3335

    accuracy                           0.88      6670
   macro avg       0.88      0.88      0.88      6670
weighted avg       0.88      0.88      0.88      6670

classification test 
               precision    recall  f1-score   support

           0       0.98      0.77      0.86     15110
           1       0.24      0.81      0.36      1333

    accuracy                           0.77     16443
   macro avg       0.61      0.79      0.61     16443
weighted avg       0.92      0.77      0.82     16443



In [253]:
model_pipeline10 = Pipeline(
    steps=[
        ('preprocess', preprocess),
        ('model', XGBClassifier(
            colsample_bytree=0.6,
            gamma=0,
            learning_rate=0.15,
            max_depth=2,
            n_estimators=1000,
            subsample=0.6
            
        ))
    ]
)

model_pipeline10.fit(train_X, train_y)
evalution(train_X, train_y, val_X, val_y, model_pipeline10)


accuracy score train 0.9416655806073244
accuracy score test 0.9438666909931278
f1score train 0.5155844155844156
f1score test 0.5002707092582567
classification train 
               precision    recall  f1-score   support

           0       0.94      1.00      0.97     35030
           1       0.93      0.36      0.52      3335

    accuracy                           0.94     38365
   macro avg       0.93      0.68      0.74     38365
weighted avg       0.94      0.94      0.93     38365

classification test 
               precision    recall  f1-score   support

           0       0.95      1.00      0.97     15110
           1       0.90      0.35      0.50      1333

    accuracy                           0.94     16443
   macro avg       0.92      0.67      0.74     16443
weighted avg       0.94      0.94      0.93     16443



In [257]:
estimators = [
    ('m9', model_pipeline9),
    ('rf', model_pipeline)
]

stacking_model4 = StackingClassifier(estimators=estimators, final_estimator= LogisticRegression())
stacking_model4.fit(train_X,train_y)
evalution(train_X, train_y, val_X, val_y, stacking_model4)

accuracy score train 0.9420565619705461
accuracy score test 0.9436234263820471
f1score train 0.5322953923837577
f1score test 0.5123619147816938
classification train 
               precision    recall  f1-score   support

           0       0.94      1.00      0.97     35030
           1       0.89      0.38      0.53      3335

    accuracy                           0.94     38365
   macro avg       0.92      0.69      0.75     38365
weighted avg       0.94      0.94      0.93     38365

classification test 
               precision    recall  f1-score   support

           0       0.95      0.99      0.97     15110
           1       0.86      0.37      0.51      1333

    accuracy                           0.94     16443
   macro avg       0.90      0.68      0.74     16443
weighted avg       0.94      0.94      0.93     16443



In [259]:
submission_data['is_promoted']=stacking_model4.predict(test)
submission_data.to_csv('stacking_4.csv', index=False)

In [251]:
estimators = [
    ('m9', model_pipeline9),
    ('rf', model_pipeline)
]

stacking_model4 = StackingClassifier(estimators=estimators, final_estimator= RandomForestClassifier())
stacking_model4.fit(train_X,train_y)
evalution(train_X, train_y, val_X, val_y, stacking_model4)

accuracy score train 0.9364524957643686
accuracy score test 0.9372377303411786
f1score train 0.5174188440221694
f1score test 0.4901185770750988
classification train 
               precision    recall  f1-score   support

           0       0.94      0.99      0.97     35030
           1       0.76      0.39      0.52      3335

    accuracy                           0.94     38365
   macro avg       0.85      0.69      0.74     38365
weighted avg       0.93      0.94      0.93     38365

classification test 
               precision    recall  f1-score   support

           0       0.95      0.99      0.97     15110
           1       0.72      0.37      0.49      1333

    accuracy                           0.94     16443
   macro avg       0.83      0.68      0.73     16443
weighted avg       0.93      0.94      0.93     16443



In [283]:


model_pipeline14 = Pipeline(
    steps=[
        ('preprocess', preprocess),
        ('model', XGBClassifier(
            objective = 'binary:logistic',
            max_depth= 4,
            alpha= 10,
            learning_rate= 1.0,
            n_estimators=100
        ))
    ]
)

model_pipeline14.fit(train_X, train_y)
evalution(train_X_ues, train_y_ues, val_X, val_y, model_pipeline14)

accuracy score train 0.6790104947526237
accuracy score test 0.9444140363680593
f1score train 0.5287255117763592
f1score test 0.504875406283857
classification train 
               precision    recall  f1-score   support

           0       0.61      1.00      0.76      3335
           1       0.99      0.36      0.53      3335

    accuracy                           0.68      6670
   macro avg       0.80      0.68      0.64      6670
weighted avg       0.80      0.68      0.64      6670

classification test 
               precision    recall  f1-score   support

           0       0.95      1.00      0.97     15110
           1       0.91      0.35      0.50      1333

    accuracy                           0.94     16443
   macro avg       0.93      0.67      0.74     16443
weighted avg       0.94      0.94      0.93     16443



In [None]:
XGBClassifier()

In [305]:
!pip install lightgbm


Collecting lightgbm
  Downloading lightgbm-4.5.0-py3-none-win_amd64.whl.metadata (17 kB)
Downloading lightgbm-4.5.0-py3-none-win_amd64.whl (1.4 MB)
   ---------------------------------------- 0.0/1.4 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.4 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.4 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.4 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.4 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.4 MB ? eta -:--:--
   ------- -------------------------------- 0.3/1.4 MB ? eta -:--:--
   ------- -------------------------------- 0.3/1.4 MB ? eta -:--:--
   ------- -------------------------------- 0.3/1.4 MB ? eta -:--:--
   ------- -------------------------------- 0.3/1.4 MB ? eta -:--:--
   -------------- ------------------------- 0.5/1.4 MB 254.3 kB/s eta 0:00:04
   -------------- ------------------------- 0.5/1.4 MB 254.3 kB/s eta 0:00:04
   ---

In [307]:
from lightgbm import LGBMClassifier


In [311]:
model_pipeline15 = Pipeline(
    steps=[
        ('preprocess', preprocess),
        ('model', LGBMClassifier())
    ]
)
model_pipeline15.fit(train_X, train_y)
evalution(train_X_ues, train_y_ues, val_X, val_y, model_pipeline15)

[LightGBM] [Info] Number of positive: 3335, number of negative: 35030
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004841 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 261
[LightGBM] [Info] Number of data points in the train set: 38365, number of used features: 58
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.086928 -> initscore=-2.351732
[LightGBM] [Info] Start training from score -2.351732
accuracy score train 0.6815592203898051
accuracy score test 0.9456911755762331
f1score train 0.5331868131868132
f1score test 0.5096101043382757
classification train 
               precision    recall  f1-score   support

           0       0.61      1.00      0.76      3335
           1       1.00      0.36      0.53      3335

    accuracy                           0.68      6670
   macro avg       0.80      0.68      0.65      6670
weigh

In [315]:
# Initialize the classifier

model_pipeline16 = Pipeline(
    steps=[
        ('preprocess', preprocess),
        ('model',LGBMClassifier(
                objective='binary',  # For binary classification
                boosting_type='gbdt',  # Gradient Boosting Decision Trees
                num_leaves=31,  # Maximum leaves in one tree
                learning_rate=0.1,  # Step size shrinkage
                n_estimators=100,  # Number of boosting iterations
                max_depth=-1,  # Unlimited depth
                random_state=42  # For reproducibility
            ))
    ]
)
model_pipeline16.fit(train_X, train_y)
evalution(train_X_ues, train_y_ues, val_X, val_y, model_pipeline16)


[LightGBM] [Info] Number of positive: 3335, number of negative: 35030
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005709 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 261
[LightGBM] [Info] Number of data points in the train set: 38365, number of used features: 58
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.086928 -> initscore=-2.351732
[LightGBM] [Info] Start training from score -2.351732
accuracy score train 0.6815592203898051
accuracy score test 0.9456911755762331
f1score train 0.5331868131868132
f1score test 0.5096101043382757
classification train 
               precision    recall  f1-score   support

           0       0.61      1.00      0.76      3335
           1       1.00      0.36      0.53      3335

    accuracy                           0.68      6670
   macro avg       0.80      0.68      0.65      6670
weigh

In [345]:
submission_data['is_promoted']=model_pipeline16.predict(test)
submission_data.to_csv('stacking_16.csv', index=False)

In [333]:
!pip install catboost




In [343]:
cat_features = [train_X.columns.get_loc(col) for col in cat_col]  # Indices of categorical columns

cat_features = [0, 1, 2]  # Replace with the indices of categorical columns in your dataset
from catboost import CatBoostClassifier

model_pipeline17 = Pipeline(
    steps=[
        ('preprocess', preprocess),
        ('model',CatBoostClassifier(
                iterations=1000,         # Number of boosting iterations
                learning_rate=0.1,       # Learning rate
                depth=6,                 # Depth of the trees
                cat_features=cat_features,  # Specify categorical features
                verbose=100              # Log output every 100 iterations
            ))
    ]
)
model_pipeline17.fit(train_X, train_y)
evalution(train_X, train_y, val_X, val_y, model_pipeline17)

CatBoostError: 'data' is scipy.sparse.spmatrix of floating point numerical type, it means no categorical features, but 'cat_features' parameter specifies nonzero number of categorical features

In [317]:


estimators = [
    ('m14', model_pipeline14),
    ('m16', model_pipeline16),
    ('m15', model_pipeline15)
]
stacking_model6 = StackingClassifier(estimators=estimators, final_estimator= GradientBoostingClassifier(criterion='friedman_mse',max_depth=20, max_leaf_nodes=20, min_samples_leaf=2, min_samples_split=2 ))


In [323]:
stacking_model6.fit(train_X, train_y)
evalution(train_X, train_y, val_X, val_y, stacking_model6)

[LightGBM] [Info] Number of positive: 3335, number of negative: 35030
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006888 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 261
[LightGBM] [Info] Number of data points in the train set: 38365, number of used features: 58
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.086928 -> initscore=-2.351732
[LightGBM] [Info] Start training from score -2.351732
[LightGBM] [Info] Number of positive: 3335, number of negative: 35030
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004705 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 261
[LightGBM] [Info] Number of data points in the train set: 38365, number of used features: 58
[LightGBM] [Info] [bin

In [321]:
submission_data['is_promoted']=stacking_model6.predict(test)
submission_data.to_csv('stacking_6.csv', index=False)

In [327]:
stacking_model6.fit(train_X_res, train_y_res)
evalution(train_X_res, train_y_res, val_X, val_y, stacking_model6)

[LightGBM] [Info] Number of positive: 35030, number of negative: 35030
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004335 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 262
[LightGBM] [Info] Number of data points in the train set: 70060, number of used features: 58
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 35030, number of negative: 35030
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005530 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 262
[LightGBM] [Info] Number of data points in the train set: 70060, number of used features: 58
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000

In [329]:
submission_data['is_promoted']=stacking_model6.predict(test)
submission_data.to_csv('stacking_6_c.csv', index=False)

In [374]:
import xgboost as xgb

from sklearn.preprocessing import OneHotEncoder

# Apply OneHotEncoding
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
train_X_encoded = encoder.fit_transform(train_X[cat_col])
val_X_encoded = encoder.transform(val_X[cat_col])

# Concatenate the encoded categorical features with the numerical features
train_X_processed = np.hstack([train_X.drop(columns=cat_col).values, train_X_encoded])
val_X_processed = np.hstack([val_X.drop(columns=cat_col).values, val_X_encoded])

# Create DMatrix
dtrain_reg = xgb.DMatrix(data=train_X_processed, label=train_y)
dtest_reg = xgb.DMatrix(data=val_X_processed, label=val_y)


# Parameters for XGBoost
params = {
    "objective": "reg:squarederror",  # Regression objective
    "tree_method": "gpu_hist"         # Use GPU-accelerated training
}
n = 5000  # Number of boosting rounds

# Evaluation datasets
evals = [(dtest_reg, "validation"), (dtrain_reg, "train")]

# Train the model
model_pipeline20 = xgb.train(
    params=params,
    dtrain=dtrain_reg,
    num_boost_round=n,
    evals=evals,
    verbose_eval=250  # Print evaluation results every 250 rounds
)


# Evaluate the model using a custom evaluation function
# Ensure `evaluation` is compatible with the trained model's API


[0]	validation-rmse:0.25585	train-rmse:0.26323
[250]	validation-rmse:0.22738	train-rmse:0.16679
[500]	validation-rmse:0.23499	train-rmse:0.14035
[750]	validation-rmse:0.24038	train-rmse:0.12210
[1000]	validation-rmse:0.24456	train-rmse:0.10882
[1250]	validation-rmse:0.24793	train-rmse:0.09891
[1500]	validation-rmse:0.25069	train-rmse:0.09034
[1750]	validation-rmse:0.25280	train-rmse:0.08285
[2000]	validation-rmse:0.25479	train-rmse:0.07619
[2250]	validation-rmse:0.25636	train-rmse:0.07088
[2500]	validation-rmse:0.25783	train-rmse:0.06604
[2750]	validation-rmse:0.25911	train-rmse:0.06154
[3000]	validation-rmse:0.26024	train-rmse:0.05756
[3250]	validation-rmse:0.26141	train-rmse:0.05429
[3500]	validation-rmse:0.26251	train-rmse:0.05085
[3750]	validation-rmse:0.26333	train-rmse:0.04834
[4000]	validation-rmse:0.26425	train-rmse:0.04573
[4250]	validation-rmse:0.26502	train-rmse:0.04346
[4500]	validation-rmse:0.26564	train-rmse:0.04144
[4750]	validation-rmse:0.26635	train-rmse:0.03946
[4999]

In [380]:
evalution(dtrain_reg, train_y, dtest_reg, val_y, model_pipeline20)



ValueError: Classification metrics can't handle a mix of binary and continuous targets

In [390]:
test_encoded = encoder.fit_transform(test[cat_col])
test_processed = np.hstack([test.drop(columns=cat_col).values, test_encoded])
dtest_reg = xgb.DMatrix(data=test_processed)

submission_data['is_promoted']=model_pipeline20.predict(dtest_reg)
submission_data.to_csv('new_20.csv', index=False)

XGBoostError: [06:20:00] C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0ed59c031377d09b8-1\xgboost\xgboost-ci-windows\src\learner.cc:1462: Check failed: learner_model_param_.num_feature >= p_fmat->Info().num_col_ (59 vs. 60) : Number of columns does not match number of features in booster.

In [392]:
result= model_pipeline20.predict(dtest_reg)

XGBoostError: [06:21:13] C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0ed59c031377d09b8-1\xgboost\xgboost-ci-windows\src\learner.cc:1462: Check failed: learner_model_param_.num_feature >= p_fmat->Info().num_col_ (59 vs. 60) : Number of columns does not match number of features in booster.

In [291]:
from xgboost import DMatrix, cv

# Apply the preprocessing pipeline to transform the data
train_X_processed = preprocess.fit_transform(train_X)
val_X_processed = preprocess.transform(val_X)

# Convert processed data into DMatrix for XGBoost
dtrain = DMatrix(data=train_X_processed, label=train_y)
dval = DMatrix(data=val_X_processed, label=val_y)

# Define parameters for XGBoost
params = {
    "objective": "binary:logistic",
    "colsample_bytree": 0.3,
    "learning_rate": 0.1,
    "max_depth": 5,
    "alpha": 10
}

# Perform cross-validation
xgb_cv = cv(
    dtrain=dtrain,
    params=params,
    nfold=3,
    num_boost_round=50,
    early_stopping_rounds=10,
    metrics="auc",
    as_pandas=True,
    seed=123
)

# Display the CV results
print(xgb_cv)


    train-auc-mean  train-auc-std  test-auc-mean  test-auc-std
0         0.541563       0.004294       0.534909      0.012564
1         0.556337       0.003203       0.546964      0.008797
2         0.671483       0.003757       0.667774      0.007603
3         0.675274       0.005464       0.670105      0.008465
4         0.677618       0.005862       0.671194      0.008574
5         0.688351       0.003370       0.682227      0.007512
6         0.688431       0.003049       0.683801      0.006426
7         0.746256       0.001452       0.739926      0.004276
8         0.745876       0.002041       0.738535      0.006585
9         0.754415       0.005617       0.746650      0.011540
10        0.755119       0.005687       0.747204      0.011883
11        0.763731       0.004085       0.755510      0.009353
12        0.806369       0.005321       0.799400      0.010688
13        0.815340       0.004726       0.808683      0.011034
14        0.828854       0.003782       0.822908      0

In [295]:
from xgboost import train, DMatrix

# Extract the optimal number of boosting rounds from the CV results
optimal_boost_rounds = len(xgb_cv)
print(f"Optimal Boost Rounds: {optimal_boost_rounds}")

# Train the final model using the full training data
final_model = train(
    params=params,
    dtrain=dtrain,
    num_boost_round=optimal_boost_rounds
)

# Save the model for future use
final_model.save_model("xgboost_final_model.json")

# Convert validation data into DMatrix
dval = DMatrix(data=val_X_processed)

# Make predictions on the train and validation datasets
train_predictions = final_model.predict(dtrain)
val_predictions = final_model.predict(dval)

# Convert probabilities to binary predictions (default threshold = 0.5)
threshold = 0.5
train_binary_preds = (train_predictions > threshold).astype(int)
val_binary_preds = (val_predictions > threshold).astype(int)

# Evaluate the performance on training and validation data
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Training set evaluation
print("Training Set Performance:")
print("Accuracy:", accuracy_score(train_y, train_binary_preds))
print("F1 Score:", f1_score(train_y, train_binary_preds))
print("Classification Report:\n", classification_report(train_y, train_binary_preds))

# Validation set evaluation
print("Validation Set Performance:")
print("Accuracy:", accuracy_score(val_y, val_binary_preds))
print("F1 Score:", f1_score(val_y, val_binary_preds))
print("Classification Report:\n", classification_report(val_y, val_binary_preds))


Optimal Boost Rounds: 50
Training Set Performance:
Accuracy: 0.9286589339241497
F1 Score: 0.3090128755364807
Classification Report:
               precision    recall  f1-score   support

           0       0.93      1.00      0.96     35030
           1       0.98      0.18      0.31      3335

    accuracy                           0.93     38365
   macro avg       0.95      0.59      0.64     38365
weighted avg       0.93      0.93      0.91     38365

Validation Set Performance:
Accuracy: 0.933588761174968
F1 Score: 0.314070351758794
Classification Report:
               precision    recall  f1-score   support

           0       0.93      1.00      0.97     15110
           1       0.97      0.19      0.31      1333

    accuracy                           0.93     16443
   macro avg       0.95      0.59      0.64     16443
weighted avg       0.94      0.93      0.91     16443



In [None]:
cv()

In [267]:
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Initialize StratifiedShuffleSplit
sss = StratifiedShuffleSplit(n_splits=3, test_size=0.3, random_state=42)

# Your model pipeline
model_pipeline12 = Pipeline(
    steps=[
        ('preprocess', preprocess),  # Replace with your preprocessing pipeline
        ('model', XGBClassifier(
            colsample_bytree=0.6,
            gamma=0,
            learning_rate=0.5,
            max_depth=2,
            n_estimators=1000,
            subsample=0.6
        ))
    ]
)

# Iterate through the splits
split_results = []
for train_index, test_index in sss.split(train_X, train_y):
    # Use .iloc for DataFrame/Series indexing
    X_train, X_test = train_X.iloc[train_index], train_X.iloc[test_index]
    y_train, y_test = train_y.iloc[train_index], train_y.iloc[test_index]
    
    # Fit the model on the training data
    model_pipeline12.fit(X_train, y_train)
    
    # Evaluate on the test data
    y_pred_train = model_pipeline12.predict(X_train)
    y_pred_test = model_pipeline12.predict(X_test)
    
    # Calculate metrics
    train_f1 = f1_score(y_train, y_pred_train, average='weighted')
    test_f1 = f1_score(y_test, y_pred_test, average='weighted')
    train_accuracy = accuracy_score(y_train, y_pred_train)
    test_accuracy = accuracy_score(y_test, y_pred_test)
    
    # Save results for this split
    split_results.append({
        'train_f1': train_f1,
        'test_f1': test_f1,
        'train_accuracy': train_accuracy,
        'test_accuracy': test_accuracy,
        'classification_report': classification_report(y_test, y_pred_test)
    })

# Display results
for i, result in enumerate(split_results):
    print(f"Split {i + 1}:")
    print(f"Train Accuracy: {result['train_accuracy']:.4f}, Test Accuracy: {result['test_accuracy']:.4f}")
    print(f"Train F1: {result['train_f1']:.4f}, Test F1: {result['test_f1']:.4f}")
    print("Classification Report:\n", result['classification_report'])
    print("----")


Split 1:
Train Accuracy: 0.9450, Test Accuracy: 0.9368
Train F1: 0.9358, Test F1: 0.9268
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.99      0.97     10509
           1       0.78      0.38      0.51      1001

    accuracy                           0.94     11510
   macro avg       0.86      0.69      0.74     11510
weighted avg       0.93      0.94      0.93     11510

----
Split 2:
Train Accuracy: 0.9442, Test Accuracy: 0.9384
Train F1: 0.9342, Test F1: 0.9285
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.99      0.97     10509
           1       0.80      0.39      0.52      1001

    accuracy                           0.94     11510
   macro avg       0.87      0.69      0.75     11510
weighted avg       0.93      0.94      0.93     11510

----
Split 3:
Train Accuracy: 0.9451, Test Accuracy: 0.9374
Train F1: 0.9358, Test F1: 0.9265
Classification Report

In [269]:
submission_data['is_promoted']=model_pipeline12.predict(test)
submission_data.to_csv('new_9.csv', index=False)