In [217]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report
import xgboost as xgb
from xgboost import XGBClassifier


In [218]:
df=pd.read_csv('BMW_Car_Sales_Classification.csv')
df

Unnamed: 0,Model,Year,Region,Color,Fuel_Type,Transmission,Engine_Size_L,Mileage_KM,Price_USD,Sales_Volume,Sales_Classification
0,5 Series,2016,Asia,Red,Petrol,Manual,3.5,151748,98740,8300,High
1,i8,2013,North America,Red,Hybrid,Automatic,1.6,121671,79219,3428,Low
2,5 Series,2022,North America,Blue,Petrol,Automatic,4.5,10991,113265,6994,Low
3,X3,2024,Middle East,Blue,Petrol,Automatic,1.7,27255,60971,4047,Low
4,7 Series,2020,South America,Black,Diesel,Manual,2.1,122131,49898,3080,Low
...,...,...,...,...,...,...,...,...,...,...,...
49995,i3,2014,Asia,Red,Hybrid,Manual,4.6,151030,42932,8182,High
49996,i3,2023,Middle East,Silver,Electric,Manual,4.2,147396,48714,9816,High
49997,5 Series,2010,Middle East,Red,Petrol,Automatic,4.5,174939,46126,8280,High
49998,i3,2020,Asia,White,Electric,Automatic,3.8,3379,58566,9486,High


In [219]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Model                 50000 non-null  object 
 1   Year                  50000 non-null  int64  
 2   Region                50000 non-null  object 
 3   Color                 50000 non-null  object 
 4   Fuel_Type             50000 non-null  object 
 5   Transmission          50000 non-null  object 
 6   Engine_Size_L         50000 non-null  float64
 7   Mileage_KM            50000 non-null  int64  
 8   Price_USD             50000 non-null  int64  
 9   Sales_Volume          50000 non-null  int64  
 10  Sales_Classification  50000 non-null  object 
dtypes: float64(1), int64(4), object(6)
memory usage: 4.2+ MB


In [220]:
df.isnull().sum()

Model                   0
Year                    0
Region                  0
Color                   0
Fuel_Type               0
Transmission            0
Engine_Size_L           0
Mileage_KM              0
Price_USD               0
Sales_Volume            0
Sales_Classification    0
dtype: int64

In [221]:
df.duplicated().sum()

np.int64(0)

In [222]:
df['Model'].value_counts()

Model
7 Series    4666
i3          4618
i8          4606
3 Series    4595
5 Series    4592
X1          4570
X3          4497
X5          4487
M5          4478
X6          4478
M3          4413
Name: count, dtype: int64

In [223]:
df['Region'].value_counts()

Region
Asia             8454
Middle East      8373
North America    8335
Europe           8334
Africa           8253
South America    8251
Name: count, dtype: int64

In [224]:
df['Fuel_Type'].value_counts()

Fuel_Type
Hybrid      12716
Petrol      12550
Electric    12471
Diesel      12263
Name: count, dtype: int64

In [225]:
df['Color'].value_counts()

Color
Red       8463
Silver    8350
Grey      8348
White     8304
Black     8273
Blue      8262
Name: count, dtype: int64

In [226]:
df['Transmission'].value_counts()

Transmission
Manual       25154
Automatic    24846
Name: count, dtype: int64

In [227]:
df['Sales_Classification'] = df['Sales_Classification'].map({'Low': 0, 'High': 1})
df

Unnamed: 0,Model,Year,Region,Color,Fuel_Type,Transmission,Engine_Size_L,Mileage_KM,Price_USD,Sales_Volume,Sales_Classification
0,5 Series,2016,Asia,Red,Petrol,Manual,3.5,151748,98740,8300,1
1,i8,2013,North America,Red,Hybrid,Automatic,1.6,121671,79219,3428,0
2,5 Series,2022,North America,Blue,Petrol,Automatic,4.5,10991,113265,6994,0
3,X3,2024,Middle East,Blue,Petrol,Automatic,1.7,27255,60971,4047,0
4,7 Series,2020,South America,Black,Diesel,Manual,2.1,122131,49898,3080,0
...,...,...,...,...,...,...,...,...,...,...,...
49995,i3,2014,Asia,Red,Hybrid,Manual,4.6,151030,42932,8182,1
49996,i3,2023,Middle East,Silver,Electric,Manual,4.2,147396,48714,9816,1
49997,5 Series,2010,Middle East,Red,Petrol,Automatic,4.5,174939,46126,8280,1
49998,i3,2020,Asia,White,Electric,Automatic,3.8,3379,58566,9486,1


In [228]:
df['Car_Age'] = 2025 - df['Year']
df

Unnamed: 0,Model,Year,Region,Color,Fuel_Type,Transmission,Engine_Size_L,Mileage_KM,Price_USD,Sales_Volume,Sales_Classification,Car_Age
0,5 Series,2016,Asia,Red,Petrol,Manual,3.5,151748,98740,8300,1,9
1,i8,2013,North America,Red,Hybrid,Automatic,1.6,121671,79219,3428,0,12
2,5 Series,2022,North America,Blue,Petrol,Automatic,4.5,10991,113265,6994,0,3
3,X3,2024,Middle East,Blue,Petrol,Automatic,1.7,27255,60971,4047,0,1
4,7 Series,2020,South America,Black,Diesel,Manual,2.1,122131,49898,3080,0,5
...,...,...,...,...,...,...,...,...,...,...,...,...
49995,i3,2014,Asia,Red,Hybrid,Manual,4.6,151030,42932,8182,1,11
49996,i3,2023,Middle East,Silver,Electric,Manual,4.2,147396,48714,9816,1,2
49997,5 Series,2010,Middle East,Red,Petrol,Automatic,4.5,174939,46126,8280,1,15
49998,i3,2020,Asia,White,Electric,Automatic,3.8,3379,58566,9486,1,5


In [229]:
X = df.drop(['Sales_Classification', 'Sales_Volume','Year'], axis=1)
y = df['Sales_Classification']

In [230]:
counts = y.value_counts()
scale_pos_weight = counts[0] / counts[1]
print(f"Using scale_pos_weight: {scale_pos_weight:.2f}")

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

categorical_features = ['Model', 'Region', 'Color', 'Fuel_Type', 'Transmission']
numerical_features = ['Engine_Size_L', 'Mileage_KM', 'Price_USD', 'Car_Age']

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('num', 'passthrough', numerical_features)
    ])

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', XGBClassifier(
        random_state=42,
        scale_pos_weight=scale_pos_weight,
        objective='binary:logistic',
        eval_metric='logloss',
        use_label_encoder=False
    )) 
])

param_grid = {
    'model__n_estimators': [100, 200, 300],
    'model__learning_rate': [0.01, 0.05, 0.1],
    'model__max_depth': [3, 5, 7, 10],
    'model__subsample': [0.7, 0.8, 1.0],
    'model__colsample_bytree': [0.7, 0.8, 1.0]
}

print("Starting FINAL hyperparameter tuning (XGBoost)...")

rand_search = RandomizedSearchCV(
    estimator=pipeline, 
    param_distributions=param_grid, 
    n_iter=10, 
    cv=5, 
    scoring='f1',
    random_state=42, 
    n_jobs=-1,
    verbose=1 
)

rand_search.fit(X_train, y_train)

print("\n--- Final Tuning Complete ---")
print(f"Best F1-Score (for 'High') found: {rand_search.best_score_:.4f}")
print("\nBest parameters found:")
print(rand_search.best_params_)

best_model = rand_search.best_estimator_
y_pred = best_model.predict(X_test)

print("\n--- Final Tuned Model Evaluation on Test Set ---")
print(classification_report(y_test, y_pred, target_names=['Low (0)', 'High (1)']))

Using scale_pos_weight: 2.28
Starting FINAL hyperparameter tuning (XGBoost)...
Fitting 5 folds for each of 10 candidates, totalling 50 fits

--- Final Tuning Complete ---
Best F1-Score (for 'High') found: 0.3746

Best parameters found:
{'model__subsample': 0.7, 'model__n_estimators': 100, 'model__max_depth': 3, 'model__learning_rate': 0.01, 'model__colsample_bytree': 0.8}

--- Final Tuned Model Evaluation on Test Set ---
              precision    recall  f1-score   support

     Low (0)       0.69      0.50      0.58      6951
    High (1)       0.30      0.49      0.37      3049

    accuracy                           0.50     10000
   macro avg       0.49      0.49      0.48     10000
weighted avg       0.57      0.50      0.52     10000



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [231]:
# counts = y.value_counts()
# count_negative = counts[0]
# count_positive = counts[1]

# scale_pos_weight = count_negative / count_positive
# print(f"Correct scale_pos_weight (boosting 'High' class): {scale_pos_weight:.2f}")

# X_train, X_test, y_train, y_test = train_test_split(
#     X, 
#     y,
#     test_size=0.2,
#     stratify=y,
#     random_state=42
# )

# categorical_features = ['Model', 'Region', 'Color', 'Fuel_Type', 'Transmission']
# numerical_features = ['Engine_Size_L', 'Mileage_KM', 'Price_USD', 'Car_Age']

# preprocessor = ColumnTransformer(
#     transformers=[
#         ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features),
#         ('num', 'passthrough', numerical_features)
#     ],
#     remainder='passthrough'
# )

# X_train_processed = preprocessor.fit_transform(X_train)
# X_test_processed = preprocessor.transform(X_test)

# print("\nTraining XGBoost classifier with correct weights...")

# cat_names = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)
# all_feature_names = np.concatenate([cat_names, numerical_features])

# model = xgb.XGBClassifier(
#     objective='binary:logistic',
#     eval_metric='logloss',
#     scale_pos_weight=scale_pos_weight,
#     use_label_encoder=False,
#     random_state=42,
#     feature_names=list(all_feature_names)
# )

# model.fit(X_train_processed, y_train)

# y_pred = model.predict(X_test_processed)

# print("\n--- Final Model Evaluation (Corrected) ---")
# print(classification_report(y_test, y_pred, target_names=['Low (0)', 'High (1)']))