In [1]:
# Data Ingestion Steps
import pandas as pd
df = pd.read_csv('data/Gender.csv')
df.head()

Unnamed: 0,Favorite Color,Favorite Music Genre,Favorite Beverage,Favorite Soft Drink,Gender
0,Cool,Rock,Vodka,7UP/Sprite,F
1,Neutral,Hip hop,Vodka,Coca Cola/Pepsi,F
2,Warm,Rock,Wine,Coca Cola/Pepsi,F
3,Warm,Folk/Traditional,Whiskey,Fanta,F
4,Cool,Rock,Vodka,Coca Cola/Pepsi,F


In [2]:
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, LabelEncoder
X = df.drop(labels=['Gender'], axis = 1)
ans = df[['Gender']]
label_encoder = LabelEncoder()
Y = label_encoder.fit_transform(ans)

  y = column_or_1d(y, warn=True)


In [3]:
categorical_cols = X.select_dtypes(include='object').columns
numerical_cols = X.select_dtypes(exclude='object').columns

In [4]:
# Define the custom ranking for each ordinal variable
Color_categories = ['Neutral', 'Warm', 'Cool']
Music_categories = ['Jazz/Blues', 'Folk/Traditional', 'R&B and soul', 'Electronic', 'Hip hop', 'Rock', 'Pop']
Beverage_categories = ['Vodka', 'Whiskey', 'Wine', 'Other', 'Beer', "Doesn't drink"]
Drink_categories = ['Coca Cola/Pepsi', 'Fanta', '7UP/Sprite', 'other']


In [5]:
! pip install scikit-learn



In [6]:
from sklearn.impute import SimpleImputer ## Handling missing values
from sklearn.preprocessing import StandardScaler ## Handling Feature scaling
from sklearn.preprocessing import OrdinalEncoder ## Ordinal Encoding
# pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [7]:
# Numerical Pipeline
num_pipeline = Pipeline(
             steps=[
              ('imputer', SimpleImputer(strategy='median')),
               ('scaler', StandardScaler())   
          
             ]    
) 

# Categorical Pipeline

cat_pipeline = Pipeline(
              steps=[
               ('imputer', SimpleImputer(strategy='most_frequent')),
               ('ordinal_encoder', OrdinalEncoder(categories=[
                 Color_categories,Music_categories, Beverage_categories, Drink_categories],
                handle_unknown='use_encoded_value',unknown_value=-1)),
               ('scaler', StandardScaler())
              ]
     
)

preprocessor = ColumnTransformer([
('num_pipeline', num_pipeline, numerical_cols),
('cat_pipline', cat_pipeline, categorical_cols)
])


In [8]:
# Train Test split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, random_state=42)


In [9]:
X_train =pd.DataFrame(preprocessor.fit_transform(X_train), columns=preprocessor.get_feature_names_out())
X_test =pd.DataFrame(preprocessor.transform(X_test), columns=preprocessor.get_feature_names_out())

In [10]:
X_train.head()

Unnamed: 0,cat_pipline__Favorite Color,cat_pipline__Favorite Music Genre,cat_pipline__Favorite Beverage,cat_pipline__Favorite Soft Drink
0,0.708881,0.998651,0.704348,-0.436564
1,0.708881,0.998651,1.272769,-0.436564
2,0.708881,0.464489,-1.569337,-0.436564
3,0.708881,0.464489,1.272769,-1.552228
4,-2.551972,0.464489,1.272769,-0.436564


In [11]:
X_test.head()

Unnamed: 0,cat_pipline__Favorite Color,cat_pipline__Favorite Music Genre,cat_pipline__Favorite Beverage,cat_pipline__Favorite Soft Drink
0,0.708881,-0.603835,1.272769,0.6791
1,0.708881,-0.069673,0.704348,-0.436564
2,0.708881,0.464489,-1.569337,1.794764
3,0.708881,-0.069673,0.704348,-0.436564
4,-0.921546,-2.206321,1.272769,0.6791


In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

In [13]:
regression = LogisticRegression()
regression.fit(X_train, y_train)

In [14]:
! pip install xgboost




In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import numpy as np
import pandas as pd

# ✅ Evaluation Function (Percent Metrics)
def evaluate_model(true, predicted):
    accuracy = accuracy_score(true, predicted) * 100
    precision = precision_score(true, predicted) * 100
    recall = recall_score(true, predicted) * 100
    f1 = f1_score(true, predicted) * 100
    return accuracy, precision, recall, f1

# ✅ Define Your Models
models = {
    'LogisticRegression': LogisticRegression(),
    'RandomForestClassifier': RandomForestClassifier(),
    'XGBClassifier': XGBClassifier(eval_metric='logloss')  # Removed use_label_encoder
}

# ✅ Store Results
model_list = []
results = []

# ✅ Train & Evaluate
for model_name, model in models.items():
    try:
        print(f"\nTraining model: {model_name}")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        accuracy, precision, recall, f1 = evaluate_model(y_test, y_pred)

        model_list.append(model_name)
        results.append({
            'Model': model_name,
            'Accuracy (%)': accuracy,
            'Precision (%)': precision,
            'Recall (%)': recall,
            'F1 Score (%)': f1
        })

        print(f"✔️ {model_name} added to model_list ✅")
        print("Accuracy: {:.2f}%".format(accuracy))
        print("Precision: {:.2f}%".format(precision))
        print("Recall: {:.2f}%".format(recall))
        print("F1 Score: {:.2f}%".format(f1))
        print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
        print("Classification Report:\n", classification_report(y_test, y_pred))

    except Exception as e:
        print(f"❌ Error training model {model_name}: {e}")

# ✅ Summary Table
results_df = pd.DataFrame(results)
print("\n📊 Summary Table:")
print(results_df)

# ✅ Confirm Model List
print("\n📋 Models Trained:", model_list)



Training model: LogisticRegression
✔️ LogisticRegression added to model_list ✅
Accuracy: 60.00%
Precision: 54.55%
Recall: 66.67%
F1 Score: 60.00%
Confusion Matrix:
 [[6 5]
 [3 6]]
Classification Report:
               precision    recall  f1-score   support

           0       0.67      0.55      0.60        11
           1       0.55      0.67      0.60         9

    accuracy                           0.60        20
   macro avg       0.61      0.61      0.60        20
weighted avg       0.61      0.60      0.60        20


Training model: RandomForestClassifier
✔️ RandomForestClassifier added to model_list ✅
Accuracy: 55.00%
Precision: 50.00%
Recall: 55.56%
F1 Score: 52.63%
Confusion Matrix:
 [[6 5]
 [4 5]]
Classification Report:
               precision    recall  f1-score   support

           0       0.60      0.55      0.57        11
           1       0.50      0.56      0.53         9

    accuracy                           0.55        20
   macro avg       0.55      0.55    

In [16]:
model_list

['LogisticRegression', 'RandomForestClassifier', 'XGBClassifier']