In [1]:
import pandas as pd
import os

def load_dataset(file_path=None, target_column=None):
    """
    Load a CSV file and split it into features and target.

    Args:
        file_path (str): Path to the CSV file. If None, the notebook-level
                         name DEFAULT_FILENAME is used; that name must be
                         defined before calling without a path.
        target_column (str): Name of the target variable column.
                             If None, the LAST column is used as the target
                             and a note is printed.

    Returns:
        X (pd.DataFrame): All columns except the target.
        y (pd.Series): The target column.
        df (pd.DataFrame): The full dataframe as read from disk.

    Raises:
        NameError: file_path is None and DEFAULT_FILENAME is not defined.
        FileNotFoundError: file_path does not exist.
        Exception: pandas failed to read/parse the file; the underlying
                   error is attached as ``__cause__``.
        ValueError: target_column is not a column of the loaded data.
    """
    # 1. Determine file path
    if file_path is None:
        try:
            file_path = DEFAULT_FILENAME
        except NameError:
            # Same exception type as before, but with an actionable message
            # instead of the bare "name 'DEFAULT_FILENAME' is not defined".
            raise NameError(
                "❌ Error: No file_path given and DEFAULT_FILENAME is not defined. "
                "Pass file_path explicitly or define DEFAULT_FILENAME first."
            ) from None

    # 2. Fail early with a clear message if the path is missing
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"❌ Error: The file '{file_path}' was not found in the directory.")

    # 3. Load CSV
    try:
        df = pd.read_csv(file_path)
    except Exception as e:
        # Chain explicitly so the original pandas error stays available as
        # __cause__ for debugging.
        raise Exception(f"❌ Error reading CSV file: {e}") from e

    # 4. Identify target column
    if target_column is None:
        # Default behavior: assume the LAST column is the target
        target_column = df.columns[-1]
        print(f"ℹ️ Note: No target column specified. Using last column: '{target_column}'")

    if target_column not in df.columns:
        raise ValueError(f"❌ Error: Target column '{target_column}' not found in dataset.")

    # 5. Split features (X) and target (y); df itself is left unmodified
    X = df.drop(columns=[target_column])
    y = df[target_column]

    return X, y, df

# Read the training CSV; no target column is named, so load_dataset falls
# back to the last column and prints a note saying which one it picked.
X, Y, df = load_dataset(file_path='../data/train_dataset.csv')

ℹ️ Note: No target column specified. Using last column: 'Class'


In [2]:
# Preview the first five rows of the full frame (features plus target).
df.head(n=5)

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V33,V34,V35,V36,V37,V38,V39,V40,V41,Class
0,4.732,3.0634,0,0,0,0,0,28.6,4,4,...,0,0,0,3.426,2.278,0,7.977,0,0,1
1,5.262,4.7497,4,0,0,0,0,23.1,3,4,...,0,0,0,4.416,2.831,0,9.522,0,3,1
2,4.641,3.0954,0,0,4,0,2,57.1,0,0,...,0,2,0,3.726,2.833,2,8.435,0,0,1
3,5.072,2.4738,0,0,0,0,4,50.0,0,3,...,2,0,2,3.827,2.708,0,8.803,0,0,1
4,4.679,2.8711,0,0,0,0,2,41.7,2,0,...,0,0,0,3.627,1.833,0,8.015,0,0,1


In [3]:
# Sanity check of the split: feature matrix dimensions, then the target's name.
print(X.shape, Y.name, sep='\n')

(844, 41)
Class


In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import pickle 

# One seed shared by every estimator that accepts random_state, so that
# re-running this cell produces the same fitted models. Previously only
# LogisticRegression was seeded; the tree and forest were not.
RANDOM_STATE = 42

# Output pickle filename -> unfitted estimator. Insertion order matches the
# original cell, so after the loop `model` is still the random forest.
estimators = {
    'logistic_regression.pkl': LogisticRegression(max_iter=3000, random_state=RANDOM_STATE),
    'decision_tree.pkl': DecisionTreeClassifier(random_state=RANDOM_STATE),
    'k_nearest_neighbor.pkl': KNeighborsClassifier(),
    'naive_bayes.pkl': GaussianNB(),
    'random_forest.pkl': RandomForestClassifier(random_state=RANDOM_STATE),
}

# Fit each estimator on the full training data and write it to the
# current working directory.
for pickle_path, model in estimators.items():
    model.fit(X, Y)

    with open(pickle_path, 'wb') as f:
        pickle.dump(model, f)


In [None]:
# LabelEncoder is not imported by any visible cell, so this cell raised
# NameError on a fresh kernel; import it here so the cell runs on its own.
from sklearn.preprocessing import LabelEncoder

# NOTE(review): `le` is created but never applied to Y in this cell. Some
# XGBoost releases expect class labels 0..n_classes-1 — confirm that Y
# already satisfies this, or encode it with `le` before fitting.
le = LabelEncoder()

model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
model.fit(X, Y)

# Persist the fitted booster alongside the other model pickles.
with open('xgboost.pkl', 'wb') as f:
    pickle.dump(model, f)