In [11]:
# Import the pandas library for data manipulation and analysis
import pandas as pd
# Import the numpy library, which is fundamental for scientific computing in Python
import numpy as np
# Import the SimpleImputer class from sklearn, which provides basic strategies for imputing missing values
from sklearn.impute import SimpleImputer
# Import SGDClassifier from sklearn, which is a linear classifier 
# (SVM or logistic regression) optimized using Stochastic Gradient Descent
from sklearn.linear_model import SGDClassifier
# Import the cross_val_score function for cross-validation.
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler


# Read the preprocessed training data from a CSV file into a DataFrame
train_processed = pd.read_csv("dataset/train_processed.csv")

# Read the preprocessed test data from a CSV file into a DataFrame
test_processed = pd.read_csv("dataset/test_processed.csv")

# Separate the features (X) and the target variable (y) for the training data
# Drop the 'Transported' column from the training data to form the feature set
X_train = train_processed.drop("Transported", axis=1)

# Extract the 'Transported' column as the target variable for the training data
y_train = train_processed["Transported"]

# Copy the test dataset into a new variable (this will be used for making predictions)
X_test = test_processed.copy()

# Create an instance of SimpleImputer with the strategy to replace missing values using the mean of each column
# The missing_values parameter specifies what value will be considered as missing, in this case 'np.nan' which represents NaN values in numpy
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')

# Apply the imputer to the training data
# This involves calculating the mean of each column in X_train (fit) and then replacing missing values in X_train with these means (transform)
# The result is stored in X_train_tr, which now contains the training data with missing values imputed
X_train_tr = imp_mean.fit_transform(X_train)

# Convert the transformed training data (after imputation) back into a DataFrame.
# This step is necessary because the imputer returns a numpy array. 
# We preserve the original column names and indices from X_train.
X_train = pd.DataFrame(X_train_tr, columns = X_train.columns, index = X_train.index)


In [12]:
# Initialize an instance of MinMaxScaler.
# MinMaxScaler is a method to scale the features to a specific range, typically [0, 1].
scaler = MinMaxScaler()

# Fit the scaler to the training data (X_train).
# This step calculates the minimum and maximum values of each feature in X_train, 
# which will be used to scale the data.
scaler.fit(X_train)

# Transform the training data using the fitted scaler.
# This scales each feature in X_train to the range [0, 1] using the min and max values computed earlier.
X_train_scaled = scaler.transform(X_train)

# Convert the scaled data (which is in the form of a numpy array) back to a pandas DataFrame.
# This step is necessary because scaling operations often return numpy arrays rather than DataFrames.
# The original column names and indices from X_train are used to ensure that the structure of the data remains consistent.
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)


In [29]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier

# Define a Pipeline
pipe = Pipeline([
    ('scaler', StandardScaler()),  # Placeholder for the scaler
    ('classifier', SGDClassifier())
])

# Define parameter grid
param_grid = [
    # {
    #     'scaler': [StandardScaler(), MinMaxScaler()],
    #     'classifier': [SGDClassifier()],
    #     'classifier__max_iter': [900, 1000, 2000],
    #     'classifier__tol': [1e-3, 1e-4, 1e-5]
    # },
    {
        'scaler': [StandardScaler(), MinMaxScaler()],
        'classifier': [RandomForestClassifier()],
        'classifier__n_estimators': list(range(0, 200, 50)),  # Example parameters for RandomForestClassifier
        'classifier__max_depth': list(range(0, 30, 10))
    }
]
# Set up Grid Search
grid_search = GridSearchCV(pipe, param_grid, cv=3, scoring='accuracy')

# Fit on your data
grid_search.fit(X_train, y_train)

# Get the best combination of preprocessing steps and classifier parameters
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)


36 fits failed out of a total of 72.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
24 fits failed with the following error:
Traceback (most recent call last):
  File "/root/anaconda3/envs/learn/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/root/anaconda3/envs/learn/lib/python3.10/site-packages/sklearn/base.py", line 1351, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/root/anaconda3/envs/learn/lib/python3.10/site-packages/sklearn/pipeline.py", line 475, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/root/anaconda3/envs/learn/lib/python3.10/site-packages/sklearn/base.py", 

Best parameters: {'classifier': RandomForestClassifier(), 'classifier__max_depth': 10, 'classifier__n_estimators': 50, 'scaler': StandardScaler()}
Best score: 0.7970800886410738


### Training a Binary Classifier

In [13]:
# Assuming X_train and y_train are previously defined and properly formatted
# Initialize the SGDClassifier with specified parameters.
# max_iter is the maximum number of passes over the training data,
# tol is the stopping criterion, and random_state ensures reproducibility.
sgd_clf = SGDClassifier(max_iter=1000, tol=1e-3, random_state=42)

# Fit the model to the training data (both features and target variable)
sgd_clf.fit(X_train, y_train)


# Perform 3-fold cross-validation to evaluate the accuracy of the classifier.
# This process divides the dataset into 3 parts, trains the model on 2 parts and 
# tests on the 3rd part, this is repeated 3 times each with a different part as the test set.
cross_val_score(sgd_clf, X_train, y_train, cv=3, scoring="accuracy")


array([0.7615597 , 0.76121463, 0.77528478])

### Performance Measures


##### Measuring Accuracy Using Cross-Validation


In [14]:
# Import StratifiedKFold for stratified sampling to ensure representative ratio of each class.
from sklearn.model_selection import StratifiedKFold

# Import clone to make deep copies of the SGD classifier without copying attached data.
from sklearn.base import clone

# Create a StratifiedKFold object for cross-validation, with 3 splits.
# The data is shuffled for each fold, and a random state is set for reproducibility.
skfolds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Loop over each fold created by the StratifiedKFold object.
# `train_index` and `test_index` are arrays of indices for the training and test set respectively.
for train_index, test_index in skfolds.split(X_train, y_train):
    # Clone the original SGDClassifier. This ensures the classifier is fresh for each fold.
    clone_clf = clone(sgd_clf)

    # Create training and test sets for this fold using the provided indices.
    X_train_folds = X_train.iloc[train_index]
    y_train_folds = y_train.iloc[train_index]
    X_test_fold = X_train.iloc[test_index]
    y_test_fold = y_train.iloc[test_index]

    # Fit the cloned classifier on the training part of the fold.
    clone_clf.fit(X_train_folds, y_train_folds)

    # Predict on the test part of the fold.
    y_pred = clone_clf.predict(X_test_fold)

    # Count the number of correct predictions.
    n_correct = sum(y_pred == y_test_fold)

    # Calculate and print the accuracy for this fold.
    print(n_correct / len(y_pred))


0.766735679779158
0.7712215320910973
0.7770107007248878


### Confusion Matrix

In [15]:
# Import the cross_val_predict function from sklearn.model_selection.
from sklearn.model_selection import cross_val_predict

# Generate cross-validated estimates for each input data point.
# The function is similar to cross_val_score, but instead of returning the evaluation scores,
# it returns the predictions made on each test fold.
# This means that for each element in the input, a prediction is made by a model trained on the
# rest of the data. Here, it uses the SGD classifier, 3-fold cross-validation (cv=3),
# and the training data (X_train, y_train).
y_train_pred = cross_val_predict(sgd_clf, X_train, y_train, cv=3)

In [16]:
# Import the confusion_matrix function from sklearn.metrics.
from sklearn.metrics import confusion_matrix

# Create a confusion matrix, which is a summary of prediction results on a classification problem.
# The number of correct and incorrect predictions are summarized with count values and broken down by each class.
# This is comparing the true target values (y_train) against the predictions made by the model (y_train_pred).
# The matrix gives insights into the types of errors being made by the classifier.
cm = confusion_matrix(y_train, y_train_pred)

### Precision and Recall

In [17]:
# Import precision_score and recall_score from sklearn.metrics.
from sklearn.metrics import precision_score, recall_score

# Calculate the precision of the model.
# Precision is the ratio of true positives (correct positive predictions) to the total number of positive predictions made.
# In other words, it answers the question: "Out of all the instances the classifier predicted as positive,
# how many were actually positive?".
# This is computed by comparing the true labels (y_train) with the predicted labels (y_train_pred).
precision_score(y_train, y_train_pred)


0.8191721132897604

In [18]:
# Calculate the recall score of the model.
# Recall, also known as sensitivity or true positive rate, is the ratio of true positives 
# to the sum of true positives and false negatives.
# In other words, it answers the question: "Out of all the actual positive instances, 
# how many did the classifier correctly identify as positive?".
# This is computed by comparing the true labels (y_train) with the predicted labels (y_train_pred).
recall_score(y_train, y_train_pred)


0.6870717222476016

In [19]:
# Import the f1_score function from sklearn.metrics.
from sklearn.metrics import f1_score

# Calculate the F1 score for the model.
# The F1 score is the harmonic mean of precision and recall. 
# It provides a single score that balances both the concerns of precision and recall in one number.
# An F1 score reaches its best value at 1 (perfect precision and recall) and worst at 0.
# It is particularly useful when you need to balance precision and recall, 
# and is especially useful in uneven class distribution scenarios where one class is rare.
# This is computed by comparing the true labels (y_train) with the predicted labels (y_train_pred).
f1_score(y_train, y_train_pred)


0.7473291925465838

In [20]:
from sklearn.metrics import accuracy_score

accuracy_score(y_train, y_train_pred)

0.7660186356838836