In [None]:
# load packages
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split
import rpy2 as R


In [None]:

# Cleaning and Imputation

# --- Configuration: fill in both placeholders before running this cell ---
# Put the data file in the same directory as this notebook and set its name here.
Hackathon_data_path = ""
# Name of the column to predict. It is used for both X and y below, so it only has to be changed here.
TARGET_COLUMN = "target_column"

# Fail early with an actionable message instead of letting the reader choke on an empty path.
if not Hackathon_data_path:
    raise FileNotFoundError(
        "Hackathon_data_path is empty - set it to the name of the data file before running this cell."
    )

# load data
# Swap pd.read_csv for the reader that matches the file type (e.g. pd.read_excel).
hackathon_data_uncleaned = pd.read_csv(Hackathon_data_path)
# Tip: open the file in Data Wrangler to view the data.

# clean the data here (e.g. drop columns, rename columns, etc.)
# Start from a copy so the raw frame stays untouched and this cell can be re-run safely.
cleaned_hackathon_data = hackathon_data_uncleaned.copy()  # replace this with the cleaned data

# label the features and target variable
if TARGET_COLUMN not in cleaned_hackathon_data.columns:
    raise KeyError(
        f"TARGET_COLUMN={TARGET_COLUMN!r} is not a column of the cleaned data; "
        "set TARGET_COLUMN to the name of the target variable."
    )
X = cleaned_hackathon_data.drop(columns=[TARGET_COLUMN])
y = cleaned_hackathon_data[TARGET_COLUMN]

# split the data into training and testing sets (80/20, seeded so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)


In [None]:

# Imputation methods
# The experimental flag has to be imported before IterativeImputer can be imported.
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer, SimpleImputer, KNNImputer

# Three interchangeable imputers, from simplest to most involved.
# Compare them on your data: the choice of imputation method can noticeably change downstream results.

# Fills each missing value with a per-column statistic.
simple_imputer = SimpleImputer(
    strategy="mean",  # replace "mean" with the desired imputation strategy (e.g. "median", "most_frequent", etc.)
)

# Fills each missing value from the nearest rows.
knn_imputer = KNNImputer(
    n_neighbors=5,  # replace 5 with the desired number of neighbors for KNN imputation
)

# Models each feature with missing values as a function of the other features, in a round-robin
# fashion (similar to MICE in R). It can be more accurate than mean/median imputation when features
# are related, but it is computationally intensive on large datasets or with heavy missingness;
# consider a simpler imputer or dimensionality reduction first in those cases.
iterative_imputer = IterativeImputer(random_state=0)



In [None]:
# Pipelines!
# Build one pipeline that chains the preprocessing steps (transforming, encoding, imputing) with the
# model, so that every step is fit on the training data and then re-applied unchanged to the test data.

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PowerTransformer, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier

# transformations
# A power transformation stabilizes variance and makes skewed data more normally distributed.
# PowerTransformer offers two methods: "yeo-johnson" handles both positive and negative values,
# while "box-cox" only handles positive values. Pick the one that fits your data.
yeo_johnson_transformer = PowerTransformer(method="yeo-johnson")  # replace "yeo-johnson" with the desired transformation method (e.g. "box-cox", etc.)

# Column groups, detected from the training frame. Replace either list with explicit column names
# (e.g. ["col1", "col2"]) to restrict a transformation to specific columns.
numeric_columns = X_train.select_dtypes(include="number").columns.tolist()
# Every non-numeric column is encoded so that only numbers reach the imputer and the model.
categorical_columns = X_train.select_dtypes(exclude="number").columns.tolist()

# Per-column preprocessing. Add more (name, transformer, columns) tuples to apply other
# transformations to other columns; remainder="passthrough" keeps unlisted columns untransformed.
preprocessor = ColumnTransformer(
    transformers=[
        ("power_transformer", PowerTransformer(method="yeo-johnson"), numeric_columns),
        ("encoder", OneHotEncoder(handle_unknown="ignore"), categorical_columns),  # replace OneHotEncoder with the desired encoding method (e.g. OrdinalEncoder, etc.)
    ],
    remainder="passthrough",
    sparse_threshold=0,  # always return a dense array: the imputer that follows cannot take sparse input
)

# Pipelines
# Order matters: encoding comes before imputation so the imputer only ever sees numeric data.
# Missing numeric values are kept as NaN by PowerTransformer and filled by the imputer afterwards.
pipeline_steps = [
    ("column_transformer", preprocessor),
    ("imputer", iterative_imputer),  # replace iterative_imputer with the desired imputation method (e.g. simple_imputer, knn_imputer, etc.)
    ("model", RandomForestClassifier(random_state=0)),  # replace with the desired machine learning model (e.g. LogisticRegression(), etc.)
]

# note on scaling: some models are sensitive to the scale of the features, and scaling can help
# improve their performance and convergence.
# Gradient Descent Models: Neural Networks (MLP) and Logistic Regression converge much faster when features are on a similar scale.
# Regularized Models: If you use Lasso or Ridge Regression, scaling is required because these methods penalize coefficients based on their magnitude.
# Dimensionality Reduction: PCA requires scaling because it seeks to maximize variance; unscaled data will lead PCA to focus only on the features with the largest raw values.

# note on encoding: OneHotEncoder is a common choice for nominal categories (no inherent order).
# It creates one binary column per category, which suits many algorithms, but with many categories
# it produces a high-dimensional feature space that may need dimensionality reduction or
# regularization to prevent overfitting.
# For ordinal categories (natural order) consider OrdinalEncoder, which assigns integers by order.
# Be cautious: algorithms may interpret those integers as a meaningful order even when it is not.

pipeline = Pipeline(steps=pipeline_steps)
pipeline.fit(X_train, y_train)  # fit the pipeline to the training data

# evaluate the pipeline on the test data and keep the results instead of discarding them
baseline_test_score = pipeline.score(X_test, y_test)
baseline_predictions = pipeline.predict(X_test)
print(f"Baseline test score: {baseline_test_score:.3f}")

# Tree ensembles expose feature_importances_ and linear models expose coef_; a given model usually
# has only one of them, so each is read defensively and is None when the model does not provide it.
fitted_model = pipeline.named_steps["model"]
feature_importances = getattr(fitted_model, "feature_importances_", None)
model_coefficients = getattr(fitted_model, "coef_", None)


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Hyperparameter Tuning if we want to
# The grid below tunes n_estimators / max_depth, so the model must accept those parameters.
model = RandomForestClassifier(random_state=42)  # replace with the desired machine learning model (e.g. LogisticRegression(), etc.) and adjust the grid to match

param_grid = {
    # Tune the model (step name is "model"); this entry swaps the estimator into the pipeline
    'model': [model],
    'model__n_estimators': [100, 200],
    'model__max_depth': [None, 10, 20],

    # You can even tune the imputer or transformer!
    'imputer__initial_strategy': ['mean', 'median'],
    'column_transformer__power_transformer__method': ['yeo-johnson'],
}

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,            # 5-fold cross-validation
    scoring='accuracy',
    n_jobs=-1,       # Use all available CPU cores
)
# Tune on the training split only: fitting on the full data would leak the held-out test rows
# into model selection and make the test score below optimistic.
grid_search.fit(X_train, y_train)

print(f"Best parameters: {grid_search.best_params_}")

# Score of the best (refit) pipeline on the untouched test split
test_score = grid_search.score(X_test, y_test)
print(f"Test score: {test_score:.3f}")

# Predict
y_pred = grid_search.predict(X_test)

# Access the model inside the best pipeline to get importances
# (None when the chosen model does not expose feature_importances_)
best_pipeline = grid_search.best_estimator_
importances = getattr(best_pipeline.named_steps["model"], "feature_importances_", None)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Evaluate the fitted pipeline on the held-out test split.
# Both metrics are for classification; replace them with the appropriate metrics for your
# specific problem (e.g. regression metrics, etc.).
y_pred = pipeline.predict(X_test)
for metric in (classification_report, confusion_matrix):
    print(metric(y_test, y_pred))
