In [1]:
pip install mlflow

Collecting mlflow
  Downloading mlflow-2.22.0-py3-none-any.whl.metadata (30 kB)
Collecting mlflow-skinny==2.22.0 (from mlflow)
  Downloading mlflow_skinny-2.22.0-py3-none-any.whl.metadata (31 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.15.2-py3-none-any.whl.metadata (7.3 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.22.0->mlflow)
  Downloading databricks_sdk-0.50.0-py3-none-any.whl.metadata (38 kB)
Collecting fastapi<1 (from mlflow-skinny==2.22.0->mlflow)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn<1 (from mlflow-skinny==2.22.0->mlflow)
  Downloading uvicorn-0.34.2-py3-none-any.whl.metadata (6.5 k

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from datetime import datetime
import tensorflow as tf
import mlflow
import mlflow.sklearn

In [4]:
mlflow.start_run(run_name="kickstarter_feature_engineering_fixed")

<ActiveRun: >

In [5]:
from google.colab import files
uploaded = files.upload()

Saving Kickstarter.xlsx to Kickstarter.xlsx


In [6]:
df = pd.read_excel("Kickstarter.xlsx")

In [7]:
df = df[df['state'].isin(['successful', 'failed'])]
df['target'] = (df['state'] == 'successful').astype(int)
df = df.drop(columns=["id", "name"])

## DATA LEAKAGE PREVENTION

In [8]:
# Remove features that would cause leakage (happen after campaign ends)
leakage_features = [
    'pledged', 'backers_count', 'usd_pledged',  # Direct leakage - only known after campaign
    'state_changed_at', 'state_changed_at_weekday', 'state_changed_at_month',
    'state_changed_at_day', 'state_changed_at_yr', 'state_changed_at_hr',
    'spotlight',  # Projects get spotlighted after success
]

# Add any other leakage features
if 'staff_pick.1' in df.columns:  # Duplicate of staff_pick
    leakage_features.append('staff_pick.1')

# Filter out features that don't exist in dataset
features_to_remove = [feat for feat in leakage_features if feat in df.columns]
mlflow.log_param("removed_leakage_features", features_to_remove)

# Remove leakage features
df = df.drop(columns=features_to_remove, errors='ignore')

## MISSING VALUES ANALYSIS

In [9]:
# Check for missing values
missing_vals = df.isnull().sum()
missing_cols = missing_vals[missing_vals > 0]
print("Columns with missing values:")
print(missing_cols)

# Calculate missing percentage
missing_percent = (df.isnull().sum() / len(df)) * 100
print("\nMissing percentage by column:")
print(missing_percent[missing_percent > 0])

Columns with missing values:
main_category    278
dtype: int64

Missing percentage by column:
main_category    1.922146
dtype: float64


## MISSING VALUES IMPUTATION

In [10]:
# Identify columns by data type
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = df.select_dtypes(include=['object', 'bool']).columns.tolist()
datetime_cols = df.select_dtypes(include=['datetime64']).columns.tolist()

# Only use numeric columns that have missing values for KNN imputation
numeric_missing = [col for col in numeric_cols if df[col].isnull().sum() > 0]

if numeric_missing:
    # Select subset of columns for imputation
    imputer_cols = numeric_missing + ['goal']  # Include goal as a predictor
    imputer_data = df[imputer_cols].copy()

    # Standardize before KNN imputation
    scaler = StandardScaler()
    imputer_data_scaled = scaler.fit_transform(imputer_data.fillna(imputer_data.median()))

    # Perform KNN imputation
    knn_imputer = KNNImputer(n_neighbors=5)
    imputed_values = knn_imputer.fit_transform(imputer_data_scaled)

    # Transform back to original scale
    imputed_data = scaler.inverse_transform(imputed_values)

    # Replace values in original dataframe
    for i, col in enumerate(imputer_cols):
        df[col] = imputed_data[:, i]

    print(f"Imputed {len(numeric_missing)} numeric columns with KNN")
else:
    print("No missing values in numeric columns.")

# For categorical columns: mode imputation
categorical_missing = [col for col in categorical_cols if df[col].isnull().sum() > 0]
for col in categorical_missing:
    df[col] = df[col].fillna(df[col].mode().iloc[0])

# Log imputation info
mlflow.log_param("missing_numeric_columns", len(numeric_missing))
mlflow.log_param("missing_categorical_columns", len(categorical_missing))

No missing values in numeric columns.


1

## TEMPORAL FEATURE ENGINEERING

In [11]:
# Parse datetime columns
date_columns = ['deadline', 'created_at', 'launched_at']
for col in date_columns:
    if col in df.columns:
        df[f'{col}_datetime'] = pd.to_datetime(df[col])

# Calculate campaign duration (days)
df['campaign_duration'] = (df['deadline_datetime'] - df['launched_at_datetime']).dt.total_seconds() / (60*60*24)
print(f"Campaign duration statistics:\n{df['campaign_duration'].describe()}")

# Calculate preparation time (days from creation to launch)
df['preparation_time'] = (df['launched_at_datetime'] - df['created_at_datetime']).dt.total_seconds() / (60*60*24)
print(f"Preparation time statistics:\n{df['preparation_time'].describe()}")

# Extract day of week features
for col in ['deadline', 'launched_at', 'created_at']:
    df[f'{col}_is_weekend'] = df[f'{col}_datetime'].dt.dayofweek >= 5

# Create month, day, and hour cyclic features
for col in ['deadline', 'launched_at', 'created_at']:
    # Month as cyclic feature (sin and cos)
    df[f'{col}_month_sin'] = np.sin(2 * np.pi * df[f'{col}_datetime'].dt.month / 12)
    df[f'{col}_month_cos'] = np.cos(2 * np.pi * df[f'{col}_datetime'].dt.month / 12)

    # Day of month as cyclic feature
    day_max = 31
    df[f'{col}_day_sin'] = np.sin(2 * np.pi * df[f'{col}_datetime'].dt.day / day_max)
    df[f'{col}_day_cos'] = np.cos(2 * np.pi * df[f'{col}_datetime'].dt.day / day_max)

    # Hour as cyclic feature
    df[f'{col}_hour_sin'] = np.sin(2 * np.pi * df[f'{col}_datetime'].dt.hour / 24)
    df[f'{col}_hour_cos'] = np.cos(2 * np.pi * df[f'{col}_datetime'].dt.hour / 24)

# Create feature for projects created and launched on the same day
df['same_day_launch'] = (df['created_at_datetime'].dt.date == df['launched_at_datetime'].dt.date).astype(int)

# Feature for ideal campaign duration (based on stats that 30-40 days is optimal)
df['ideal_duration'] = ((df['campaign_duration'] >= 30) & (df['campaign_duration'] <= 40)).astype(int)

Campaign duration statistics:
count    14463.000000
mean        33.866455
std         12.977008
min          1.000000
25%         29.958333
50%         30.000000
75%         37.515706
max        120.589687
Name: campaign_duration, dtype: float64
Preparation time statistics:
count    14463.000000
mean        65.812119
std        205.073987
min          0.007060
25%          4.011030
50%         13.997766
75%         44.714010
max       3899.575625
Name: preparation_time, dtype: float64


## GOAL-RELATED FEATURE ENGINEERING

In [12]:
# Goal per day ratio
df['goal_per_day'] = df['goal'] / df['campaign_duration']
print(f"Goal per day statistics:\n{df['goal_per_day'].describe()}")

# Goal bins (logarithmic scale for better distribution)
df['goal_log'] = np.log1p(df['goal'])  # log(1+x) to handle zeros

# Goal percentiles
goal_percentiles = [0, 10, 25, 50, 75, 90, 95, 99, 100]
goal_bins = np.percentile(df['goal'], goal_percentiles)
df['goal_percentile_bin'] = pd.cut(df['goal'], bins=goal_bins, labels=range(len(goal_percentiles)-1))

# Create goal to static_usd_rate ratio
if 'static_usd_rate' in df.columns:
    df['goal_usd_adjusted'] = df['goal'] * df['static_usd_rate']

Goal per day statistics:
count    1.446300e+04
mean     1.827807e+03
std      3.916470e+04
min      1.666667e-02
25%      5.000000e+01
50%      1.666667e+02
75%      5.000000e+02
max      3.125000e+06
Name: goal_per_day, dtype: float64


## CATEGORY AND GEOGRAPHIC FEATURE ENGINEERING

In [13]:
# Calculate success rates by category
if 'main_category' in df.columns:
    # We need to use cross-validation principles here to avoid data leakage
    # For demonstration, we'll use a simple train-test split approach for target encoding
    np.random.seed(42)
    train_mask = np.random.rand(len(df)) < 0.8

    # Calculate success rates from the training portion only
    category_success = df[train_mask].groupby('main_category')['target'].mean().to_dict()

    # Apply to all data (in real pipeline, would apply different techniques to train/test)
    df['category_success_rate'] = df['main_category'].map(category_success)

    # Fill any missing values with the global mean
    global_success_rate = df[train_mask]['target'].mean()
    df['category_success_rate'] = df['category_success_rate'].fillna(global_success_rate)

# Sub-category success rates with same approach
if 'category' in df.columns:
    subcategory_success = df[train_mask].groupby('category')['target'].mean().to_dict()
    df['subcategory_success_rate'] = df['category'].map(subcategory_success)
    df['subcategory_success_rate'] = df['subcategory_success_rate'].fillna(global_success_rate)

# Country success rates with same approach
if 'country' in df.columns:
    country_success = df[train_mask].groupby('country')['target'].mean().to_dict()
    df['country_success_rate'] = df['country'].map(country_success)
    df['country_success_rate'] = df['country_success_rate'].fillna(global_success_rate)

    # Country project counts (as a proxy for platform maturity in region)
    country_counts = df.groupby('country').size().to_dict()
    df['country_project_count'] = df['country'].map(country_counts)
    df['country_project_count_log'] = np.log1p(df['country_project_count'])

## TEXT-BASED FEATURE ENGINEERING

In [14]:
# Handle text length features if available
if 'name_len' in df.columns and 'blurb_len' in df.columns:
    df['name_blurb_ratio'] = df['name_len'] / df['blurb_len'].replace(0, 1)  # Avoid division by zero

if 'name_len_clean' in df.columns and 'name_len' in df.columns:
    df['name_efficiency'] = df['name_len_clean'] / df['name_len'].replace(0, 1)

if 'blurb_len_clean' in df.columns and 'blurb_len' in df.columns:
    df['blurb_efficiency'] = df['blurb_len_clean'] / df['blurb_len'].replace(0, 1)

## ADVANCED ENCODING

In [15]:
# Identify categorical features for encoding - EXCLUDING 'state' (target variable)
categorical_features = []
for col in df.columns:
    # Skip the target variable and any other excluded columns
    if col == 'state' or col == 'target':
        continue
    if df[col].dtype == 'object' or (col.endswith('_weekday') and df[col].nunique() < 10):
        categorical_features.append(col)

print(f"Identified {len(categorical_features)} categorical features for encoding")

# Currency is considered duplicate with country - drop if present
if 'currency' in df.columns:
    df = df.drop(columns=['currency'])
    print("Dropped 'currency' as it's duplicate with country")
    # Remove from categorical features if it was added
    if 'currency' in categorical_features:
        categorical_features.remove('currency')

# Identify low cardinality features (fewer than 10 unique values)
low_cardinality = []
for col in categorical_features:
    if col in df.columns and df[col].nunique() < 10:
        low_cardinality.append(col)

print(f"Low cardinality features for one-hot encoding: {low_cardinality}")

# Apply one-hot encoding
for col in low_cardinality:
    one_hot = pd.get_dummies(df[col], prefix=col, drop_first=True)
    df = pd.concat([df, one_hot], axis=1)

# Identify high cardinality features
high_cardinality = []
for col in categorical_features:
    if col in df.columns and df[col].nunique() >= 10:
        high_cardinality.append(col)

print(f"High cardinality features for target encoding: {high_cardinality}")

# Apply proper target encoding (avoiding leakage by using cross-validation)
for col in high_cardinality:
    # Calculate encoded value using only training data
    global_mean = df[train_mask]['target'].mean()
    category_means = df[train_mask].groupby(col)['target'].mean().to_dict()

    # Calculate counts by category (from all data, as this isn't leaking target info)
    category_counts = df[col].value_counts().to_dict()

    # Apply smoothed target encoding
    smoothing = 20  # Smoothing factor
    df[f'{col}_encoded'] = df[col].map(lambda x:
        (category_counts.get(x, 0) * category_means.get(x, global_mean) +
         smoothing * global_mean) / (category_counts.get(x, 0) + smoothing)
    )

# Boolean features conversion
bool_columns = df.select_dtypes(include=['bool']).columns
for col in bool_columns:
    df[col] = df[col].astype(int)

# Label encoding for remaining categorical features
label_encoder = LabelEncoder()
for col in categorical_features:
    if col in df.columns:
        df[f'{col}_label'] = label_encoder.fit_transform(df[col])

Identified 7 categorical features for encoding
Dropped 'currency' as it's duplicate with country
Low cardinality features for one-hot encoding: ['deadline_weekday', 'created_at_weekday', 'launched_at_weekday']
High cardinality features for target encoding: ['country', 'category', 'main_category']


## ENTITY EMBEDDING

In [16]:
# Identify high-cardinality categorical features for entity embedding
categorical_cols_with_label = [col for col in df.columns if col.endswith('_label')]

# Only apply embeddings to features with sufficient cardinality (to avoid overfitting)
significant_cat_cols = []

for col in categorical_cols_with_label:
    feature_name = col.replace('_label', '')
    unique_values = df[col].nunique()

    # Skip if cardinality is too low
    if unique_values < 10:
        print(f"Skipping embedding for {feature_name} - cardinality too low ({unique_values})")
        continue

    significant_cat_cols.append(col)

    # Calculate embedding dimension with rule of thumb
    embedding_dim = min(50, int(unique_values**0.5))  # Square root rule

    # Create embedding model
    embedding_model = tf.keras.Sequential([
        tf.keras.layers.InputLayer(input_shape=(1,)),
        tf.keras.layers.Embedding(input_dim=unique_values+1, output_dim=embedding_dim),
        tf.keras.layers.Flatten()
    ])

    # Generate embeddings
    embeddings = embedding_model.predict(df[col].values.reshape(-1, 1))

    # Add embeddings as new features
    for i in range(embedding_dim):
        df[f'{feature_name}_emb_{i}'] = embeddings[:, i]

    print(f"Added {embedding_dim} embedding dimensions for {feature_name}")

mlflow.log_param("entity_embeddings_created", len(significant_cat_cols))

[1m  1/452[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m48s[0m 108ms/step



[1m452/452[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step  
Added 10 embedding dimensions for country
[1m 37/452[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m0s[0m 1ms/step  



[1m452/452[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
Added 12 embedding dimensions for category
Skipping embedding for deadline_weekday - cardinality too low (7)
Skipping embedding for created_at_weekday - cardinality too low (7)
Skipping embedding for launched_at_weekday - cardinality too low (7)
[1m  1/452[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m28s[0m 63ms/step



[1m452/452[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
Added 3 embedding dimensions for main_category


3

## DIMENSIONALITY REDUCTION

In [17]:
# Only use numeric features for PCA - but exclude the target
numeric_features = df.select_dtypes(include=['int64', 'float64']).columns
numeric_features = [col for col in numeric_features if col != 'target']

if len(numeric_features) > 20:  # Only apply PCA if we have many features
    # Standardize data
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(df[numeric_features])

    # Apply PCA
    pca = PCA(n_components=0.95)  # Retain 95% of variance
    pca_result = pca.fit_transform(scaled_data)

    print(f"Original dimensions: {scaled_data.shape[1]}")
    print(f"Reduced dimensions: {pca_result.shape[1]}")
    print(f"Explained variance ratio: {pca.explained_variance_ratio_}")

    # Add PCA components to dataframe
    for i in range(pca_result.shape[1]):
        df[f'pca_component_{i}'] = pca_result[:, i]

    print(f"Added {pca_result.shape[1]} PCA components to the dataset")
    mlflow.log_param("pca_components_added", pca_result.shape[1])
else:
    print("Fewer than 20 numeric features - skipping PCA.")
    mlflow.log_param("pca_components_added", 0)

Original dimensions: 85
Reduced dimensions: 52
Explained variance ratio: [0.05323518 0.04656428 0.03819956 0.03763303 0.03490149 0.03418295
 0.03329889 0.03248909 0.03186021 0.02914676 0.02645752 0.02574018
 0.02483863 0.02399164 0.02313667 0.02299213 0.02164685 0.02078109
 0.02002065 0.01857607 0.01793081 0.01760079 0.01631287 0.01589931
 0.01504599 0.01463837 0.01410845 0.01384979 0.01356584 0.01305262
 0.01225956 0.01193863 0.01170366 0.01146821 0.01124142 0.01104853
 0.01068101 0.01036077 0.01021516 0.01003984 0.00974218 0.00936698
 0.00925394 0.00882453 0.00863269 0.00831987 0.00794619 0.00670628
 0.00575707 0.00543554 0.00497688 0.00463461]
Added 52 PCA components to the dataset


  df[f'pca_component_{i}'] = pca_result[:, i]
  df[f'pca_component_{i}'] = pca_result[:, i]
  df[f'pca_component_{i}'] = pca_result[:, i]
  df[f'pca_component_{i}'] = pca_result[:, i]
  df[f'pca_component_{i}'] = pca_result[:, i]
  df[f'pca_component_{i}'] = pca_result[:, i]
  df[f'pca_component_{i}'] = pca_result[:, i]
  df[f'pca_component_{i}'] = pca_result[:, i]
  df[f'pca_component_{i}'] = pca_result[:, i]
  df[f'pca_component_{i}'] = pca_result[:, i]
  df[f'pca_component_{i}'] = pca_result[:, i]
  df[f'pca_component_{i}'] = pca_result[:, i]
  df[f'pca_component_{i}'] = pca_result[:, i]
  df[f'pca_component_{i}'] = pca_result[:, i]
  df[f'pca_component_{i}'] = pca_result[:, i]


## FINAL CLEANUP

In [18]:
# Keep original categorical columns for reference, but create a version without them
# for models that can't handle them directly
df_encoded = df.copy()

# Remove original categorical columns in the encoded version
categorical_cols = df_encoded.select_dtypes(include=['object']).columns.tolist()
df_encoded = df_encoded.drop(columns=categorical_cols, errors='ignore')

# Convert datetime columns to numeric (days since a reference date)
reference_date = pd.to_datetime('2010-01-01')
for col in df_encoded.select_dtypes(include=['datetime64']).columns:
    df_encoded[f'{col}_days'] = (df_encoded[col] - reference_date).dt.total_seconds() / (60*60*24)
    df_encoded = df_encoded.drop(columns=[col])

# Final check for missing values
missing_final = df_encoded.isnull().sum()
if missing_final.sum() > 0:
    print("\nRemaining missing values:")
    print(missing_final[missing_final > 0])

    # Fill missing values in numeric columns with median
    numeric_columns = df_encoded.select_dtypes(include=['int64', 'float64']).columns
    if len(numeric_columns) > 0:
        df_encoded[numeric_columns] = df_encoded[numeric_columns].fillna(df_encoded[numeric_columns].median())

    # Verify all missing values are filled
    remaining_missing = df_encoded.isnull().sum().sum()
    print(f"\nRemaining missing values after filling: {remaining_missing}")
else:
    print("\nNo missing values in the final dataset.")


Remaining missing values:
goal_percentile_bin    26
dtype: int64

Remaining missing values after filling: 26


## FEATURE CORRELATION ANALYSIS

In [19]:
# Calculate correlations with target (useful for feature selection)
numeric_df = df_encoded.select_dtypes(include=['int64', 'float64'])
if 'target' in numeric_df.columns:
    correlations = numeric_df.corr()['target'].sort_values(ascending=False)
    print("\nTop 10 positive correlations with target:")
    print(correlations.head(11))  # +1 because target will be first
    print("\nTop 10 negative correlations with target:")
    print(correlations.tail(10))

    # Save correlation info for reporting
    top_corr_positive = correlations.head(11).to_dict()
    top_corr_negative = correlations.tail(10).to_dict()
    mlflow.log_param("top_positive_correlations", str(top_corr_positive))
    mlflow.log_param("top_negative_correlations", str(top_corr_negative))


Top 10 positive correlations with target:
target                       1.000000
subcategory_success_rate     0.592756
category_encoded             0.588517
pca_component_1              0.483093
category_success_rate        0.304858
main_category_encoded        0.304543
staff_pick                   0.281067
pca_component_2              0.230038
launched_at_yr               0.165043
launched_at_days             0.160799
launched_at_datetime_days    0.160799
Name: target, dtype: float64

Top 10 negative correlations with target:
pca_component_30        -0.090204
pca_component_4         -0.118518
pca_component_5         -0.124667
launched_at_hour_cos    -0.126140
pca_component_48        -0.135821
category_label          -0.136741
same_day_launch         -0.144357
campaign_duration       -0.230113
goal_log                -0.271310
disable_communication         NaN
Name: target, dtype: float64


## SAVE PROCESSED DATA

In [20]:
df_encoded=df_encoded.dropna()
df.to_csv("kickstarter_processed_with_categorical.csv", index=False)
df_encoded.to_csv("kickstarter_final_processed.csv", index=False)

# Log dataset information
mlflow.log_param("n_rows", df_encoded.shape[0])
mlflow.log_param("n_cols", df_encoded.shape[1])
mlflow.log_param("target_balance", df_encoded['target'].mean())

# Log feature engineering metadata
feature_metadata = {
    "feature_count": df_encoded.shape[1],
    "numeric_features": len(df_encoded.select_dtypes(include=['int64', 'float64']).columns),
    "categorical_orig_features": len(categorical_features),
    "embedding_features": len([col for col in df_encoded.columns if '_emb_' in col]),
    "pca_features": len([col for col in df_encoded.columns if 'pca_component_' in col])
}

# Log metrics
for key, value in feature_metadata.items():
    mlflow.log_metric(key, value)

# Log artifacts
mlflow.log_artifact("kickstarter_final_processed.csv")
mlflow.log_artifact("kickstarter_processed_with_categorical.csv")

# End MLflow run
mlflow.end_run()

In [21]:
from google.colab import files
files.download('kickstarter_final_processed.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>