In [5]:
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV
import scipy.stats as sp
from xgboost.sklearn import XGBClassifier
import ml_utils as mt
import numpy as np

In [6]:
from sklearn.tree import DecisionTreeClassifier

In [7]:
cd=pd.read_csv("cosmicclassifierTraining.csv")
cd=cd[~cd['Prediction'].isnull()].reset_index(drop=True)
cd

Unnamed: 0,Atmospheric Density,Surface Temperature,Gravity,Water Content,Mineral Abundance,Orbital Period,Proximity to Star,Magnetic Field Strength,Radiation Levels,Atmospheric Composition Index,Prediction
0,0.472806,,-0.313872,-2.089299,-0.152201,-0.885649,0.900105,,Category_6,0.692907,5.0
1,4.180154,-1.157515,2.430956,-1.595850,-3.188678,-0.609434,-0.199828,Category_9,Category_9,,0.0
2,-0.129008,1.621592,-0.785741,2.081196,-1.413796,-0.095152,-3.502577,,Category_8,-0.677182,4.0
3,-3.122000,-2.299818,1.072092,0.353524,-0.192529,2.917067,-1.972329,,Category_11,0.109429,1.0
4,-1.459426,2.890268,0.148757,-0.804439,0.494875,0.044910,-0.438796,Category_6,Category_10,0.407941,9.0
...,...,...,...,...,...,...,...,...,...,...,...
56956,-0.316003,-1.160519,0.544548,-1.407123,1.427861,0.849849,-1.932329,Category_8,Category_5,1.333760,8.0
56957,0.789506,-2.645345,-0.375569,-2.579966,0.783195,0.671547,-2.041189,Category_14,Category_4,0.170505,3.0
56958,-0.662563,0.642230,-1.175106,-2.783240,-0.902704,-1.694373,-1.824274,Category_8,Category_8,1.010311,5.0
56959,0.475118,-0.021458,2.086274,1.444825,-1.986595,-2.113147,-0.348915,Category_11,Category_9,-0.665345,6.0


In [8]:
orig_df = cd.copy()

In [9]:
orig_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56961 entries, 0 to 56960
Data columns (total 11 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Atmospheric Density            54119 non-null  float64
 1   Surface Temperature            54065 non-null  float64
 2   Gravity                        54135 non-null  float64
 3   Water Content                  54025 non-null  float64
 4   Mineral Abundance              54177 non-null  float64
 5   Orbital Period                 54106 non-null  float64
 6   Proximity to Star              54175 non-null  float64
 7   Magnetic Field Strength        54020 non-null  object 
 8   Radiation Levels               54102 non-null  object 
 9   Atmospheric Composition Index  54166 non-null  float64
 10  Prediction                     56961 non-null  float64
dtypes: float64(9), object(2)
memory usage: 4.8+ MB


In [10]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
import pickle
import os

def impute_cat_feature_with_dtree(df, target_col, exclude_col, model=None, train_mode=True):
    """
    Impute categorical features using a decision tree classifier.
    If train_mode=True, it trains a new model and returns it.
    If train_mode=False, it uses the provided model for prediction.
    """
    df = df.copy()
    
    temp_cols = [col for col in df.columns if col not in [target_col, exclude_col, 'Prediction']]
    
    # Fill NAs in features with median to prepare for model training/prediction
    for col in temp_cols:
        df[col] = df[col].fillna(df[col].median())
    
    if train_mode:
        # Training mode - create and train a new model
        train_data = df[df[target_col].notna()]
        predict_data = df[df[target_col].isna()]
        X_train = train_data[temp_cols]
        y_train = train_data[target_col]
        X_pred = predict_data[temp_cols]
        
        model = DecisionTreeClassifier(max_depth=None, random_state=42)
        model.fit(X_train, y_train)
        
        df_result = df.copy()
        if len(X_pred) > 0:  # Only predict if there are missing values
            df_result.loc[df[target_col].isna(), target_col] = model.predict(X_pred)
        
        return df_result[[target_col]], model, temp_cols
    else:
        # Prediction mode - use provided model
        predict_data = df[df[target_col].isna()]
        X_pred = predict_data[temp_cols] if len(predict_data) > 0 else None
        
        df_result = df.copy()
        if X_pred is not None and len(X_pred) > 0:
            df_result.loc[df[target_col].isna(), target_col] = model.predict(X_pred)
        
        return df_result[[target_col]]

def compute_group_medians(df, group_cols, target_cols):
    """Compute and return median values for each group and column combination"""
    group_medians = {}
    overall_medians = {}
    
    for col in target_cols:
        overall_medians[col] = df[col].median()
        
        # Group medians as a dictionary for easier lookup
        grouped = df.groupby(group_cols)[col].median().to_dict()
        group_medians[col] = grouped
    
    return group_medians, overall_medians

def train_imputation_model(df):
    """
    Train imputation models and save everything needed for future imputation
    Returns a dictionary with all necessary models and values
    """
    # Save original dataframe
    orig_df = df.copy()
    cd = df.copy()  # Working copy
    
    # Step 1: Decision tree imputation for categorical features
    mag_imputed, mag_model, mag_cols = impute_cat_feature_with_dtree(
        cd, 'Magnetic Field Strength', 'Radiation Levels', train_mode=True
    )
    
    rad_imputed, rad_model, rad_cols = impute_cat_feature_with_dtree(
        cd, 'Radiation Levels', 'Magnetic Field Strength', train_mode=True
    )
    
    columns_to_restore = [col for col in cd.columns if col not in ['Magnetic Field Strength', 'Radiation Levels', 'Prediction']]
    df_final = pd.concat([orig_df[columns_to_restore], mag_imputed, rad_imputed, orig_df['Prediction']], axis=1)
    
    # Step 2: Grouped median imputation for rest of the features
    numeric_cols = [col for col in df_final.columns if col not in ['Magnetic Field Strength', 'Radiation Levels', 'Prediction']]
    
    # Calculate group medians
    group_medians, overall_medians = compute_group_medians(
        df_final, 
        ['Magnetic Field Strength', 'Radiation Levels'], 
        numeric_cols
    )
    
    # Apply the imputation to ensure the training dataset is fully imputed
    for col in numeric_cols:
        # Create a mask for records with missing values
        mask = df_final[col].isna()
        
        # For each row with a missing value, look up the appropriate group median
        for idx in df_final[mask].index:
            mag = df_final.loc[idx, 'Magnetic Field Strength']
            rad = df_final.loc[idx, 'Radiation Levels']
            
            # Try to get group median, fall back to overall median if not found
            try:
                group_key = (mag, rad)
                group_median = group_medians[col].get(group_key)
                if pd.isna(group_median):
                    df_final.loc[idx, col] = overall_medians[col]
                else:
                    df_final.loc[idx, col] = group_median
            except:
                df_final.loc[idx, col] = overall_medians[col]
    
    # Create imputation model dictionary with everything needed for future imputation
    imputation_model = {
        'mag_model': mag_model,
        'rad_model': rad_model,
        'mag_cols': mag_cols,
        'rad_cols': rad_cols,
        'group_medians': group_medians,
        'overall_medians': overall_medians,
        'feature_columns': numeric_cols
    }
    
    return imputation_model, df_final

def save_imputation_model(imputation_model, filepath="imputation_model.pkl"):
    """Save the imputation model to a file"""
    with open(filepath, 'wb') as f:
        pickle.dump(imputation_model, f)
    print(f"Imputation model saved to {filepath}")

def load_imputation_model(filepath="imputation_model.pkl"):
    """Load the imputation model from a file"""
    with open(filepath, 'rb') as f:
        imputation_model = pickle.load(f)
    return imputation_model

def impute_new_data(new_data, imputation_model=None, model_path=None):
    """
    Impute missing values in new data using saved imputation model
    Either provide the imputation_model directly or specify the model_path to load from
    """
    if imputation_model is None and model_path is not None:
        imputation_model = load_imputation_model(model_path)
    elif imputation_model is None:
        raise ValueError("Either imputation_model or model_path must be provided")
    
    df = new_data.copy()
    
    # Step 1: Impute categorical features using saved decision tree models
    mag_imputed = impute_cat_feature_with_dtree(
        df, 
        'Magnetic Field Strength', 
        'Radiation Levels', 
        model=imputation_model['mag_model'],
        train_mode=False
    )
    
    rad_imputed = impute_cat_feature_with_dtree(
        df, 
        'Radiation Levels', 
        'Magnetic Field Strength', 
        model=imputation_model['rad_model'],
        train_mode=False
    )
    
    # Create dataframe with imputed categorical features
    columns_to_restore = [col for col in df.columns if col not in ['Magnetic Field Strength', 'Radiation Levels']]
    df_final = pd.concat([df[columns_to_restore], mag_imputed, rad_imputed], axis=1)
    
    # Step 2: Apply grouped median imputation for numeric features
    for col in imputation_model['feature_columns']:
        if col in df_final.columns:
            # Create a mask for records with missing values
            mask = df_final[col].isna()
            
            # For each row with a missing value, look up the appropriate group median
            for idx in df_final[mask].index:
                mag = df_final.loc[idx, 'Magnetic Field Strength']
                rad = df_final.loc[idx, 'Radiation Levels']
                
                # Try to get group median, fall back to overall median if not found
                try:
                    group_key = (mag, rad)
                    group_median = imputation_model['group_medians'][col].get(group_key)
                    if pd.isna(group_median):
                        df_final.loc[idx, col] = imputation_model['overall_medians'][col]
                    else:
                        df_final.loc[idx, col] = group_median
                except:
                    df_final.loc[idx, col] = imputation_model['overall_medians'][col]
    
    return df_final


In [11]:
imputation_model, df_final = train_imputation_model(cd)

In [12]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56961 entries, 0 to 56960
Data columns (total 11 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Atmospheric Density            56961 non-null  float64
 1   Surface Temperature            56961 non-null  float64
 2   Gravity                        56961 non-null  float64
 3   Water Content                  56961 non-null  float64
 4   Mineral Abundance              56961 non-null  float64
 5   Orbital Period                 56961 non-null  float64
 6   Proximity to Star              56961 non-null  float64
 7   Atmospheric Composition Index  56961 non-null  float64
 8   Magnetic Field Strength        56961 non-null  object 
 9   Radiation Levels               56961 non-null  object 
 10  Prediction                     56961 non-null  float64
dtypes: float64(9), object(2)
memory usage: 4.8+ MB


In [13]:
df_final['Magnetic Field Strength'].value_counts()

Magnetic Field Strength
Category_9     8967
Category_8     8949
Category_10    7862
Category_7     6142
Category_11    6136
Category_12    4514
Category_6     3484
Category_13    3161
Category_14    2154
Category_5     2014
Category_15    1264
Category_4      899
Category_16     711
Category_3      326
Category_17     225
Category_18      63
Category_2       63
Category_19      16
Category_1       10
Category_20       1
Name: count, dtype: int64

In [14]:
def extract_numeric_from_category(col):
    return col.astype(str).str.extract(r'(\d+)', expand=False).astype(float)

df_final['Magnetic Field Strength'] = extract_numeric_from_category(df_final['Magnetic Field Strength'])
df_final['Radiation Levels'] = extract_numeric_from_category(df_final['Radiation Levels'])

X = df_final.drop(columns=['Prediction'])
y = df_final['Prediction']

In [15]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56961 entries, 0 to 56960
Data columns (total 10 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Atmospheric Density            56961 non-null  float64
 1   Surface Temperature            56961 non-null  float64
 2   Gravity                        56961 non-null  float64
 3   Water Content                  56961 non-null  float64
 4   Mineral Abundance              56961 non-null  float64
 5   Orbital Period                 56961 non-null  float64
 6   Proximity to Star              56961 non-null  float64
 7   Atmospheric Composition Index  56961 non-null  float64
 8   Magnetic Field Strength        56961 non-null  float64
 9   Radiation Levels               56961 non-null  float64
dtypes: float64(10)
memory usage: 4.3 MB


In [16]:
y.dtype

dtype('float64')

In [17]:
save_imputation_model(imputation_model, 'imputation_model.pkl')

Imputation model saved to imputation_model.pkl


In [19]:
from catboost import CatBoostClassifier
from sklearn.model_selection import RandomizedSearchCV
import scipy.stats as sp

param_dist = {
    'learning_rate': sp.uniform(0.01, 0.5),       
    'max_depth': sp.randint(3, 10),               
    'n_estimators': sp.randint(100, 500),         
    'subsample': sp.uniform(0.6, 0.4),            
    'colsample_bylevel': sp.uniform(0.6, 0.4),    
    'bootstrap_type': ['Bernoulli']               # Ensure compatible bootstrap type
}

# Initialize CatBoost classifier
cat_model = CatBoostClassifier(verbose=0, random_state=42)

# Set up RandomizedSearchCV
grid_search = RandomizedSearchCV(
    estimator=cat_model,
    param_distributions=param_dist,
    n_iter=50,
    scoring='accuracy',
    cv=5,
    verbose=2,
    n_jobs=-1,
    random_state=42
)

X = df_final.drop(columns=['Prediction'])
y = df_final['Prediction']


In [20]:
grid_search.fit(X, y)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV] END bootstrap_type=Bernoulli, colsample_bylevel=0.749816047538945, learning_rate=0.4853571532049581, max_depth=5, n_estimators=171, subsample=0.8394633936788146; total time=  55.4s
[CV] END bootstrap_type=Bernoulli, colsample_bylevel=0.6571467271687763, learning_rate=0.33544423647442645, max_depth=7, n_estimators=357, subsample=0.8887995089067299; total time= 3.1min
[CV] END bootstrap_type=Bernoulli, colsample_bylevel=0.9140703845572055, learning_rate=0.10983689107917986, max_depth=9, n_estimators=343, subsample=0.836965827544817; total time= 5.6min




[CV] END bootstrap_type=Bernoulli, colsample_bylevel=0.6624074561769746, learning_rate=0.08799726016810132, max_depth=5, n_estimators=187, subsample=0.7334834444556088; total time= 1.0min
[CV] END bootstrap_type=Bernoulli, colsample_bylevel=0.9754210836063001, learning_rate=0.010389382920507164, max_depth=6, n_estimators=376, subsample=0.8469926038510867; total time= 3.1min
[CV] END bootstrap_type=Bernoulli, colsample_bylevel=0.9140703845572055, learning_rate=0.10983689107917986, max_depth=9, n_estimators=343, subsample=0.836965827544817; total time= 5.7min
[CV] END bootstrap_type=Bernoulli, colsample_bylevel=0.749816047538945, learning_rate=0.4853571532049581, max_depth=5, n_estimators=171, subsample=0.8394633936788146; total time= 1.0min
[CV] END bootstrap_type=Bernoulli, colsample_bylevel=0.9754210836063001, learning_rate=0.010389382920507164, max_depth=6, n_estimators=376, subsample=0.8469926038510867; total time= 3.2min
[CV] END bootstrap_type=Bernoulli, colsample_bylevel=0.914070

[CV] END bootstrap_type=Bernoulli, colsample_bylevel=0.7257423924305306, learning_rate=0.2642853455823514, max_depth=8, n_estimators=330, subsample=0.7641531692142519; total time= 3.7min
[CV] END bootstrap_type=Bernoulli, colsample_bylevel=0.9521871356061031, learning_rate=0.32217702406689663, max_depth=8, n_estimators=233, subsample=0.782613828193164; total time= 3.0min
[CV] END bootstrap_type=Bernoulli, colsample_bylevel=0.6488351818802693, learning_rate=0.18814891903848746, max_depth=8, n_estimators=324, subsample=0.7088528997538541; total time= 3.5min
[CV] END bootstrap_type=Bernoulli, colsample_bylevel=0.8590760482165449, learning_rate=0.01026018849765791, max_depth=7, n_estimators=332, subsample=0.7219125032632117; total time= 2.9min
[CV] END bootstrap_type=Bernoulli, colsample_bylevel=0.8232408008069365, learning_rate=0.21191808552902042, max_depth=8, n_estimators=151, subsample=0.6987504251354405; total time= 1.6min
[CV] END bootstrap_type=Bernoulli, colsample_bylevel=0.7067124

In [21]:
mt.report(grid_search.cv_results_,5)

Model with rank: 1
Mean validation score: 0.891663 (std: 0.004205)
Parameters: {'bootstrap_type': 'Bernoulli', 'colsample_bylevel': 0.6873761748867334, 'learning_rate': 0.21825497393518312, 'max_depth': 8, 'n_estimators': 430, 'subsample': 0.7297380084021096}

Model with rank: 2
Mean validation score: 0.890890 (std: 0.004969)
Parameters: {'bootstrap_type': 'Bernoulli', 'colsample_bylevel': 0.9933692563579372, 'learning_rate': 0.20941222122227654, 'max_depth': 9, 'n_estimators': 339, 'subsample': 0.9193380499938204}

Model with rank: 3
Mean validation score: 0.890802 (std: 0.003554)
Parameters: {'bootstrap_type': 'Bernoulli', 'colsample_bylevel': 0.9140703845572055, 'learning_rate': 0.10983689107917986, 'max_depth': 9, 'n_estimators': 343, 'subsample': 0.836965827544817}

Model with rank: 4
Mean validation score: 0.890697 (std: 0.003740)
Parameters: {'bootstrap_type': 'Bernoulli', 'colsample_bylevel': 0.7478617824245618, 'learning_rate': 0.13107996913871295, 'max_depth': 8, 'n_estimator

In [29]:
### Now predicting 

In [27]:
test=pd.read_csv("cosmictest.csv")
test

Unnamed: 0,Atmospheric Density,Surface Temperature,Gravity,Water Content,Mineral Abundance,Orbital Period,Proximity to Star,Magnetic Field Strength,Radiation Levels,Atmospheric Composition Index
0,-1.303074,2.932540,-0.168043,-1.220249,-1.942662,0.950473,1.540782,Category_13,Category_6,-0.729809
1,0.081124,-3.747390,-0.598897,1.563487,-0.199953,-1.844926,1.095349,Category_11,Category_10,-0.341850
2,-3.709952,-1.689588,0.259499,1.020046,-2.094565,0.344641,-0.094227,Category_9,Category_9,-0.532658
3,4.838804,0.442909,0.995312,-0.385272,-0.818065,-1.903998,2.727794,Category_10,Category_9,-0.205558
4,0.742217,2.802281,-1.144759,2.475840,-0.585959,0.880694,-1.942863,Category_11,Category_6,0.763087
...,...,...,...,...,...,...,...,...,...,...
9995,1.185221,-2.146064,-0.145847,1.056399,3.201846,3.729891,3.100994,Category_11,Category_5,-0.147794
9996,2.257602,-2.714023,-3.215728,-0.753216,0.460961,-0.793091,2.672126,Category_16,Category_8,-0.157080
9997,-1.059609,2.340662,1.392567,-1.627227,2.634176,0.638663,-0.165413,Category_6,Category_9,-3.148313
9998,-1.231524,-0.425901,2.520917,-0.993503,-0.577048,-1.083454,0.636375,Category_11,Category_11,-0.844913


In [34]:
imputation_model, test_final = train_imputation_model(test)

test_final['Magnetic Field Strength'] = extract_numeric_from_category(df_final['Magnetic Field Strength'])
test_final['Radiation Levels'] = extract_numeric_from_category(df_final['Radiation Levels'])


In [25]:
def extract_numeric_from_category(col):
    return col.astype(str).str.extract(r'(\d+)', expand=False).astype(float)

test_final['Magnetic Field Strength'] = extract_numeric_from_category(df_final['Magnetic Field Strength'])
test_final['Radiation Levels'] = extract_numeric_from_category(df_final['Radiation Levels'])

X = test_final.drop(columns=['Prediction'])
y = test_final['Prediction']


In [35]:
X

Unnamed: 0,Atmospheric Density,Surface Temperature,Gravity,Water Content,Mineral Abundance,Orbital Period,Proximity to Star,Atmospheric Composition Index,Magnetic Field Strength,Radiation Levels
0,0.472806,-0.598250,-0.313872,-2.089299,-0.152201,-0.885649,0.900105,0.692907,14.0,6.0
1,4.180154,-1.157515,2.430956,-1.595850,-3.188678,-0.609434,-0.199828,-0.008297,9.0,9.0
2,-0.129008,1.621592,-0.785741,2.081196,-1.413796,-0.095152,-3.502577,-0.677182,9.0,8.0
3,-3.122000,-2.299818,1.072092,0.353524,-0.192529,2.917067,-1.972329,0.109429,10.0,11.0
4,-1.459426,2.890268,0.148757,-0.804439,0.494875,0.044910,-0.438796,0.407941,6.0,10.0
...,...,...,...,...,...,...,...,...,...,...
56956,-0.316003,-1.160519,0.544548,-1.407123,1.427861,0.849849,-1.932329,1.333760,8.0,5.0
56957,0.789506,-2.645345,-0.375569,-2.579966,0.783195,0.671547,-2.041189,0.170505,14.0,4.0
56958,-0.662563,0.642230,-1.175106,-2.783240,-0.902704,-1.694373,-1.824274,1.010311,8.0,8.0
56959,0.475118,-0.021458,2.086274,1.444825,-1.986595,-2.113147,-0.348915,-0.665345,11.0,9.0


In [38]:
pred = grid_search.predict(X)
pred

array([[5.],
       [0.],
       [4.],
       ...,
       [5.],
       [6.],
       [8.]])

In [41]:
df = pd.DataFrame(pred)
df.to_csv('predictions.csv', index=False)
