## Required Libraries

In [None]:
# Requirements #
# !pip install -r requirements.txt
# or
!pip install numpy pandas matplotlib scikit-learn joblib feature_engine colorama


In [140]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import csv
import math

from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_validate, StratifiedKFold, RepeatedStratifiedKFold

from feature_engine.encoding import WoEEncoder

from colorama import Fore, Back, Style

from joblib import dump
from joblib import load

In [124]:
# Mount Google Drive at /content/drive/ so the dataset and the saved model
# referenced by the cells below can be read. Requires the `google.colab`
# package, i.e. a Colab runtime.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [125]:
# Locate the competition data and load the train / test / sample-submission CSVs.
# For a local run, point myPATH at the extracted competition folder instead, e.g.
# myPATH = "tabular-playground-series-aug-2022"
myPATH = '/content/drive/MyDrive/ML-Final-Project/tabular-playground-series-aug-2022'
train2 = pd.read_csv(f'{myPATH}/train.csv')
test2 = pd.read_csv(f'{myPATH}/test.csv')
submission = pd.read_csv(f'{myPATH}/sample_submission.csv')

# Split the label off the training frame. pop() raises KeyError when the
# column is absent, so a wrong file fails right here instead of leaving
# `target` undefined (or stale from an earlier run) for the cells below.
target = train2.pop('failure')

# Preview copy of the test set indexed by id; test2 itself keeps its 'id' column.
X_test2 = test2.set_index('id')

print(f'test2: {test2.shape}, train2: {train2.shape}')
X_test2.head()

test2: (20775, 25), train2: (26570, 25)


Unnamed: 0_level_0,product_code,loading,attribute_0,attribute_1,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,measurement_3,...,measurement_8,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
26570,F,119.57,material_5,material_6,6,4,6,9,6,19.305,...,18.654,10.802,15.909,18.07,13.772,13.659,16.825,13.742,17.71,634.612
26571,F,113.51,material_5,material_6,6,4,11,8,0,17.883,...,19.368,12.032,13.998,,12.473,17.468,16.708,14.776,14.102,537.037
26572,F,112.16,material_5,material_6,6,4,8,12,4,18.475,...,17.774,11.743,17.046,18.086,10.907,13.363,15.737,17.065,16.021,658.995
26573,F,112.72,material_5,material_6,6,4,8,11,10,16.518,...,18.948,11.79,18.165,16.163,10.933,15.501,15.667,12.62,16.111,594.301
26574,F,208.0,material_5,material_6,6,4,14,16,8,17.808,...,19.141,12.37,14.578,17.849,11.941,16.07,16.183,13.324,17.15,801.044


In [126]:
# Numeric model inputs: the `loading` column plus every `measurement*` column,
# in the order they appear in the test frame.
features = [
    name
    for name in test2.columns
    if name == "loading" or name.startswith("measurement")
]
print(features)

['loading', 'measurement_0', 'measurement_1', 'measurement_2', 'measurement_3', 'measurement_4', 'measurement_5', 'measurement_6', 'measurement_7', 'measurement_8', 'measurement_9', 'measurement_10', 'measurement_11', 'measurement_12', 'measurement_13', 'measurement_14', 'measurement_15', 'measurement_16', 'measurement_17']


## 1. Data Preprocessing for the Test Dataset
We also need to apply `impute_null_values` and `scale_function` to the test dataset.

In [127]:
#@title def impute_null_values(df, features, pcodes): ...
def impute_null_values(df, features, pcodes):
    """Fill missing feature values with a KNN imputer, one product code at a time.

    For every code in `pcodes`, the rows whose `product_code` equals that code
    are selected, and a 15-neighbour `KNNImputer` is fitted on and applied to
    the `features` columns of those rows only. A short report of how many
    nulls were filled is printed for each code.

    Args:
        df: DataFrame with a `product_code` column and every column in `features`.
        features: Column labels to impute.
        pcodes: Iterable of product codes to process.

    Returns:
        A new DataFrame containing the imputed values. `df` itself is left
        untouched, so anything derived from its nulls (e.g. missing-value
        indicator columns) stays reproducible when a cell is re-run.
    """
    # Work on a copy: writing the imputed values back into the caller's frame
    # would erase the information about which cells were originally missing.
    imputed_df = df.copy()

    for pcode in pcodes:
        # Rows belonging to the current product code
        mask = imputed_df['product_code'] == pcode
        feature_subset = imputed_df.loc[mask, features]

        # Null count before imputation, used only for the report below
        null_before = feature_subset.isnull().sum().sum()

        # A fresh imputer per group; other candidates tried: n_neighbors=3, 5
        imputed = KNNImputer(n_neighbors=15).fit_transform(feature_subset)

        # Null count after imputation and the number of values filled in
        null_after = pd.isnull(imputed).sum().sum()
        null_imputed = null_before - null_after

        print(f"Imputing Product Code {pcode}:.... \n -> {null_imputed} null values imputed")

        # Write the imputed block back into the copy
        imputed_df.loc[mask, features] = imputed

    return imputed_df

In [128]:
#@title def scale_function(X, cols): ...
def scale_function(X, cols):
    """Return a copy of `X` in which the columns `cols` are standardised.

    A new `StandardScaler` is fitted on `X[cols]` each time the function is
    called. The columns not listed in `cols` come first in the result,
    followed by the scaled columns in the order given by `cols`; the row
    index of `X` is preserved.
    """
    # Alternative scaler that was tried: MinMaxScaler()
    standardized = pd.DataFrame(
        StandardScaler().fit_transform(X[cols]),
        index=X.index,
        columns=cols,
    )
    untouched = X.drop(columns=cols)
    combined = pd.concat([untouched, standardized], axis=1)

    # Scaling must never add or drop rows
    assert len(X) == len(combined)

    return combined

In [129]:
# Build the test feature frame from the raw test data without modifying test2,
# so this cell gives the same result every time it is run. The missing-value
# indicators have to be taken before imputation fills the nulls in; deriving
# them from a frame that was already imputed would make them all zero.
X_test2 = test2.assign(
    m3_missing=test2['measurement_3'].isnull().astype(np.int8),
    m5_missing=test2['measurement_5'].isnull().astype(np.int8),
)

X_test2 = impute_null_values(X_test2, features, X_test2['product_code'].unique())

Imputing Product Code F:.... 
 -> 4073 null values imputed
Imputing Product Code G:.... 
 -> 3924 null values imputed
Imputing Product Code H:.... 
 -> 3883 null values imputed
Imputing Product Code I:.... 
 -> 3829 null values imputed


In [130]:
# The training frame is needed to fit the encoder used on the test data.
# It is built from the raw training data without modifying train2, so the
# missing-value indicators are always taken from the un-imputed values and
# the cell gives the same result every time it is run.
X_train2 = train2.assign(
    m3_missing=train2['measurement_3'].isnull().astype(np.int8),
    m5_missing=train2['measurement_5'].isnull().astype(np.int8),
)
X_train2 = impute_null_values(X_train2, features, X_train2['product_code'].unique())

Imputing Product Code A:.... 
 -> 3849 null values imputed
Imputing Product Code B:.... 
 -> 3975 null values imputed
Imputing Product Code C:.... 
 -> 4344 null values imputed
Imputing Product Code D:.... 
 -> 3973 null values imputed
Imputing Product Code E:.... 
 -> 4132 null values imputed


In [131]:
# Encode the attribute_0 column with feature_engine's WoEEncoder
# (weight-of-evidence encoding). The encoder is fitted on the training frame
# together with `target` and is then applied to the test frame.
woe_encoder = WoEEncoder(variables=['attribute_0'])
woe_encoder.fit(X_train2, target)

# The training frame is not transformed in this notebook; call kept for reference:
# X_train2 = woe_encoder.transform(X_train2)
# NOTE(review): X_test2 is replaced by its encoded version, so running this cell
# twice in a row presumably feeds already-encoded values to the encoder -- confirm,
# and re-run the imputation cell first when in doubt.
X_test2 = woe_encoder.transform(X_test2)

In [133]:
# Engineered features added to the test frame:
#   measurement(3*5) - product of measurement_3 and measurement_5
#   missing(3*5)     - product of the two missing-value indicators
#   area             - product of attribute_2 and attribute_3
X_test2['measurement(3*5)'] = X_test2['measurement_3'].mul(X_test2['measurement_5'])
X_test2['missing(3*5)'] = X_test2['m5_missing'].mul(X_test2['m3_missing'])
X_test2['area'] = X_test2['attribute_2'].mul(X_test2['attribute_3'])

# Preview the rows in which both measurement_3 and measurement_5 were missing
preview_columns = [
    'area', 'm3_missing', 'm5_missing', 'missing(3*5)',
    'measurement_3', 'measurement_5', 'measurement(3*5)',
]
X_test2[preview_columns].query("m3_missing != 0 and m5_missing != 0").head()

Unnamed: 0,area,m3_missing,m5_missing,missing(3*5),measurement_3,measurement_5,measurement(3*5)
29,24,1,1,1,17.6042,17.152,301.947238
1931,24,1,1,1,17.936333,17.179333,308.134249
2556,24,1,1,1,18.1384,17.224267,312.420639
3036,24,1,1,1,17.800867,17.098133,304.361592
5917,63,1,1,1,17.651933,17.196733,303.55559


In [134]:
# Columns passed to the scaler and to the model, in this order.
optimized_features = [
    # original and encoded columns
    'loading', 'attribute_0',
    # attribute_2 * attribute_3
    'area',
    # raw measurements
    'measurement_17', 'measurement_0', 'measurement_1', 'measurement_2',
    # missing-value indicators
    'm3_missing', 'm5_missing',
    # interaction terms
    'measurement(3*5)', 'missing(3*5)',
]

In [138]:
# Standardise the model input columns of the test frame.
# NOTE(review): scale_function fits its scaler on the frame it receives, i.e. on
# the test data here -- confirm this matches how the saved model was trained.
X_test2_scaled = scale_function(X=X_test2, cols=optimized_features)

## 2. Load the Pretrained Model

In [136]:
# Load the fitted classifier from Drive and score the scaled test features.
# No estimator is constructed here: the fitted one is read from disk.
# Hyper-parameters presumably used for the saved model (verify against the
# training notebook):
#   LogisticRegression(penalty='l1', C=0.01, solver='liblinear', random_state=1)
# Alternative that was tried:
#   LogisticRegression(max_iter=200, C=0.0001, penalty='l2', solver='newton-cg')
modelPATH = '/content/drive/MyDrive/ML-Final-Project'

# joblib.load unpickles the file, which can execute arbitrary code -- only
# load model files from a trusted source.
predmodel2 = load(f'{modelPATH}/my_best_model3.joblib')

# Column 1 of predict_proba is used as the failure probability
# (assumes the positive class is the second entry of predmodel2.classes_ -- TODO confirm).
y_pred2 = predmodel2.predict_proba(X_test2_scaled[optimized_features])[:,1]

## 3. Writing to Submission.csv

In [137]:
# Index the test frame by id. Doing this only while 'id' is still a column
# keeps the cell re-runnable: on a second run the frame is already indexed
# by id and is left as it is.
if 'id' in X_test2.columns:
    X_test2 = X_test2.set_index('id')
elif X_test2.index.name != 'id':
    raise KeyError("X_test2 has neither an 'id' column nor an 'id' index")

# One row per test id with the predicted failure probability
submission = pd.DataFrame({'id': X_test2.index, 'failure': y_pred2})
submission.to_csv('109550200_v2.csv', index=False)
submission.head()

Unnamed: 0,id,failure
0,26570,0.198423
1,26571,0.180607
2,26572,0.190013
3,26573,0.191483
4,26574,0.332543
