In [10]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

In [11]:
train_df = pd.read_csv("../dataset/train.csv")
train_labels = train_df.pop('failure')

In [12]:
train_df.drop(["id"], axis=1, inplace=True)

In [13]:
categorical_cols = train_df.select_dtypes(["bool_", "object_"]).columns

numeric_cols = train_df.select_dtypes(exclude=["bool_", "object_"]).columns

In [14]:
encoder = OrdinalEncoder()
train_df[categorical_cols] = encoder.fit_transform(train_df[categorical_cols])

In [15]:
train_df.isna().sum().sum()

20273

In [16]:
iterative_imputer = IterativeImputer()
train_df[numeric_cols] = pd.DataFrame(iterative_imputer.fit_transform(train_df[numeric_cols]), columns= numeric_cols)

In [17]:
categorical_imputer = SimpleImputer(strategy= "most_frequent")
train_df[categorical_cols] = pd.DataFrame(categorical_imputer.fit_transform(train_df[categorical_cols]), columns= categorical_cols)

In [18]:
train_df.isna().sum().sum()

0

In [19]:
scaler = StandardScaler()
new_col_names = [col + "_scaled" for col in numeric_cols]

train_df[new_col_names] = scaler.fit_transform(train_df[numeric_cols]) 

In [20]:
mi_scores = mutual_info_classif(train_df, train_labels)
mi_scores = pd.Series(mi_scores, name="MI Scores", index=train_df.columns)
mi_scores = mi_scores.sort_values(ascending=False)
mi_scores

attribute_0              0.008353
loading                  0.005198
measurement_12_scaled    0.005077
measurement_1            0.005058
attribute_3              0.004955
measurement_2            0.004865
loading_scaled           0.004764
measurement_5_scaled     0.004138
measurement_7_scaled     0.003035
measurement_12           0.002992
measurement_3            0.002543
measurement_14_scaled    0.001300
measurement_0            0.001281
measurement_7            0.001167
measurement_2_scaled     0.000839
measurement_15_scaled    0.000801
measurement_5            0.000671
attribute_2              0.000509
product_code             0.000430
measurement_15           0.000376
measurement_11_scaled    0.000157
measurement_6_scaled     0.000068
attribute_2_scaled       0.000056
measurement_3_scaled     0.000036
measurement_10_scaled    0.000000
measurement_13_scaled    0.000000
measurement_9_scaled     0.000000
measurement_8_scaled     0.000000
measurement_16_scaled    0.000000
measurement_4_

In [21]:
X_train, X_valid, y_train, y_valid = train_test_split(train_df, train_labels, train_size = 0.8)

In [22]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators = 1000)
rf.fit(train_df, train_labels)

print(accuracy_score(y_valid, rf.predict(X_valid)))

KeyboardInterrupt: 

In [None]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(max_iter=3000, random_state=42)
lr.fit(train_df, train_labels)

print(accuracy_score(y_valid, lr.predict(X_valid)))

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

gbc = GradientBoostingRegressor(n_estimators=200)
gbc.fit(train_df, train_labels)

print(accuracy_score(y_valid, gbc.predict(X_valid)))

In [None]:
from xgboost import XGBRegressor

class CustomXGBRegressor(XGBRegressor):
    
    def __init__(self, **params):
        
        super().__init__(**params)
        self.eval_set = params['eval_set']
    
    def fit(self, X, y):
        super().fit(X, y, eval_set=self.eval_set, verbose=100) 

In [None]:
xgb = CustomXGBRegressor(n_estimators=1000, learning_rate=0.01, eval_set=[(X_valid, y_valid)], early_stopping_rounds=40)
xgb.fit(train_df, train_labels)

print(accuracy_score(y_valid, xgb.predict(X_valid)))

In [None]:
from sklearn.ensemble import VotingClassifier

model = VotingClassifier(
    estimators = [
        ('xgb', xgb),
        ('rf', rf),
        ('lr', lr),
        ('gbc', gbc)
    ],
    voting = 'soft'
)

model.fit(X_train, y_train)

print(accuracy_score(y_valid, model.predict(X_valid)))

In [None]:
test_df = pd.read_csv("../dataset/test.csv")

In [None]:
test_df.drop(["id"], axis=1, inplace=True)

In [None]:
test_df[categorical_cols] = encoder.transform(test_df[categorical_cols])

In [None]:
test_df[numeric_cols] = pd.DataFrame(iterative_imputer.transform(test_df[numeric_cols]), columns= numeric_cols)
test_df[categorical_cols] = pd.DataFrame(categorical_imputer.transform(test_df[categorical_cols]), columns= categorical_cols)

In [None]:
test_df[new_col_names] = scaler.transform(test_df[numeric_cols]) 

In [None]:
preds = [x == 1 for x in model.predict_proba(test_df[train_df.columns])]

In [None]:
submission_df = pd.DataFrame({
    "id" : test_df["PassengerId"],
    "failure" : preds
})

In [None]:
submission_df.to_csv("submission.csv", index=False)