<a href="https://www.kaggle.com/code/masoudnaghshbandi/product-failure-t-p-series-aug-2022?scriptVersionId=108070413" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Import libraries

In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import seaborn as sns
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
import imblearn
from imblearn.over_sampling import SMOTE

This data represents the results of a large product testing study. For each product_code you are given a number of product attributes (fixed for the code) as well as a number of measurement values for each individual product, representing various lab testing methods. Each product is used in a simulated real-world environment experiment, where it absorbs a certain amount of fluid (loading) to see whether or not it fails.

My task is to use the data to predict individual product failures of new codes with their individual lab test results.

## load data

In [None]:
# Read the training CSV from the Kaggle input directory and preview its
# first rows as the cell output.
train_csv_path = '../input/tabular/train.csv'
df_train = pd.read_csv(train_csv_path)
df_train.head()

# set id column as index column

In [None]:
# Display a view of the frame keyed by 'id'. set_index returns a new
# DataFrame and the result is not assigned, so df_train itself is unchanged
# and still carries 'id' as a regular column.
df_train.set_index('id')

# Handle missing values


In [None]:
# Names of the numeric columns that are inspected for missing values.
missing_value_columns = [
    'loading',
    'attribute_2',
    'attribute_3',
    'measurement_0',
    'measurement_1',
    'measurement_2',
    'measurement_3',
    'measurement_4',
    'measurement_5',
    'measurement_6',
    'measurement_7',
    'measurement_8',
    'measurement_9',
    'measurement_10',
    'measurement_11',
    'measurement_12',
    'measurement_13',
    'measurement_14',
    'measurement_15',
    'measurement_16',
    'measurement_17',
]

# Select the actual data for those columns. Wrapping the list of names in
# pd.DataFrame(...) would produce a one-column frame of strings, whose null
# count is always 0 regardless of what the training data contains.
df_train_miss = df_train[missing_value_columns]

In [None]:
# Total number of null cells in df_train_miss: nulls are counted per column
# first, then the per-column counts are summed.
df_train_miss.isnull().sum().sum()

# Preprocessing

In [None]:
from sklearn.preprocessing import LabelEncoder


In [None]:
# Columns that get converted to integer codes with LabelEncoder.
cols = [
    "product_code",
    "attribute_0",
    "attribute_1",
]


# Encode categorical column to numeric

In [None]:
# Replace each listed column with integer codes. The encoder is fitted
# separately on every column, so codes are only meaningful within a column.
for column_name in cols:
    df_train[column_name] = LabelEncoder().fit_transform(df_train[column_name])

In [None]:
# Fill every remaining NaN with the median of the column it belongs to.
column_medians = df_train.median()
df_train = df_train.fillna(column_medians)

In [None]:
# Total number of null cells left in df_train (shown as the cell output).
df_train.isnull().sum().sum()

In [None]:
# Preview the first rows of df_train at this point in the notebook.
df_train.head()

# Find correlation

In [None]:
# Pairwise correlations of all df_train columns, drawn as an annotated
# heatmap on a large canvas with the colour scale pinned to [-1, 1].
correlations = df_train.corr()
plt.figure(figsize=(35, 25))
heatmap = sns.heatmap(correlations, annot=True, vmin=-1, vmax=1)

In [None]:
# Display df_train keyed by 'id'. The returned frame is not assigned, so
# df_train keeps 'id' as an ordinary column.
df_train.set_index('id')

In [None]:
# (number of rows, number of columns) of the training frame.
df_train.shape

# Make supervised data format

In [None]:
# Feature frame: the first 14 columns of df_train, selected by position.
# NOTE(review): the following X.set_index('id') call implies 'id' is one of
# these columns, and its result is discarded, so the row identifier is
# carried into the model as a feature - confirm whether that is intended.
X = df_train.iloc[:, :14]


In [None]:
# Display X keyed by 'id'. The result is not assigned, so X is unchanged
# and 'id' stays in X as a regular column.
X.set_index('id')

In [None]:
# Switch from a DataFrame to its underlying array; column labels are dropped.
X=X.values

In [None]:
# Target vector: the 'failure' column of the training frame.
y=df_train['failure']


In [None]:
# Scaler instance used for the feature matrix; it is referenced again when
# the test data is prepared.
scaler=StandardScaler()


In [None]:
# Fit the scaler on the float-cast feature matrix and replace X with the
# scaled values.
# NOTE(review): the scaler is fitted on all rows before the train/test
# split below, so the held-out rows contribute to the scaling statistics.
X = scaler.fit_transform(X.astype(float))


# Train test split

In [None]:
# Hold out 20% of the rows for evaluation; the seed keeps the split fixed
# across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=4,
)


In [None]:
# Bar chart of how many training rows carry each label value.
label_counts = y_train.value_counts()
label_counts.plot(kind='bar')
plt.title('label balance')
plt.ylabel('amount per label')
plt.xlabel('label values')
plt.show()

# Make Balanced

In [None]:
# SMOTE oversampler used to balance the label classes. The seed makes the
# synthetic samples reproducible, matching the fixed random_state=4 used
# for the train/test split.
oversample = SMOTE(random_state=4)


In [None]:
# Split first, then oversample ONLY the training portion.
# Resampling the full data set before splitting lets synthetic points that
# were interpolated from soon-to-be hold-out rows end up in the training
# set (and synthetic rows end up in the hold-out set), which makes the
# evaluation scores look better than they are. The hold-out set below keeps
# the original, untouched class distribution.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=4
)
X_train, y_train = oversample.fit_resample(X_train, y_train)


In [None]:
# Bar chart of the per-label row counts in the current training target.
counts_per_label = y_train.value_counts()
counts_per_label.plot(kind='bar')
plt.title('label balance')
plt.ylabel('amount per label')
plt.xlabel('label values')
plt.show()

In [None]:
#from sklearn.decomposition import PCA

In [None]:
#pca = PCA()

In [None]:
#comp = pca.fit(df_train)

In [None]:
#plt.plot(np.cumsum(comp.explained_variance_ratio_))
#plt.grid()
#plt.xlabel('Number of Principal Components')
#plt.ylabel('Explained Variance')
#sns.despine()

In [None]:
#df_train_transform = pca.transform(df_train)

In [None]:
#model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

In [None]:
#model.fit(X_train, y_train)

# Training model

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest with library-default settings. `rfc` is reassigned to a
# tuned forest further down before anything is fitted.
rfc = RandomForestClassifier()


In [None]:
#from sklearn.model_selection import train_test_split, GridSearchCV


In [None]:
#grid_space={'max_depth':[3,5,10,None],
           #  'n_estimators':[10,100,200],
           #  'max_features':[1,3,5,9],
            # 'min_samples_leaf':[1,2,3],
             # 'min_samples_split':[1,2,3]
        #  }

In [None]:
# Random forest with explicitly chosen hyperparameters: 200 unlimited-depth
# trees, 9 candidate features per split. The seed makes training (bootstrap
# samples and feature sub-sampling) repeatable, in line with the seeded
# train/test split, so reported metrics do not drift between runs.
rfc = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    max_features=9,
    min_samples_leaf=1,
    min_samples_split=2,
    random_state=4,
)

In [None]:
# Train the forest on the training split; the value returned by fit is
# kept as `model` and used for all predictions below.
model=rfc.fit(X_train,y_train)


In [None]:
#model = rfc.fit(X_train,y_train)



In [None]:
#print('Best hyperparameters are: '+str(model_grid.best_params_))
#print('Best score is: '+str(model_grid.best_score_))

In [None]:
# Model predictions for the hold-out features, paired with the true labels.
predicted_y = model.predict(X_test)
expected_y = y_test

In [None]:
from sklearn import metrics

In [None]:
# Per-class precision/recall/F1 summary followed by the confusion matrix
# for the hold-out predictions.
report_text = metrics.classification_report(expected_y, predicted_y)
confusion = metrics.confusion_matrix(expected_y, predicted_y)
print(report_text)
print(confusion)

In [None]:
# Fraction of hold-out rows whose predicted label equals the true label;
# the bare name on the last line shows the value as the cell output.
accuracy = accuracy_score(expected_y, predicted_y)
accuracy

# Test Model output

In [None]:
# Read the test CSV from the Kaggle input directory and preview its first
# rows as the cell output.
test_csv_path = "../input/tabular/test.csv"
df_test = pd.read_csv(test_csv_path)
df_test.head()

In [None]:
# Display df_test keyed by 'id'. The result is not assigned, so df_test
# keeps 'id' as a regular column.
df_test.set_index('id')


In [None]:
# Columns of the test frame that get converted to integer codes.
cols = [
    "product_code",
    "attribute_0",
    "attribute_1",
]

In [None]:
# Number of null cells in each df_test column before any filling.
df_test.isnull().sum()


In [None]:
# Fill NaNs in the test frame with medians computed from the TRAINING
# frame; fillna matches the median values to columns by label.
train_medians = df_train.median()
df_test = df_test.fillna(train_medians)

In [None]:
# Number of null cells in each df_test column after the median fill.
df_test.isnull().sum()


In [None]:
# Replace each listed test column with integer codes.
# NOTE(review): a new encoder is fitted on the test values here, so a code
# is not guaranteed to stand for the same category as in the training frame
# - confirm this is acceptable for the model.
for column_name in cols:
    df_test[column_name] = LabelEncoder().fit_transform(df_test[column_name])

In [None]:
# Preview the test frame after encoding.
df_test.head()

In [None]:
# Keep only the first 14 columns of the test frame (selected by position),
# mirroring the positional feature selection applied to the training frame.
df_test = df_test.iloc[:, :14]


In [None]:
# Preview the test frame after the column selection.
df_test.head()

In [None]:
# Scale the test features with the statistics already learned from the
# training features. Two problems are avoided here:
#   * fit_transform would re-fit the scaler on the test set, so the model
#     would receive features standardised differently from the ones it was
#     trained on;
#   * casting to int would truncate fractional feature values, whereas the
#     training matrix was scaled after a float cast.
# A plain float array is passed because the scaler was fitted on an array
# without column names.
X1 = scaler.transform(df_test.to_numpy(dtype=float))


In [None]:
# Wrap the scaler output in a DataFrame.
X1=pd.DataFrame(X1)

In [None]:
# Preview the scaled test features.
X1.head()

In [None]:
#df_test.head()

In [None]:
# Predicted label for every row of the scaled test features.
y_pred_test = model.predict(X1)

In [None]:
# Store the predictions in a 'failure' column of the test frame.
df_test['failure'] = y_pred_test

In [None]:
# Keep only the identifier and the predicted label for the output file.
df_test_new = df_test[['id', 'failure']]

In [None]:
# Preview the first five rows of the output frame.
df_test_new.head(5)

In [None]:
# Display the output frame keyed by 'id'. The result is not assigned, so
# df_test_new keeps its original index and 'id' remains a regular column.
df_test_new.set_index('id')

In [None]:
# Write the output file with exactly the 'id' and 'failure' columns.
# index=False stops pandas from also writing the frame's positional index
# as an extra unnamed first column.
# to_csv returns None when it is given a path, so `sample_solution` holds
# None; the name is kept only so existing references keep working.
sample_solution = df_test_new.to_csv("sample_solution5.csv", index=False)