In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.feature_selection import VarianceThreshold
from sklearn.utils import resample
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [None]:
# Mount Google Drive at /content/drive (Colab-only API). The later cells read
# the data, scaler and model files from paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the CSV to run predictions on from the mounted Drive folder
# (the file name suggests it is the test set without the class column).
# Earlier local-path variant, kept for reference: pd.read_csv(r'./data/train.csv')
data = pd.read_csv('/content/drive/MyDrive/IronWeek7_TechChallenge/data/test_no_class.csv')

# Show the frame's dimensions followed by its first rows.
display(data.shape, data.head())

(53, 19)

Unnamed: 0,AGE,SEX,STEROID,ANTIVIRALS,FATIGUE,MALAISE,ANOREXIA,LIVER BIG,LIVER FIRM,SPLEEN PALPABLE,SPIDERS,ASCITES,VARICES,BILIRUBIN,ALK PHOSPHATE,SGOT,ALBUMIN,PROTIME,HISTOLOGY
0,39,Male,2,2,1,2,2,2,1,2,2,2,2,0.7,?,48,4.4,?,1
1,41,Feamle,2,1,1,1,1,2,2,2,2,2,2,0.7,81,53,5.0,74,1
2,28,maled,1,2,1,1,1,2,1,2,2,2,2,1.6,44,123,4.0,46,1
3,36,maled,1,2,1,1,1,2,1,2,2,2,2,1.0,?,45,4.0,57,1
4,32,M,2,2,2,2,2,2,2,2,2,2,2,0.7,102,64,4.0,90,1


In [None]:
# Clean the data

# Work on a copy so the frame loaded above stays untouched.
df = data.copy()

# Collapse the free-text SEX spellings to two codes: any label containing the
# letter 'f' (case-insensitive) becomes 'F', every other label becomes 'M'.
df['SEX'] = df['SEX'].map(
    lambda label: 'F' if label.lower().find('f') != -1 else 'M'
)

# Bare expression in the middle of a cell: it is evaluated but not displayed.
df['SEX'].value_counts()

# Replace value 'c' in column 'col' with the median of column 'col'.
def replace_by_median(data, col, c):
    """Replace every occurrence of ``c`` in ``data[col]`` with the column median.

    The column is converted to a numeric dtype and written back to ``data``
    in place; nothing is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to modify in place.
    col : str
        Name of the column to clean.
    c : object
        Placeholder value (e.g. ``'?'``) marking the cells to impute.

    Raises
    ------
    ValueError
        Propagated from ``pd.to_numeric`` when a non-placeholder cell cannot
        be parsed as a number.

    Notes
    -----
    The median is computed from the non-placeholder values only. Placeholders
    are blanked out as NaN rather than 0, so genuine zeros in the column are
    neither overwritten nor allowed to bias the median. Cells that were
    already missing are left missing. If every cell is a placeholder the
    median is NaN and the column ends up all-NaN.
    """
    is_placeholder = data[col] == c
    # Go through object dtype so the conversion behaves the same for object
    # and string-typed columns, then parse with the placeholders set to NaN.
    numeric = pd.to_numeric(data[col].astype(object).mask(is_placeholder))
    # Assign the column back instead of mutating a selection with
    # inplace=True, which is unreliable under pandas Copy-on-Write.
    data[col] = numeric.mask(is_placeholder, numeric.median())

def clean_categorical_columns(data):
    """Run ``replace_by_median`` over the values of every categorical column.

    Columns whose dtype is not ``pandas.CategoricalDtype`` are left untouched,
    so a frame without categorical columns is returned unchanged.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to clean; modified in place by ``replace_by_median``.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object that was passed in.
    """
    for col in data.columns:
        # isinstance check instead of pd.api.types.is_categorical_dtype, which
        # is deprecated in recent pandas releases.
        if isinstance(data[col].dtype, pd.CategoricalDtype):
            # Snapshot the distinct values before the column gets rewritten.
            unique_values = data[col].unique()
            for value in unique_values:
                if pd.notna(value):
                    # The original called `replace_by_mean`, which is not
                    # defined; `replace_by_median` is the helper that exists.
                    # NOTE(review): this imputes *every* non-null category
                    # value with the median — confirm that is the intent.
                    replace_by_median(data, col, value)
    return data

def replace_question_marks_with_median(data):
    """Impute ``'?'`` placeholders with the column median.

    For every text-typed column that has at least one cell equal to ``'?'``
    (ignoring surrounding whitespace), the markers are turned into missing
    values, the column is converted to a numeric dtype, and all missing
    values in it are filled with the median of the remaining numbers.
    Columns without such a marker are left untouched, including free-text
    columns that merely contain a question mark inside a longer string.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to clean; affected columns are replaced in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object that was passed in.

    Raises
    ------
    ValueError
        Propagated from ``pd.to_numeric`` when a column holding a ``'?'``
        marker also holds other text that cannot be parsed as a number.
    """
    for col in data.columns:
        column = data[col]
        # Only text-like columns can hold the marker; accept both object
        # dtype and pandas' dedicated string dtypes.
        is_text = (pd.api.types.is_object_dtype(column)
                   or pd.api.types.is_string_dtype(column))
        if not is_text:
            continue
        # Match whole cells so detection and replacement agree; the original
        # detected by substring but replaced only exact '?' cells.
        is_marker = column.map(
            lambda cell: isinstance(cell, str) and cell.strip() == '?'
        ).astype(bool)
        if not is_marker.any():
            continue
        # Markers become NaN, then the column is parsed as numbers.
        numeric = pd.to_numeric(column.astype(object).mask(is_marker))
        # Assign back rather than fillna(inplace=True) on a selection, which
        # is unreliable under pandas Copy-on-Write.
        data[col] = numeric.fillna(numeric.median())
    return data

In [None]:
# Apply the two cleaning helpers in turn. Each returns the frame it was given,
# and the first rows plus the SEX counts are displayed after each step so the
# effect of that step can be inspected.
df=clean_categorical_columns(df)
display(df.head(),df['SEX'].value_counts())
df=replace_question_marks_with_median(df)
display(df.head(),df['SEX'].value_counts())

Unnamed: 0,AGE,SEX,STEROID,ANTIVIRALS,FATIGUE,MALAISE,ANOREXIA,LIVER BIG,LIVER FIRM,SPLEEN PALPABLE,SPIDERS,ASCITES,VARICES,BILIRUBIN,ALK PHOSPHATE,SGOT,ALBUMIN,PROTIME,HISTOLOGY
0,39,M,2,2,1,2,2,2,1,2,2,2,2,0.7,?,48,4.4,?,1
1,41,F,2,1,1,1,1,2,2,2,2,2,2,0.7,81,53,5.0,74,1
2,28,M,1,2,1,1,1,2,1,2,2,2,2,1.6,44,123,4.0,46,1
3,36,M,1,2,1,1,1,2,1,2,2,2,2,1.0,?,45,4.0,57,1
4,32,M,2,2,2,2,2,2,2,2,2,2,2,0.7,102,64,4.0,90,1


M    47
F     6
Name: SEX, dtype: int64

Unnamed: 0,AGE,SEX,STEROID,ANTIVIRALS,FATIGUE,MALAISE,ANOREXIA,LIVER BIG,LIVER FIRM,SPLEEN PALPABLE,SPIDERS,ASCITES,VARICES,BILIRUBIN,ALK PHOSPHATE,SGOT,ALBUMIN,PROTIME,HISTOLOGY
0,39,M,2,2,1.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,0.7,85.0,48.0,4.4,60.0,1
1,41,F,2,1,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,0.7,81.0,53.0,5.0,74.0,1
2,28,M,1,2,1.0,1.0,1.0,2.0,1.0,2.0,2.0,2.0,2.0,1.6,44.0,123.0,4.0,46.0,1
3,36,M,1,2,1.0,1.0,1.0,2.0,1.0,2.0,2.0,2.0,2.0,1.0,85.0,45.0,4.0,57.0,1
4,32,M,2,2,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.7,102.0,64.0,4.0,90.0,1


M    47
F     6
Name: SEX, dtype: int64

# **Encode and scale the data**

In [None]:
# One-hot encode the remaining non-numeric columns; drop_first=True drops the
# first level of each encoded column.
# NOTE(review): the dummy columns are derived from this file alone — confirm
# they match the columns the scaler and models were fitted on.
df = pd.get_dummies(df, drop_first=True)

In [None]:
# Load the pickled scaler object from Drive (presumably a fitted
# StandardScaler, going by the file name — confirm).
# pickle.load can execute arbitrary code, so only load files you trust.
with open('/content/drive/MyDrive/IronWeek7_TechChallenge/scaler/standard_scaler.pickle', 'rb') as file:
    scaler = pickle.load(file)

# Transform the encoded frame and keep the original column names.
# NOTE(review): `data_scaled` is not used by the prediction cells in this
# notebook, which all call .predict(df) — confirm which input is intended.
data_scaled = pd.DataFrame(scaler.transform(df), columns=df.columns)

# **Load models and do predictions**

RandomForest

In [None]:
# Random forest: load the pickled model object from Drive.
# pickle.load can execute arbitrary code, so only load files you trust.
with open('/content/drive/MyDrive/IronWeek7_TechChallenge/model/random_forest.pickle', 'rb') as file:
    randForest = pickle.load(file)

In [None]:
# Predict with the loaded random-forest model.
# NOTE(review): the input is `df` (encoded but unscaled); the `data_scaled`
# frame built earlier is not used — confirm which one the model was trained on.
y_RF_pred  = randForest.predict(df)

In [None]:
# Store to file: write the random-forest predictions as a single 'Class'
# column; the default integer index becomes the unnamed first CSV column.
pd.Series(y_RF_pred, name='Class').to_frame().to_csv('group_4_rf.csv')

KNN

In [27]:
# KNN: load the pickled model object from Drive. Despite the `_pred` suffix,
# `knn_pred` holds the model itself; the next cell calls .predict on it.
# pickle.load can execute arbitrary code, so only load files you trust.
with open('/content/drive/MyDrive/IronWeek7_TechChallenge/model/knn.pickle', 'rb') as file:
     knn_pred = pickle.load(file)

In [28]:
# Predict with the loaded KNN model.
# NOTE(review): the input is `df` (encoded but unscaled); the `data_scaled`
# frame built earlier is not used — confirm which one the model was trained on.
y_knn_pred  = knn_pred.predict(df)

In [33]:
# Store to file: write the KNN predictions as a single 'Class' column; the
# default integer index becomes the unnamed first CSV column.
pd.Series(y_knn_pred, name='Class').to_frame().to_csv('group_4_knn.csv')

Decision Tree

In [30]:
# Decision tree: load the pickled model object from Drive. Despite the `_pred`
# suffix, `dtree_pred` holds the model itself; the next cell calls .predict on it.
# pickle.load can execute arbitrary code, so only load files you trust.
with open('/content/drive/MyDrive/IronWeek7_TechChallenge/model/dtree.pickle', 'rb') as file:
     dtree_pred = pickle.load(file)

In [31]:
# Predict with the loaded decision-tree model.
# NOTE(review): the input is `df` (encoded but unscaled); the `data_scaled`
# frame built earlier is not used — confirm which one the model was trained on.
y_dtree_pred  = dtree_pred.predict(df)

In [34]:
# Store to file: write the decision-tree predictions as a single 'Class'
# column; the default integer index becomes the unnamed first CSV column.
pd.Series(y_dtree_pred, name='Class').to_frame().to_csv('group_4_dtree.csv')