## Em Colbert



## Imports

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import sklearn
from numba import jit
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold, cross_val_predict
from sklearn.preprocessing import StandardScaler

In [3]:
# Raw competition files. These upper-case frames are never modified; the
# lower-case names below are working copies derived from them.
TRAIN = pd.read_csv("./playground-series-s3e26/train.csv")
TEST = pd.read_csv("./playground-series-s3e26/test.csv")
SAMPLE_SUB = pd.read_csv("./playground-series-s3e26/sample_submission.csv")

train = TRAIN.copy()
test = TEST.copy()
submission = SAMPLE_SUB.copy()

# Set the identifier and target columns aside before removing them from the
# feature frames.
train_id = train["id"]
train_status = train["Status"]
test_id = test["id"]

train = train.drop(columns=["id", "Status"])
test = test.drop(columns=["id"])

test.head()

Unnamed: 0,N_Days,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
0,3839,D-penicillamine,19724,F,N,Y,N,N,1.2,546.0,3.37,65.0,1636.0,151.9,90.0,430.0,10.6,2.0
1,2468,D-penicillamine,14975,F,N,N,N,N,1.1,660.0,4.22,94.0,1257.0,151.9,155.0,227.0,10.0,2.0
2,51,Placebo,13149,F,N,Y,N,Y,2.0,151.0,2.96,46.0,961.0,69.75,101.0,213.0,13.0,4.0
3,2330,D-penicillamine,20510,F,N,N,N,N,0.6,293.0,3.85,40.0,554.0,125.55,56.0,270.0,10.6,2.0
4,1615,D-penicillamine,21904,F,N,Y,N,N,1.4,277.0,2.97,121.0,1110.0,125.0,126.0,221.0,9.8,1.0


In [6]:
pd.options.display.float_format = '{:,.2f}'.format


def summary(df):
    """Print the shape of ``df`` and return a one-row-per-column overview.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to profile.

    Returns
    -------
    pandas.DataFrame
        Indexed by the column names of ``df`` with the columns
        ``data type``, ``#missing``, ``%missing``, ``#unique``, ``min``,
        ``max``, ``average``, ``standard_deviation``, ``first value``,
        ``second value`` and ``third value``.

    Notes
    -----
    * The numeric statistics are NaN for columns ``describe`` does not
      compute them for (e.g. object columns), including the case where the
      frame has no numeric column at all.
    * The sample values are taken by position, so they are the first three
      rows regardless of the index labels; if the frame has fewer than three
      rows the missing samples are NaN.
    """
    print(f'data shape: {df.shape}')

    summ = df.dtypes.to_frame('data type')

    missing = df.isnull().sum()
    summ['#missing'] = missing.values
    summ['%missing'] = missing.values / len(df) * 100
    summ['#unique'] = df.nunique().values

    # describe() omits min/max/mean/std entirely when no column supports
    # them; reindex guarantees the four statistics exist (as NaN if absent).
    desc = df.describe(include='all').transpose()
    stats = desc.reindex(columns=['min', 'max', 'mean', 'std'])
    summ['min'] = stats['min'].values
    summ['max'] = stats['max'].values
    summ['average'] = stats['mean'].values
    summ['standard_deviation'] = stats['std'].values

    # Positional access: label-based .loc[0] breaks on any non-default index.
    for position, label in enumerate(('first value', 'second value', 'third value')):
        if position < len(df):
            summ[label] = df.iloc[position].values
        else:
            summ[label] = float('nan')

    return summ

# Profile the test features (the id column was dropped above).
summary(test)

data shape: (5271, 18)


Unnamed: 0,data type,#missing,%missing,#unique,min,max,average,standard_deviation,first value,second value,third value
N_Days,int64,0,0.0,409,41.0,4795.0,2038.7,1086.84,3839,2468,51
Drug,object,0,0.0,2,,,,,D-penicillamine,D-penicillamine,Placebo
Age,int64,0,0.0,363,9598.0,28650.0,18497.76,3583.9,19724,14975,13149
Sex,object,0,0.0,2,,,,,F,F,F
Ascites,object,0,0.0,2,,,,,N,N,N
Hepatomegaly,object,0,0.0,2,,,,,Y,N,Y
Spiders,object,0,0.0,2,,,,,N,N,N
Edema,object,0,0.0,3,,,,,N,N,Y
Bilirubin,float64,0,0.0,108,0.3,28.0,2.6,3.85,1.20,1.10,2.00
Cholesterol,float64,0,0.0,222,120.0,1775.0,352.49,200.44,546.00,660.00,151.00


Observations

- Every category of each object (categorical) column appears in both train and test

- The submission needs a predicted probability for each status class (C, CL, D)

## Data Cleaning 1

In [18]:
# One-hot encode the categorical columns of the training features.
train_d = pd.get_dummies(train, dtype=int)

# Encoding the two splits independently can produce different column sets or
# orders if a category is absent from one of them. Force the test frame onto
# the training layout: dummies missing from test become all-zero columns and
# dummies seen only in test are dropped.
test_d = pd.get_dummies(test, dtype=int).reindex(columns=train_d.columns, fill_value=0)

# Fit the scaler on train only and reuse its statistics for test.
ss = StandardScaler()
train_dn = pd.DataFrame(ss.fit_transform(train_d), columns=train_d.columns, index=train_d.index)
test_dn = pd.DataFrame(ss.transform(test_d), columns=test_d.columns, index=test_d.index)

test_dn.head()

Unnamed: 0,N_Days,Age,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,...,Sex_M,Ascites_N,Ascites_Y,Hepatomegaly_N,Hepatomegaly_Y,Spiders_N,Spiders_Y,Edema_N,Edema_S,Edema_Y
0,1.65,0.37,-0.37,1.0,-0.52,-0.25,-0.09,0.76,-0.48,1.88,...,-0.28,0.22,-0.22,-0.98,0.98,0.57,-0.57,0.32,-0.23,-0.21
1,0.4,-0.92,-0.39,1.58,1.94,0.13,-0.29,0.76,0.76,-0.44,...,-0.28,0.22,-0.22,1.02,-1.02,0.57,-0.57,0.32,-0.23,-0.21
2,-1.81,-1.42,-0.16,-1.02,-1.7,-0.5,-0.45,-0.92,-0.27,-0.6,...,-0.28,0.22,-0.22,-0.98,0.98,0.57,-0.57,-3.1,-0.23,4.68
3,0.27,0.58,-0.52,-0.29,0.87,-0.58,-0.66,0.22,-1.13,0.05,...,-0.28,0.22,-0.22,1.02,-1.02,0.57,-0.57,0.32,-0.23,-0.21
4,-0.38,0.96,-0.31,-0.38,-1.67,0.49,-0.37,0.21,0.2,-0.51,...,-0.28,0.22,-0.22,-0.98,0.98,0.57,-0.57,0.32,-0.23,-0.21


In [21]:
# Encode the target labels as integer class ids: C -> 0, CL -> 1, D -> 2.
Y = train_status.map({"C": 0, "CL": 1, "D": 2})
# Series.map yields NaN for any label missing from the dict; fail here with a
# clear message instead of passing NaN targets on to the model.
assert Y.notna().all(), "unexpected Status label(s) not covered by the mapping"
Y

0       2
1       0
2       2
3       0
4       0
       ..
7900    0
7901    0
7902    2
7903    2
7904    0
Name: Status, Length: 7905, dtype: int64

In [30]:
# Fixed seed so the fold assignment, the CV score and the fitted forest are
# the same on every Restart & Run All.
SEED = 42

rf = RandomForestClassifier(criterion='log_loss', random_state=SEED)
# One repeat of 10 stratified folds.
kfold = RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=SEED)

# Scores are negated log-loss values (one per fold); higher is better.
rfcvs = cross_val_score(rf, train_dn, Y, scoring='neg_log_loss', cv=kfold, n_jobs=-1)

# Fit on the full training set for the test-set predictions made later.
rf.fit(train_dn, Y)

In [48]:
# Flip the sign of the mean neg_log_loss score to report a plain log loss.
-rfcvs.mean()

0.5428967490777389

In [33]:
# Class-probability predictions for the scaled test features.
# NOTE(review): column order presumably follows rf.classes_ (0, 1, 2, i.e.
# C, CL, D under the target encoding used for Y) — confirm before relabelling.
preds = rf.predict_proba(test_dn)
preds

array([[0.66, 0.03, 0.31],
       [0.71, 0.16, 0.13],
       [0.08, 0.03, 0.89],
       ...,
       [0.94, 0.04, 0.02],
       [0.99, 0.  , 0.01],
       [0.43, 0.06, 0.51]])

In [43]:
# Work on a copy so the SAMPLE_SUB template itself is not modified.
sub_1 = SAMPLE_SUB.copy()

In [44]:
# Label the probability columns with the status codes they correspond to.
preds_1 = pd.DataFrame(preds, columns=["C", "CL", "D"])

# Use item assignment rather than attribute assignment: `sub_1.Status_C = ...`
# only updates a column that already exists and would otherwise just set a
# plain attribute. Assigning the raw values also avoids index alignment
# between the two frames.
for label in ("C", "CL", "D"):
    sub_1[f"Status_{label}"] = preds_1[label].to_numpy()

In [47]:
submission_path = Path("./submissions/submission1.csv")
# to_csv does not create missing directories, so make sure the target folder
# exists before writing.
submission_path.parent.mkdir(parents=True, exist_ok=True)
sub_1.to_csv(submission_path, index=False)

In [5]:
def mil(n=100_000_000):
    """Sum the integers ``0 .. n-1`` with a plain Python loop.

    Pure-Python baseline for timing comparisons.

    Parameters
    ----------
    n : int, optional
        Number of loop iterations. Defaults to 100,000,000, the count that
        was previously hard-coded.

    Returns
    -------
    int
        The accumulated total, so the loop's work is observable by the
        caller instead of being discarded.
    """
    a = 0
    for i in range(n):
        a += i
    return a

@jit(nopython=True)
def fast(n=100_000_000):
    """Sum the integers ``0 .. n-1``; same loop as ``mil`` but numba-compiled.

    Parameters
    ----------
    n : int, optional
        Number of loop iterations. Defaults to 100,000,000, the count that
        was previously hard-coded.

    Returns
    -------
    int
        The accumulated total. Returning it keeps the loop's result in use;
        NOTE(review): without a return the compiler may be free to drop the
        loop entirely, which would make any timing meaningless — confirm.
    """
    a = 0
    for i in range(n):
        a += i
    return a