In [None]:
!pip install ipython-autotime
%load_ext autotime

# Creating Pipeline

In [None]:
## System info
# List the compute devices TensorFlow can see on this runtime
# (the recorded run shows one CPU and one Tesla V100 GPU).
from tensorflow.python.client import device_lib
device_lib.list_local_devices()

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 4854769104786567707, name: "/device:GPU:0"
 device_type: "GPU"
 memory_limit: 15469771520
 locality {
   bus_id: 1
   links {
   }
 }
 incarnation: 4388253141469529268
 physical_device_desc: "device: 0, name: Tesla V100-SXM2-16GB, pci bus id: 0000:00:04.0, compute capability: 7.0"]

time: 4.96 s


In [2]:
# loading library
import joblib
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_absolute_error

time: 837 ms


In [3]:
# Mount Google Drive so the notebook can reach the files stored there.
from google.colab import drive
drive.mount('/content/drive')

# Make the project folder the working directory; the relative file names used
# in later cells (CSV files, the pickled model) are resolved against it.
%cd /content/drive/My Drive/Applied AI Course/Assignments/23. Self Case Study 1

Mounted at /content/drive
/content/drive/My Drive/Applied AI Course/Assignments/23. Self Case Study 1
time: 22 s


In [4]:
# Load the raw train and test datasets.
# The relative paths resolve against the working directory set by the %cd cell.
Train_Data = pd.read_csv('train.csv')
Test_Data = pd.read_csv('test.csv')

time: 6.74 s


In [None]:
def preprocessing(Train_Test):
  '''
    Fill missing values in the feature columns with each column's most frequent value.

    Feature columns are the ones whose name contains the letter 'c'; any other
    column (for example 'loss') is left untouched. The frame is modified in
    place and the same object is returned.

    Train_Test : pandas dataframe
    return : pandas dataframe (the input frame, with NaNs in the feature columns filled)
    '''

  feature_columns = [feature for feature in Train_Test.columns if 'c' in feature]
  for column in feature_columns:
    # Only touch columns that actually contain missing values.
    if not Train_Test[column].isnull().any():
      continue
    most_frequent = Train_Test[column].mode(dropna=True)
    # A column made up entirely of NaN has no mode; leave it alone instead of failing.
    if most_frequent.empty:
      continue
    # mode() returns its values sorted, so ties resolve to the smallest value.
    Train_Test[column] = Train_Test[column].fillna(most_frequent.iloc[0])
  return Train_Test

time: 2.54 ms


In [None]:
def featurization(Train_Data,Test_Data,Train_Test):
  '''
    Integer-encode the categorical columns of the combined train+test frame.

    For every column whose name starts with 'cat', classes that occur in only
    one of Train_Data / Test_Data are replaced with NaN in Train_Test, and the
    column is then label-encoded with pd.factorize(sort=True). NaN is encoded
    as -1, so all one-sided classes share that single code.
    Train_Test is modified in place and also returned.

    Train_Data : pandas dataframe (train split, used only to collect its classes)
    Test_Data : pandas dataframe (test split, used only to collect its classes)
    Train_Test : pandas dataframe (train and test rows stacked together)
    return : pandas dataframe (Train_Test with encoded 'cat*' columns)
    '''
  cat_feature = [n for n in Train_Data.columns if n.startswith('cat')]
  for column in cat_feature:
    classes_train = set(Train_Data[column].unique())
    classes_test = set(Test_Data[column].unique())
    # Compare the class sets themselves, not their sizes: two splits can have the
    # same number of classes and still disagree on which classes they contain.
    one_sided = classes_train.symmetric_difference(classes_test)
    # NOTE(review): this encoding must match the one used when the saved model was
    # trained - confirm the training code applies the same rule.
    if one_sided:
      # Replace one-sided classes with a common "missing" marker instead of removing rows.
      Train_Test[column] = Train_Test[column].where(~Train_Test[column].isin(one_sided))

    Train_Test[column] = pd.factorize(Train_Test[column].values, sort=True)[0]
  return Train_Test

time: 11.1 ms


In [None]:
def final_Data(Train_Data,Test_Data):
  '''
    Build the final train and test frames by running preprocessing and
    featurization on the stacked train+test data.

    The inputs are not modified, so the function can be called repeatedly on
    the same raw frames.

    Train_Data : pandas dataframe (raw train data, must contain an 'id' column)
    Test_Data : pandas dataframe (raw test data, must contain an 'id' column)
    return : (Train_Data_final, Test_Data_final) pandas dataframes; the test
             frame carries a 'loss' column filled with NaN
    '''

  # Work on copies: dropping/adding columns on the caller's frames would make a
  # second call fail with a KeyError on the already-removed 'id' column.
  train = Train_Data.drop(columns=['id'])
  test = Test_Data.drop(columns=['id']).assign(loss=np.nan)
  n_train = len(train)
  Train_Test = pd.concat((train, test)).reset_index(drop=True)

  # preprocessing
  Train_Test_final = preprocessing(Train_Test)

  # Featurization
  Train_Test_ = featurization(train, test, Train_Test_final)

  # Split by position rather than by whether 'loss' is null, so a train row
  # with a missing loss cannot end up in the test frame.
  Train_Data_final = Train_Test_.iloc[:n_train]
  Test_Data_final = Train_Test_.iloc[n_train:]
  return Train_Data_final,Test_Data_final

time: 5.5 ms


In [None]:
# Run the full preprocessing + featurization pipeline on the raw frames
# (about 18 s in the recorded run).
Train_Data_final,Test_Data_final = final_Data(Train_Data,Test_Data)

time: 18.4 s


In [None]:
# saving csv to disk
Train_Data_final.to_csv('Train_Data_final.csv', index=False)
Test_Data_final.to_csv('Test_Data_final.csv', index=False)

time: 10.1 s


In [6]:
def predict(data, model_path='allstateserevity.pkl'):
  '''
    Predict with the saved model for one or more observations.

    data : pandas dataframe of processed observations; a 'loss' column is
           dropped if present and may be absent (e.g. for unseen data)
    model_path : path of the pickled model to load (defaults to the model
                 saved next to this notebook)
    return : numpy array with one model output per observation

    NOTE(review): in the recorded run the outputs are around 7-9 while the true
    losses are in the hundreds to thousands, so the model presumably predicts a
    transformed (e.g. log-scale) target - confirm against the training code and
    invert the transform before reporting claim amounts.
    '''
  # errors='ignore' lets the function score frames that carry no target column.
  features = data.drop(columns=['loss'], errors='ignore')
  # joblib.load unpickles the file, which can execute arbitrary code:
  # only load model files from a trusted source.
  clf = joblib.load(model_path)
  return clf.predict(xgb.DMatrix(features))

time: 2.36 ms


In [5]:
def mae(data, labels):
  '''
    Compute the mean absolute error of the saved model on the given observations.

    data : pandas dataframe with one or more processed observations
    labels : ground truth values, one per observation
    return : a single float, the MAE averaged over all observations

    NOTE(review): the labels passed in this notebook are raw 'loss' values while
    the recorded predictions are around 7-9, so this MAE presumably compares two
    different scales - confirm the target transform used in training.
    '''

  # Reuse predict() so the feature preparation and model loading live in one place.
  pred = predict(data)
  return mean_absolute_error(labels, pred)

time: 2.59 ms


## Predicting a Single Observation

In [None]:
# Draw one random row from the processed training data.
# No random_state is passed, so a re-run picks a different row.
sampled_train = Train_Data_final.sample(1)
sampled_train

Unnamed: 0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,cat10,cat11,cat12,cat13,cat14,cat15,cat16,cat17,cat18,cat19,cat20,cat21,cat22,cat23,cat24,cat25,cat26,cat27,cat28,cat29,cat30,cat31,cat32,cat33,cat34,cat35,cat36,cat37,cat38,cat39,cat40,...,cat92,cat93,cat94,cat95,cat96,cat97,cat98,cat99,cat100,cat101,cat102,cat103,cat104,cat105,cat106,cat107,cat108,cat109,cat110,cat111,cat112,cat113,cat114,cat115,cat116,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,cont14,loss
158572,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,2,3,2,4,0,0,1,8,0,0,0,7,7,4,3,7,29,96,2,8,7,5,6,223,0.467114,0.681761,0.57136,0.24439,0.281143,0.490651,0.993953,0.33372,0.42289,0.60401,0.797841,0.785706,0.189489,0.237754,820.23


time: 94.1 ms


In [None]:
# Score the sampled row; predict() drops the 'loss' column before building the model input.
predict(sampled_train)

array([8.000027], dtype=float32)

time: 655 ms


In [None]:
# Absolute error of the prediction against the row's true 'loss'.
# NOTE(review): in the recorded run the prediction is ~8.0 and the true loss is
# 820.23, so the reported error (~812.23) is almost the loss itself. The model
# presumably outputs a transformed target that is never inverted - confirm.
mae(sampled_train,sampled_train['loss'])

812.2299732971192

time: 245 ms


## Predicting Multiple Observations

In [9]:
# Reload the processed training data from the CSV written earlier, so this
# section does not need the pipeline to be re-run in the current session.
Train_Data_final = pd.read_csv("Train_Data_final.csv")

time: 1.95 s


In [10]:
# Draw 15 random rows from the processed training data.
# No random_state is passed, so a re-run picks different rows.
sampled_train = Train_Data_final.sample(15)
sampled_train

Unnamed: 0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,cat10,cat11,cat12,cat13,cat14,cat15,cat16,cat17,cat18,cat19,cat20,cat21,cat22,cat23,cat24,cat25,cat26,cat27,cat28,cat29,cat30,cat31,cat32,cat33,cat34,cat35,cat36,cat37,cat38,cat39,cat40,...,cat92,cat93,cat94,cat95,cat96,cat97,cat98,cat99,cat100,cat101,cat102,cat103,cat104,cat105,cat106,cat107,cat108,cat109,cat110,cat111,cat112,cat113,cat114,cat115,cat116,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,cont14,loss
68489,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,...,4,3,3,3,4,2,0,12,8,0,0,0,5,6,6,5,3,29,38,2,11,0,0,9,150,0.477231,0.488789,0.762059,0.208655,0.281143,0.435733,0.680421,0.26847,0.41471,0.47779,0.672085,0.658814,0.30435,0.463392,1710.07
1346,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,3,3,3,1,2,0,12,11,3,0,0,3,4,9,6,1,29,53,0,35,37,0,10,25,0.376862,0.785784,0.54977,0.412789,0.643315,0.257876,0.303438,0.49462,0.34555,0.35944,0.24541,0.241676,0.318646,0.390144,391.71
88965,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,...,0,3,3,3,4,2,0,12,6,0,0,0,4,4,8,5,1,29,38,4,20,3,0,10,63,0.478678,0.245921,0.440642,0.267727,0.422268,0.290374,0.381301,0.26847,0.39447,0.37493,0.275431,0.270746,0.310026,0.765057,4514.16
132559,0,1,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,3,1,2,4,0,3,15,14,3,0,0,10,5,5,10,10,29,94,2,50,57,8,15,251,0.703859,0.422197,0.26357,0.62377,0.534484,0.874303,0.675139,0.64577,0.64296,0.79504,0.644013,0.785706,0.862949,0.427622,5858.5
128986,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,...,0,3,3,2,6,4,3,15,9,0,0,2,4,3,7,11,10,29,94,0,30,57,0,15,94,0.281338,0.620805,0.484196,0.789639,0.281143,0.480957,0.486584,0.87954,0.32128,0.41743,0.291268,0.286079,0.663739,0.823007,4533.32
168928,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,2,3,4,4,6,4,13,8,0,0,0,6,5,6,8,5,29,34,4,46,36,0,13,155,0.629339,0.488789,0.54977,0.554673,0.82168,0.625784,0.449493,0.61229,0.64873,0.74396,0.569745,0.55738,0.689974,0.29546,1027.5
45545,0,1,0,1,1,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,...,0,3,1,2,4,4,2,15,1,13,0,1,4,3,9,8,6,29,94,2,46,57,0,14,63,0.451274,0.245921,0.317666,0.753703,0.281143,0.326207,0.243696,0.37194,0.4916,0.32935,0.231253,0.227963,0.566274,0.297178,3658.63
140333,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,...,4,3,1,3,4,0,0,12,5,0,2,0,6,6,5,4,1,58,117,6,1,48,0,8,159,0.59912,0.358319,0.57136,0.554673,0.281143,0.46425,0.587462,0.40603,0.521,0.59869,0.644013,0.630853,0.284958,0.629746,3033.25
62415,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,3,1,2,1,6,2,15,11,5,0,0,11,4,4,7,6,29,101,0,46,57,0,16,208,0.960283,0.358319,0.230975,0.768556,0.491114,0.953118,0.633426,0.47669,0.97621,0.8351,0.853569,0.843026,0.907207,0.296406,1945.7
20913,0,1,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,4,3,3,3,4,4,0,12,14,8,0,0,4,6,7,6,1,29,98,0,30,33,0,11,90,0.351358,0.620805,0.875767,0.29243,0.902259,0.340845,0.544575,0.48864,0.32128,0.28677,0.546666,0.534677,0.345247,0.826986,5133.6


time: 132 ms


In [11]:
# Score all sampled rows in one call; the result holds one value per row.
predict(sampled_train)

array([7.8227715, 7.284207 , 8.320862 , 8.542786 , 7.7198963, 7.3475814,
       8.268406 , 7.9695573, 7.571107 , 8.540485 , 8.654859 , 7.2890344,
       8.0878525, 7.9571137, 7.900096 ], dtype=float32)

time: 1.56 s


In [13]:
# Mean absolute error over the sampled rows, measured against their true 'loss'.
# NOTE(review): the recorded predictions are around 7-9 while losses are in the
# thousands, so this figure presumably mixes two scales - confirm the target transform.
mae(sampled_train,sampled_train['loss'])

2854.0242257207233

time: 283 ms
