# Transforming Validation Data to Match Training Data

### Loading Data and Dependencies

In [1]:
import pandas as pd
import numpy as np

# Options shared by every CSV read in this cell.
CSV_OPTIONS = dict(encoding='ISO-8859-1', low_memory=False)

# Final (already processed) training set, indexed by ticket_id.
train_fnl = (
    pd.read_csv('Processed_DataSets/Final_Train.csv', index_col=0, **CSV_OPTIONS)
    .set_index('ticket_id')
)

# Raw validation set.
val = pd.read_csv('Orignal_DataSet/validation.csv', **CSV_OPTIONS)

# Raw (initial) training set.
train = pd.read_csv('Orignal_DataSet/train.csv', **CSV_OPTIONS)

## Transformation Functions

In [2]:
def get_violator_feature(df):
    """Add ``len_violator`` and ``violator_type`` columns from ``violator_name``.

    ``len_violator`` is the character length of ``str(violator_name)``, or 0
    when that text contains one of the company markers ``Inc``, ``L.L.C``,
    ``INC`` or ``LLC``. ``violator_type`` is ``'Agency'`` when the length is 0
    or greater than 20, and ``'Person'`` otherwise.

    The frame is modified in place and also returned.
    """
    company_markers = ('Inc', 'L.L.C', 'INC', 'LLC')

    def name_length(raw_name):
        # str() is applied first, so non-string values are measured by
        # their string form.
        text = str(raw_name)
        if any(marker in text for marker in company_markers):
            return 0
        return len(text)

    def classify(length):
        if length == 0 or length > 20:
            return 'Agency'
        return 'Person'

    df['len_violator'] = df['violator_name'].apply(name_length)
    df['violator_type'] = df['len_violator'].apply(classify)
    return df

def get_discription_feature(df):
    """Add ``len_description`` and ``count_violation`` columns.

    Both columns are derived from ``violation_description``:

    * ``len_description`` - character length of the description.
    * ``count_violation`` - number of comma-separated parts, i.e. the number
      of commas plus one.

    A missing description produces NaN in both columns instead of raising.

    The frame is modified in place and also returned.
    """
    description = df['violation_description']
    # Length of the violation text.
    df['len_description'] = description.str.len()
    # Number of violations: one more than the number of commas. The
    # vectorised ``.str`` accessor propagates missing values as NaN (as
    # ``.str.len()`` does above) rather than failing on ``NaN.split``.
    df['count_violation'] = description.str.count(',') + 1
    return df

def get_disposition_features(df):
    """Add ``responsible_by`` and ``fine_waived`` columns from ``disposition``.

    ``responsible_by`` is 0 when the disposition contains ``'Not responsible'``,
    -1 when it contains ``'PENDING'``, and otherwise the last
    whitespace-separated word of the disposition, with the truncated labels
    ``Deter``/``Determi``/``Admis`` expanded to their full form.
    ``fine_waived`` is 1 when the disposition contains ``'Fine Waived'``,
    else 0.

    The frame is modified in place and also returned.
    """
    truncated_labels = {
        'Deter': 'Determination',
        'Determi': 'Determination',
        'Admis': 'Admission',
    }

    def reason_of_guilt(disposition):
        if 'Not responsible' in disposition:
            return 0
        if 'PENDING' in disposition:
            return -1
        return disposition.split()[-1]

    df['responsible_by'] = df['disposition'].apply(reason_of_guilt)

    # Correct labels that appear cut short in the data.
    for short_label, full_label in truncated_labels.items():
        df.loc[df['responsible_by'] == short_label, 'responsible_by'] = full_label

    # Flag whether the fine was waived.
    df['fine_waived'] = df['disposition'].apply(lambda text: int('Fine Waived' in text))
    return df

def get_date_time_feature(df):
    """Add ``ticket_time`` and ``month_bin`` columns from ``ticket_issued_date``.

    ``ticket_time`` is the integer formed by joining the first two
    colon-separated fields of the text from character 11 onward (hours and
    minutes for a ``YYYY-MM-DD HH:MM:SS`` string, e.g. ``13:45`` -> 1345).
    ``month_bin`` maps the two-digit month at characters 5-6 to 1 (months
    1-3), 2 (months 4-6), 3 (months 7-9) or 4 (anything else).

    The frame is modified in place and also returned.
    """
    def hour_minute(stamp):
        pieces = stamp[11:].split(':')[:2]
        return int(''.join(pieces))

    def month_bin(stamp):
        month = int(stamp[5:7])
        if 1 <= month <= 3:
            return 1
        if 4 <= month <= 6:
            return 2
        if 7 <= month <= 9:
            return 3
        return 4

    # Time of day the ticket was issued.
    df['ticket_time'] = df['ticket_issued_date'].apply(hour_minute)
    # Three-month bin of the issue date.
    df['month_bin'] = df['ticket_issued_date'].apply(month_bin)
    return df

In [3]:
# Engineer the features on the validation set, then keep only the target,
# the ticket id, and the columns used further down.
kept_columns = [
    'compliance', 'ticket_id', 'count_violation', 'ticket_time', 'month_bin',
    'fine_amount', 'late_fee', 'discount_amount', 'judgment_amount',
    'len_violator', 'len_description', 'agency_name', 'country',
    'violator_type', 'responsible_by', 'fine_waived',
]

val = (
    val
    .pipe(get_date_time_feature)
    .pipe(get_discription_feature)
    .pipe(get_disposition_features)
    .pipe(get_violator_feature)
)
val = val[kept_columns]

In [4]:
# Rebuild, on the initial training data, only the engineered features that
# the scaler is fitted on. The date/time and disposition features are not
# among the six columns selected below, so computing them here would be
# discarded work.
train = get_discription_feature(train)  # adds len_description
train = get_violator_feature(train)     # adds len_violator

# Features to be normalized.
train = train[['fine_amount', 'late_fee', 'discount_amount', 'judgment_amount',
               'len_violator', 'len_description']]


## Normalization and Feature Selection

In [5]:
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training features only, so the validation set does
# not influence the min/max used for scaling (avoids data leakage).
scale = MinMaxScaler(feature_range=(0, 5))
scale.fit(train)

# Scale the matching validation columns, in the same order.
columns_to_scale = ['fine_amount', 'late_fee', 'discount_amount',
                    'judgment_amount', 'len_violator', 'len_description']
dummy = scale.transform(val[columns_to_scale])

In [6]:
# Replace the raw numeric columns with their scaled "<name>_N" versions.
scaled_columns = ['fine_amount', 'late_fee', 'discount_amount', 'judgment_amount',
                  'len_violator', 'len_description']
# Give the scaled frame val's own row labels so the column-wise concat pairs
# each row with its scaled values whatever index val carries.
dummy = pd.DataFrame(data=dummy, index=val.index,
                     columns=[name + '_N' for name in scaled_columns])
val = val.drop(scaled_columns, axis=1)
val = pd.concat([val, dummy], axis=1)
val = val.set_index('ticket_id')

# Store the numeric features as float32.
numeric_columns = ['count_violation', 'ticket_time', 'month_bin', 'fine_amount_N',
                   'late_fee_N', 'discount_amount_N', 'judgment_amount_N',
                   'len_violator_N', 'len_description_N']
val = val.astype({column: np.float32 for column in numeric_columns})

# Handle inf and -inf values: turn them into NaN and drop those rows together
# with any other incomplete row. ``replace`` returns a new frame, so the
# result has to be assigned back for the conversion to take effect.
val = val.replace([np.inf, -np.inf], np.nan)
val = val.dropna()

# One-hot encode the categorical columns. ``fine_waived`` is cast to object
# so it is encoded as a category too. The dtype is pinned to uint8 so the
# indicator columns are 0/1 integers regardless of the pandas version.
val['fine_waived'] = val['fine_waived'].astype(object)
val = pd.get_dummies(val, dtype=np.uint8)

## Saving DataSet for Validation of Selected Model

In [7]:
# Show the dtype of every column of the transformed validation frame.
val.dtypes

compliance                                                      int64
count_violation                                               float32
ticket_time                                                   float32
month_bin                                                     float32
fine_amount_N                                                 float32
late_fee_N                                                    float32
discount_amount_N                                             float32
judgment_amount_N                                             float32
len_violator_N                                                float32
len_description_N                                             float32
agency_name_Buildings, Safety Engineering & Env Department      uint8
agency_name_Department of Public Works                          uint8
agency_name_Detroit Police Department                           uint8
country_USA                                                     uint8
violator_type_Agency

In [8]:
# Write the transformed validation set to disk (the frame's index is written
# as the first column).
output_path = 'Processed_DataSets/Final_Validation_DataSet.csv'
val.to_csv(output_path)