In [87]:
import pandas as pd
import numpy as np
from pandas_schema import Column, Schema
from pandas_schema.validation import CustomElementValidation

In [30]:
# Location of the CSV file to validate, relative to the notebook's working directory.
PATH = './final2z.csv'
# read_csv already returns a DataFrame, so no extra pd.DataFrame(...) wrapper is needed.
# index_col=False tells pandas not to use the first column as the index.
train_data = pd.read_csv(PATH, index_col=False)

In [4]:
def is_int(value):
    """Return True when ``value`` is an integer.

    Both built-in ``int`` objects and NumPy integer scalars (e.g. ``np.int64``)
    are accepted, so the check works whether an element arrives as a Python
    object or as a NumPy scalar.

    Parameters
    ----------
    value : object
        A single cell value to test.

    Returns
    -------
    bool
        True for ``int`` / ``np.integer`` instances, False otherwise.
    """
    return isinstance(value, (int, np.integer))

In [5]:
def is_string(value):
    """Return True when ``value`` is a ``str`` instance, otherwise False."""
    value_is_text = isinstance(value, str)
    return value_is_text

In [6]:
def is_float(value):
    """Tell whether ``value`` is an instance of the built-in ``float`` type."""
    if isinstance(value, float):
        return True
    return False

In [7]:
# Single-rule list: an element fails with 'is not integer' when is_int() rejects it.
# Kept as a list so it can be concatenated with other rule lists using `+`.
int_validation = [
    CustomElementValidation(lambda cell: is_int(cell), 'is not integer'),
]

In [8]:
# Single-rule list: an element fails with 'is not string' when is_string() rejects it.
# Kept as a list so it can be concatenated with other rule lists using `+`.
string_validation = [
    CustomElementValidation(lambda cell: is_string(cell), 'is not string'),
]

In [9]:
# Single-rule list: an element fails with 'is not float' when is_float() rejects it.
# Kept as a list so it can be concatenated with other rule lists using `+`.
float_validation = [
    CustomElementValidation(lambda cell: is_float(cell), 'is not float'),
]

In [10]:
# Single-rule list: an element fails with 'cannot be null' when pd.notnull() is False for it.
null_validation = [
    CustomElementValidation(lambda cell: pd.notnull(cell), 'cannot be null'),
]

In [11]:
# Lower bound for horsepower: an element passes only when it is strictly
# greater than 40 (a value of exactly 40 is reported as an error too).
hp_min_validation = [
    CustomElementValidation(lambda hp: hp > 40, 'cannot be less than 40'),
]

In [12]:
# Upper bound for horsepower: an element passes only when it is strictly
# below 800 (a value of exactly 800 is reported as an error too).
hp_max_validation = [
    CustomElementValidation(lambda hp: hp < 800, 'cannot be greater than 800'),
]

In [13]:
# Upper bound for mileage: an element passes only when it is strictly below
# 300000. The message describes the failure, i.e. the value is too large
# (it previously said "less than", contradicting the check).
mileage_max_validation = [
    CustomElementValidation(lambda mileage: mileage < 300000, 'cannot be greater than 300000'),
]

In [14]:
# Upper bound for price: an element passes only when it is strictly below 5000000.
price_max_validation = [
    CustomElementValidation(lambda price: price < 5000000, 'cannot be greater than 5000 000'),
]

In [15]:
# Lower bound for price: an element passes only when it is strictly above 4000.
price_min_validation = [
    CustomElementValidation(lambda price: price > 4000, 'cannot be less than 4000'),
]

In [16]:
def validate_data(df, schema):
    """Run ``schema.validate`` on ``df`` and hand back whatever it returns.

    Parameters
    ----------
    df : object
        The data to validate; passed straight through to ``schema.validate``.
    schema : object
        Any object exposing a ``validate(df)`` method.

    Returns
    -------
    The result of ``schema.validate(df)``, unchanged.
    """
    return schema.validate(df)

In [17]:
# Validation schema: one Column per expected field, each built by joining the
# reusable single-rule lists with `+` (list concatenation).
# Every column requires non-null values; the remaining rules check the element
# type and, for price / milage / hp, the allowed numeric range.
schema = Schema([
    Column('brand', null_validation + string_validation),
    Column('gear', null_validation + string_validation),
    Column('model', null_validation + string_validation),
    # All price rules must be joined with `+`. Passing price_min_validation as
    # a separate positional argument would bind it to Column's next parameter
    # (allow_empty in pandas_schema) instead of adding it to the validations,
    # so the minimum-price rule would never be checked.
    Column('price', null_validation + int_validation + price_max_validation + price_min_validation),
    Column('fuel', null_validation + string_validation),
    # NOTE(review): 'milage' has to match the CSV header exactly — confirm the spelling is intentional.
    Column('milage', null_validation + int_validation + mileage_max_validation),
    Column('hp', null_validation + float_validation + hp_min_validation + hp_max_validation),
    Column('type', null_validation + string_validation),
    Column('geo', null_validation + string_validation),
    Column('model_year', null_validation + int_validation),
])

In [18]:
# Validate the loaded frame against the schema and keep whatever validate_data returns.
errors = validate_data(df=train_data, schema=schema)

In [429]:
# errors_index = [e.row for e in errors]
# valid_data = train_data.drop(index=errors_index)
# pd.DataFrame({'errors':errors}).to_csv('errors.csv')
# valid_data.to_csv('valid_data.csv')