In [32]:
# Check the versions of libraries

# Python version
import sys

# scipy
import scipy

# numpy
import numpy as np

# matplotlib
import matplotlib

# pandas
import pandas as pd
from pandas.plotting import scatter_matrix

# matplotlib
import matplotlib.pyplot as plt

# time
import time

# scikit-learn
import sklearn

# Load libraries
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

In [33]:
df_train = pd.read_csv("new/trainingSet.csv")
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 147229 entries, 0 to 147228
Data columns (total 4 columns):
decision_date         147229 non-null object
case_received_date    147229 non-null object
case_status           147229 non-null int64
row ID                147229 non-null int64
dtypes: int64(2), object(2)
memory usage: 4.5+ MB


In [34]:
# convert date string datatype to date/time datatype
df_train['case_received_date'] = pd.to_datetime(df_train['case_received_date'], format='%d/%m/%Y')
df_train['decision_date'] = pd.to_datetime(df_train['decision_date'], format='%d/%m/%Y')

# convert date/time datatype to epoch unix numeric datatype
df_train["case_received_date"] = pd.to_numeric(df_train["case_received_date"])/1000000000
df_train["decision_date"] = pd.to_numeric(df_train["decision_date"])/1000000000
# show float numeric as int 
pd.options.display.float_format = '{:.0f}'.format

df_train

Unnamed: 0,decision_date,case_received_date,case_status,row ID
0,1468800000,1460592000,0,293694
1,1464912000,1454544000,0,226196
2,1464912000,1448841600,0,358496
3,1477267200,1471219200,0,300774
4,1437004800,1394064000,0,29121
5,1455235200,1442880000,1,123549
6,1453852800,1440460800,1,142077
7,1462147200,1451520000,0,242501
8,1461715200,1439251200,0,50082
9,1442793600,1426809600,0,89959


In [35]:
# Split training and testing dataset
dataset = df_train.values

# first 2 columns: case_received_date and decision_date
X = dataset[:,0:2] 
Y = dataset[:,2]

# testing group 20%
validation_size = 0.20
seed = 7 # random seed
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(X, Y, test_size=validation_size, random_state=seed)

# Test options and evaluation metric from cross value score of model
scoring = 'accuracy'

# models from scikit-learn that gonna be used 
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
# models.append(('SVM', SVC()))

# evaluate each model
results = []
names = []

# print date/time of testing
print (time.ctime())

for name, model in models:
    kfold = model_selection.KFold(n_splits=10, random_state=seed)
    cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

Thu May 31 20:14:56 2018
LR: 0.528693 (0.006385)
LDA: 0.544815 (0.005277)
KNN: 0.955435 (0.001826)
CART: 0.952175 (0.001975)
NB: 0.590399 (0.004478)


In [36]:
df_test = pd.read_csv("new/TestingSet.csv")
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63234 entries, 0 to 63233
Data columns (total 4 columns):
row ID                63234 non-null int64
case_received_date    63233 non-null object
decision_date         63234 non-null object
case_status           0 non-null float64
dtypes: float64(1), int64(1), object(2)
memory usage: 1.9+ MB


In [37]:
# convert date string datatype to date/time datatype
df_test['case_received_date'] = pd.to_datetime(df_test['case_received_date'], format='%d/%m/%Y')
df_test['decision_date'] = pd.to_datetime(df_test['decision_date'], format='%d/%m/%Y')

# convert date/time datatype to epoch unix numeric datatype
df_test["case_received_date"] = pd.to_numeric(df_test["case_received_date"])/1000000000
df_test["decision_date"] = pd.to_numeric(df_test["decision_date"])/1000000000
df_test

Unnamed: 0,row ID,case_received_date,decision_date,case_status
0,341467,1437523200,1453334400,
1,12669,1463356800,1470009600,
2,354661,1420588800,1438214400,
3,76459,1449532800,1461888000,
4,149166,1418774400,1435795200,
5,237295,1429228800,1447977600,
6,7800,1454544000,1464912000,
7,290591,1449532800,1469750400,
8,369544,1441238400,1456272000,
9,326868,1423526400,1441238400,


In [38]:
#loading my train and prediction dataset
training_df = df_train
testing_df = df_test

#datasets that will be used for predict the case_status
prediction_data = ['case_received_date','decision_date']

#training model with K Nearest Neighbors Classifier
model = KNeighborsClassifier()

#set prediction data to dataset_column that will predict, and set target of prediction to case_status
training_data = training_df[prediction_data]
testing_data = testing_df[prediction_data]
prediction_target = training_df.case_status

#fitting model with prediction data
model.fit(training_data, prediction_target)

# put result of prediction to case_status column at testing dataframe
testing_df['case_status'] = model.predict(testing_data)

testing_df

Unnamed: 0,row ID,case_received_date,decision_date,case_status
0,341467,1437523200,1453334400,1
1,12669,1463356800,1470009600,0
2,354661,1420588800,1438214400,0
3,76459,1449532800,1461888000,0
4,149166,1418774400,1435795200,0
5,237295,1429228800,1447977600,1
6,7800,1454544000,1464912000,0
7,290591,1449532800,1469750400,0
8,369544,1441238400,1456272000,1
9,326868,1423526400,1441238400,0
