In [100]:
from sklearn.linear_model import *
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
import impyute as impy

import numpy as np
import pandas as pd

# Load the raw dataset into a DataFrame; the path is relative to the current working directory.
data_file = pd.read_csv("../TAMU_FINAL_DATASET_2018.csv")

In [77]:
def catogrical_var_onehot(data_file):
    """One-hot encode every column of ``data_file`` that is not float64/int64.

    Each selected column is expanded with ``pd.get_dummies`` using the column
    name as prefix, and the resulting indicator frames are joined side by side.

    Parameters
    ----------
    data_file : pandas.DataFrame
        Input table; columns with dtype float64 or int64 are ignored.

    Returns
    -------
    pandas.DataFrame
        Indicator columns named ``<column>_<value>``. When no column needs
        encoding, an empty frame carrying the index of ``data_file`` is
        returned so it can still be concatenated column-wise.
    """
    categorical_data = data_file.select_dtypes(exclude=['float64', 'int64'])
    # Collect all indicator frames first and concatenate once; growing a frame
    # with pd.concat inside the loop copies the accumulated data on every step.
    dummy_frames = [
        pd.get_dummies(categorical_data[column], prefix=column)
        for column in categorical_data.columns
    ]
    if not dummy_frames:
        # pd.concat raises on an empty list, so handle "nothing to encode" here.
        return pd.DataFrame(index=data_file.index)
    return pd.concat(dummy_frames, axis=1)

def numerical_var(data_file):
    """Return only the float64 and int64 columns of ``data_file``."""
    return data_file.select_dtypes(include=['float64', 'int64'])

def fill_na(df):
    """Impute missing values with ``impy.mice`` and return a labelled frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose values can all be cast to float64.

    Returns
    -------
    pandas.DataFrame
        The array returned by ``impy.mice``, wrapped with the same column
        names and the same row index as ``df``.
    """
    # Cast explicitly: a frame mixing float and bool/other dtypes would yield
    # an object array from ``.values``. ``astype`` also returns a fresh array,
    # so the imputer never operates on the caller's underlying data.
    matrix = df.values.astype('float64')
    imputed = impy.mice(matrix)
    # Re-attach both axes' labels so rows stay aligned with the input frame.
    return pd.DataFrame(imputed, index=df.index, columns=df.columns)

# Place the numeric columns and the one-hot encoded non-numeric columns side by side,
# then replace the combined frame with the output of fill_na (imputation via impyute's mice).
whole_df = pd.concat([numerical_var(data_file), catogrical_var_onehot(data_file)], axis=1)
whole_df = fill_na(whole_df)

In [78]:
def balanced_data(df, random_state=42):
    """Return a subset of ``df`` with equally many AMI_FLAG == 1 and == 0 rows.

    The larger class is randomly down-sampled to the size of the smaller one;
    the smaller class is kept in full. (For the 2018 dataset this gives about
    2700 rows per class.)

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing an ``AMI_FLAG`` column with values 1 and 0.
    random_state : int or None, default 42
        Seed forwarded to ``DataFrame.sample`` so the subset is reproducible.
        Pass ``None`` for a different sample on every call.

    Returns
    -------
    pandas.DataFrame
        AMI_FLAG == 1 rows followed by AMI_FLAG == 0 rows, original index kept.
    """
    df_AMI = df[df['AMI_FLAG'] == 1]
    df_NOAMI = df[df['AMI_FLAG'] == 0]
    n_per_class = min(len(df_AMI), len(df_NOAMI))
    # Only the class that exceeds the target size is sampled; at most one
    # of these two branches can run.
    if len(df_NOAMI) > n_per_class:
        df_NOAMI = df_NOAMI.sample(n_per_class, random_state=random_state)
    if len(df_AMI) > n_per_class:
        df_AMI = df_AMI.sample(n_per_class, random_state=random_state)
    return pd.concat([df_AMI, df_NOAMI], axis=0)

# Balance the imputed feature table. `whole_df` is the frame built in the
# preprocessing cell; the bare name `df` is not defined by any visible cell.
balanced_df = balanced_data(whole_df)

# Shape of the AMI_FLAG == 1 slice: (positive rows, total columns).
balanced_df[balanced_df['AMI_FLAG'] == 1].shape

(2726, 469)

In [85]:
def train_test_set(df):
    """Split ``df`` into train/test features and labels.

    ``AMI_FLAG`` is the label; every other column is a feature. The split
    holds out 33% of the rows with a fixed ``random_state`` of 42.

    Returns a dict with the keys ``'X_train'``, ``'X_test'``, ``'y_train'``
    and ``'y_test'``.
    """
    target = df['AMI_FLAG']
    features = df.loc[:, df.columns != 'AMI_FLAG']
    pieces = train_test_split(features, target, test_size=0.33, random_state=42)
    # train_test_split returns the four pieces in exactly this order.
    return dict(zip(('X_train', 'X_test', 'y_train', 'y_test'), pieces))

# Build train/test dictionaries for both the full frame and the class-balanced frame.
whole_suite = train_test_set(whole_df)
balanced_suite = train_test_set(balanced_df)

# Sanity check: (rows, feature columns) of the balanced training features.
print(balanced_suite['X_train'].shape)

(3652, 468)


In [108]:
def logistic_model(suite):
    """Fit an L1-regularised logistic regression on the training split.

    Parameters
    ----------
    suite : dict
        Must provide ``'X_train'`` and ``'y_train'``.

    Returns
    -------
    The value returned by ``LogisticRegression.fit`` (the fitted estimator).
    """
    # The L1 penalty needs a solver that supports it. 'liblinear' does, whereas
    # the default solver in scikit-learn >= 0.22 ('lbfgs') rejects penalty='l1'.
    lm = LogisticRegression(penalty='l1', solver='liblinear')
    return lm.fit(suite['X_train'], suite['y_train'])
    
def rf_model(suite, random_state=42):
    """Fit a random forest classifier on the training split.

    Parameters
    ----------
    suite : dict
        Must provide ``'X_train'`` and ``'y_train'``.
    random_state : int or None, default 42
        Seed forwarded to ``RandomForestClassifier`` so repeated runs build
        the same forest. Pass ``None`` for an unseeded forest.

    Returns
    -------
    The value returned by ``RandomForestClassifier.fit`` (the fitted estimator).
    """
    clf = RandomForestClassifier(n_estimators=300,
                                 min_samples_leaf=2,
                                 min_samples_split=10,
                                 max_depth=10,
                                 max_features='log2',
                                 criterion='entropy',
                                 random_state=random_state)
    return clf.fit(suite['X_train'], suite['y_train'])
    
def test_on(X, model, y):
    pred = model.predict(X)
    accuracy = accuracy_score(y, pred)
    print("Accuracy is: " + str(accuracy))
    return

In [109]:
# Fit both models on the class-balanced training split.
# NOTE(review): log_model is fitted here but not evaluated in this cell.
log_model = logistic_model(balanced_suite)
# NOTE(review): this rebinds the name `rf_model` from the factory function to the
# fitted estimator, so running this cell a second time (without re-running the
# cell that defines the function) fails with "object is not callable".
rf_model = rf_model(balanced_suite)
# Report random-forest accuracy on the train and test splits of the full data set.
# NOTE(review): whole_suite and balanced_suite are split independently, so rows
# used for fitting may also appear in whole_suite['X_test'] — confirm whether
# that overlap is acceptable for the reported numbers.
test_on(whole_suite['X_train'], rf_model, whole_suite['y_train'])
test_on(whole_suite['X_test'], rf_model, whole_suite['y_test'])


Accuracy is: 0.746089552238806
Accuracy is: 0.7402424242424243
