In [None]:
# -*- coding: utf-8 -*-
"""
Created on Sun Nov 12 17:44:09 2017

@author: Kathan Sheth & Preyas Shah
"""

import matplotlib.pyplot as plt
from feature import *
import numpy as np
import sys
import csv
import pandas as pd
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from sklearn.metrics import precision_recall_fscore_support
from sklearn.ensemble import GradientBoostingClassifier
#from sklearn.grid_search import GridSearchCV
from sklearn.model_selection import GridSearchCV

# Script by https://www.kaggle.com/ogrellier
# Code: https://www.kaggle.com/ogrellier/python-target-encoding-for-categorical-features
def add_noise(series, noise_level):
    """Scale every value of ``series`` by a random factor close to 1.

    One standard-normal draw is taken per element (from NumPy's global
    random state) and each value is multiplied by
    ``1 + noise_level * draw``.  With ``noise_level == 0`` the values are
    returned unchanged.
    """
    draws = np.random.randn(len(series))
    factors = 1 + noise_level * draws
    return series * factors

def target_encode(trn_series=None,
                  tst_series=None,
                  target=None,
                  min_samples_leaf=1,
                  smoothing=1,
                  noise_level=0):
    """
    Replace a categorical feature by a smoothed per-category mean of the target.

    Smoothing is computed like in the following paper by Daniele Micci-Barreca
    https://kaggle2.blob.core.windows.net/forum-message-attachments/225952/7441/high%20cardinality%20categoricals.pdf

    trn_series : training categorical feature as a pd.Series
    tst_series : test categorical feature as a pd.Series (same name as trn_series)
    target : target data as a pd.Series (same length as trn_series)
    min_samples_leaf (int) : minimum samples to take category average into account
    smoothing (int) : smoothing effect to balance categorical average vs prior
    noise_level : scale of the multiplicative noise applied by add_noise
                  to both encoded series

    Returns a tuple (encoded training series, encoded test series).  Both are
    named "<feature name>_mean" and carry the index of the series they were
    built from.  Values with no matching category in the training data are
    encoded as the overall target mean.
    """
    assert len(trn_series) == len(target)
    assert trn_series.name == tst_series.name
    feature_name = trn_series.name
    target_name = target.name

    # Per-category mean of the target and number of observations
    paired = pd.concat([trn_series, target], axis=1)
    stats = paired.groupby(by=feature_name)[target_name].agg(["mean", "count"])

    # Sigmoid weight in (0, 1): grows with the category count, so frequent
    # categories rely on their own mean and rare ones fall back on the prior
    weight = 1 / (1 + np.exp(-(stats["count"] - min_samples_leaf) / smoothing))
    prior = target.mean()
    stats[target_name] = prior * (1 - weight) + stats["mean"] * weight
    stats.drop(["mean", "count"], axis=1, inplace=True)

    # Lookup table: one row per category with its blended value in 'average'
    lookup = stats.reset_index().rename(columns={'index': target_name, target_name: 'average'})

    def _encode(series):
        # Left-merge keeps one row per input value, in input order
        merged = pd.merge(
            series.to_frame(series.name),
            lookup,
            on=series.name,
            how='left')
        encoded = merged['average'].rename(feature_name + '_mean').fillna(prior)
        # pd.merge does not keep the index so restore it
        encoded.index = series.index
        return encoded

    ft_trn_series = _encode(trn_series)
    ft_tst_series = _encode(tst_series)
    return add_noise(ft_trn_series, noise_level), add_noise(ft_tst_series, noise_level)

# Input files, relative to the working directory
trainingDataCSV = "Dataset/train.csv"
testDataCSV = "Dataset/test.csv"
train = pd.read_csv(trainingDataCSV)
test = pd.read_csv(testDataCSV)

# Undersampling: thin out the target == 0 rows until the target == 1 rows
# make up about `desired_apriori` of the training set
desired_apriori=0.10

# Row labels of each class and the class sizes before undersampling
idx_0 = train.loc[train.target == 0].index
idx_1 = train.loc[train.target == 1].index
nb_0 = len(idx_0)
nb_1 = len(idx_1)

# Share of the target == 0 rows to keep, and the resulting row count
undersampling_rate = ((1-desired_apriori)*nb_1)/(nb_0*desired_apriori)
undersampled_nb_0 = int(undersampling_rate*nb_0)

# Fixed-seed random pick of the target == 0 rows that are kept
undersampled_idx = shuffle(idx_0, random_state=37, n_samples=undersampled_nb_0)

# Kept target == 0 rows first, followed by every target == 1 row
idx_list = list(undersampled_idx)
idx_list.extend(idx_1)

# Undersampled training frame with a fresh 0..n-1 index
train = train.loc[idx_list].reset_index(drop=True)

# Treat -1 as the missing-value marker in both frames and take the 'id'
# column out of the feature set; the test ids are kept for the output file.
train = train.replace(-1, np.nan).drop('id', axis=1)
test = test.replace(-1, np.nan)
ids = test.pop('id')

# Target-encode 'ps_car_11_cat' while `train` still contains the target column.
train_encoded, test_encoded = target_encode(
    trn_series=train["ps_car_11_cat"],
    tst_series=test["ps_car_11_cat"],
    target=train.target,
    min_samples_leaf=100,
    smoothing=10,
    noise_level=0.01,
)

# Split off the label.  X is the same object as `train`, so the in-place
# column drop below also changes `train`.
y = train.pop('target')
X = train

# A column is kept only if at least 75 % of its values are present.
nanThresh = int(X.shape[0]*0.75)
# Record which columns get dropped using exactly the rule dropna(thresh=...)
# applies (fewer than nanThresh non-null values).  The test cell drops columns
# based on this mask, so it has to agree with what is removed here; the former
# "nulls > 0.25 * rows" test could disagree by one row when 0.25 * rows is
# not a whole number.
droppedFeatures = X.notnull().sum() < nanThresh
X.dropna(axis=1,thresh=nanThresh,inplace=True)

#Fill the NaN Values
# One fill value per remaining column, chosen from the column name / dtype:
#   name contains 'cat' -> largest observed value + 1 (a category of its own)
#   name contains 'bin' -> most frequent value
#   integer dtype       -> most frequent value
#   anything else       -> column mean
nanValDict = {}
modeFrame = X.mode(axis=0)
maxArray = X.max(axis=0)
for columnName in X.columns.values:
    if 'cat' in columnName:
        nanValDict[columnName] = int(maxArray[columnName] + 1)
    elif 'bin' in columnName:
        nanValDict[columnName] = int(modeFrame[columnName].values[0])
    elif X[columnName].dtype == int:
        nanValDict[columnName] = int(modeFrame[columnName].values[0])
    else:
        nanValDict[columnName] = X[columnName].mean()
# fillna returns a new frame, so from here on X no longer aliases `train`.
X = X.fillna(value=nanValDict)

# (Z-score standardisation of the columns that are neither 'cat' nor 'bin'
# was tried at this point and is currently disabled.)

# One-hot encode every column whose name contains 'cat', except
# 'ps_car_11_cat', which is represented by its target encoding instead.
df_new = X
for columnName in X.columns.values:
    wantsDummies = 'cat' in columnName and columnName != 'ps_car_11_cat'
    if wantsDummies:
        dummies = pd.get_dummies(X[columnName], prefix = columnName)
        # Append the indicator columns, then remove the raw column
        df_new = pd.concat([df_new, dummies], axis=1).drop(columnName, axis=1)
X = df_new

#Handle the Special Categorical Feature:
# swap the raw 'ps_car_11_cat' column for its target-encoded version
X['ps_car_11_cat_te'] = train_encoded
X.drop('ps_car_11_cat', axis=1, inplace=True)

# Fit a random forest on the whole training matrix; it is used here only to
# rank the features by importance.
clf = RandomForestClassifier(n_estimators=50,min_samples_leaf=2,min_samples_split=7,max_features=8)
clf.fit(X,y)
importances = clf.feature_importances_
indices = np.argsort(clf.feature_importances_)[::-1]

# Print the ranking, most important feature first
for rank, position in enumerate(indices, start=1):
    print ("%2d) %-*s %f" % (rank, 30,X.columns.values[position], importances[position]))

# Features with an importance below 0.001 are removed from X; the list is
# kept so the same columns can be removed from the test data later.
uselessFeatures = [name for name, score in zip(X.columns.values, importances) if score < 0.001]
X.drop(uselessFeatures, axis=1, inplace=True)
print (X.shape)
# Hold out a stratified 20 % of the rows for a quick accuracy check.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20,stratify=y)

# Disabled experiments, kept here for reference only:
#   * RandomizedSearchCV (20 iterations, scoring='f1_micro') over
#     RandomForestClassifier(n_estimators=50) with max_depth in [3, None],
#     max_features 1-10, min_samples_split 2-10, min_samples_leaf 1-10,
#     bootstrap in [True, False]
#   * RandomForestClassifier(n_estimators=50, min_samples_leaf=1,
#     min_samples_split=4, max_features=6, bootstrap=False)
#   * GradientBoostingClassifier(n_estimators=1200, learning_rate=0.01) with
#     GridSearchCV over max_depth 2-7 (scoring="neg_log_loss",
#     StratifiedKFold(n_splits=10, shuffle=True, random_state=7))
#   * GradientBoostingClassifier(n_estimators=1200, max_depth=3, subsample=0.4,
#     learning_rate=0.01, min_samples_leaf=2, random_state=3)

from sklearn.svm import SVC
# probability=True is required: the test cell calls clf.predict_proba, which
# an SVC only offers when it was fitted with probability estimates enabled
# (with probability=False that call raises instead of returning scores).
clf = SVC(cache_size=200, class_weight={0:0.1,1:0.9}, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
clf.fit(X_train, y_train)

# Mean accuracy on the held-out rows
acc = clf.score(X_test, y_test)
print ("Accuracy: {:.4f}".format(acc))




################################################################################

In [13]:

#####################Test####################################
# Run the test data through the same preprocessing steps as the training
# data, score it with the fitted classifier `clf` and write the result.
X = test

# Remove the columns that were dropped from the training data for having too
# many missing values (this modifies `test` in place, as X aliases it).
X.drop([name for name in X.columns.values if droppedFeatures[name]], axis=1, inplace=True)

#Fill the NaN Values
# Same per-column rules as in the training cell.
# NOTE(review): the fill values are recomputed from the test data, so they can
# differ from the values used for training - confirm this is intended.
nanValDict = {}
modeFrame = X.mode(axis=0)
maxArray = X.max(axis=0)
for columnName in X.columns.values:
    if 'cat' in columnName:
        nanValDict[columnName] = int(maxArray[columnName] + 1)
    elif 'bin' in columnName:
        nanValDict[columnName] = int(modeFrame[columnName].values[0])
    elif X[columnName].dtype == int:
        nanValDict[columnName] = int(modeFrame[columnName].values[0])
    else:
        nanValDict[columnName] = X[columnName].mean()
X = X.fillna(value=nanValDict)

# One-hot encode every 'cat' column except the target-encoded 'ps_car_11_cat'
df_new = X
for columnName in X.columns.values:
    if 'cat' not in columnName or columnName == 'ps_car_11_cat':
        continue
    else:
        dummies = pd.get_dummies(X[columnName], prefix = columnName)
        df_new = pd.concat([df_new, dummies], axis=1)
        df_new.drop(columnName,axis=1,inplace=True)
X = df_new

X['ps_car_11_cat_te'] = test_encoded
X.drop('ps_car_11_cat', axis=1, inplace=True)

# The indicator columns above are derived from the test data alone, so their
# set and order need not match what `clf` was fitted on.  Align X with the
# training matrix: columns the model never saw (including the low-importance
# features removed during training) are discarded, and training columns that
# are absent here are added as all-zero columns.  Absent columns are reported
# so that a naming mismatch does not go unnoticed.
missingColumns = [name for name in X_train.columns if name not in X.columns]
if missingColumns:
    print ("Test data lacks %d training feature(s), filled with 0: %s" % (len(missingColumns), missingColumns))
X = X.reindex(columns=X_train.columns, fill_value=0)
Youtput = clf.predict_proba(X)

##########################################################################

# predict_proba returns one column per class; they are stored as 'target1'
# and 'target2' next to the test ids.  The frame's index is written to the
# CSV as well.
Youtput = pd.DataFrame(data=Youtput,columns=['target1','target2'])
final_frame = pd.concat([ids,Youtput],axis=1)
final_frame.to_csv('final_output4.csv', sep=',')