In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_curve

In [None]:
%matplotlib

In [2]:
def convert_date(df):
    df['year'] = df['Original_Quote_Date'].apply(lambda x: x[0:4]).astype('int')
    df['month'] = df['Original_Quote_Date'].apply(lambda x: x[5:7]).astype('int')
    df['day'] = df['Original_Quote_Date'].apply(lambda x: x[8:]).astype('int')


def convert_field10(df):
    df['Field10'] = df['Field10'].apply(lambda x: str(x).replace(',', '')).astype('int')


def fill_PersonalField84(df):
    df.loc[df['PersonalField84'].isnull(), 'PersonalField84'] = 2


def fill_PropertyField29(df):
    df.loc[df['PropertyField29'].isnull(), 'PropertyField29'] = df['PropertyField29'].mean()


def beat_over_fitting(selected_feature):
    f = open('data/feature_importance.txt', 'r')
    sample_size = 0
    for line in f:
        sample_size += 1
        selected_feature.append(line.strip())
    return selected_feature


def check_na(df):
    ans = []
    for k in df.keys():
        if True in df[k].isnull().values:
            ans.append(k)
    return ans

In [9]:
train_file = 'data/train.csv'
test_file = 'data/test.csv'
train = pd.read_csv(train_file)
test = pd.read_csv(test_file)

In [10]:
dtype = train.dtypes
not_number = []
for i, k in enumerate(train.keys()):
    if (str(dtype[i]) != 'float64') & (str(dtype[i]) != 'int64'):
        not_number.append(k)

not_number.pop(not_number.index('Original_Quote_Date'))
not_number.pop(not_number.index('Field10'))
train[not_number].head()

Unnamed: 0,Field6,Field12,CoverageField8,CoverageField9,SalesField7,PersonalField7,PersonalField16,PersonalField17,PersonalField18,PersonalField19,...,PropertyField30,PropertyField31,PropertyField32,PropertyField33,PropertyField34,PropertyField36,PropertyField37,PropertyField38,GeographicField63,GeographicField64
0,B,N,T,D,V,N,ZA,ZE,XR,XD,...,N,N,Y,G,Y,N,N,N,N,CA
1,F,N,T,E,P,N,XB,YJ,YE,XT,...,N,O,N,H,Y,N,N,N,N,NJ
2,F,N,T,J,K,N,ZH,XS,YP,XC,...,N,K,Y,H,Y,N,N,N,N,NJ
3,J,N,Y,F,V,N,XO,XE,YI,XX,...,N,O,Y,G,N,N,Y,N,N,TX
4,E,N,T,F,R,N,ZA,ZE,XR,XD,...,N,O,N,H,N,N,N,N,N,IL


In [11]:
for k in not_number:
    lbl = preprocessing.LabelEncoder()
    lbl.fit(np.unique(list(train[k].values) + list(test[k].values)))
    train[k] = lbl.transform(list(train[k].values))
    test[k] = lbl.transform(list(test[k].values))

In [12]:
convert_date(train)
convert_field10(train)
convert_date(test)
convert_field10(test)
y = train['QuoteConversion_Flag']
x = train.select_dtypes(include = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64'])
xt = test.select_dtypes(include = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64'])
x = x.drop('QuoteConversion_Flag',axis= 1)
print check_na(x), check_na(xt)
print x.shape, xt.shape

['PersonalField84', 'PropertyField29'] ['PersonalField84', 'PropertyField29']
(260753, 300) (173836, 300)


In [13]:
fill_PersonalField84(x)
fill_PersonalField84(xt)
fill_PropertyField29(x)
fill_PropertyField29(xt)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [14]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.5)

In [15]:
print x_train.shape, x_test.shape, y_train.shape, y_test.shape


(130376, 300) (130377, 300) (130376,) (130377,)


In [16]:
x_train, x_train_lr, y_train, y_train_lr = train_test_split(x_train,y_train,test_size=0.5)

In [17]:
print x_train.shape, y_train.shape, x_train_lr.shape, y_train_lr.shape

(65188, 300) (65188,) (65188, 300) (65188,)


In [18]:
params = {'n_estimators': 100, 'max_leaf_nodes': 4, 'max_depth': None, 'random_state': 2,
          'min_samples_split': 5, 'learning_rate': 0.1, 'subsample': 0.5}
gb = GradientBoostingClassifier(**params)
gb_encoder = preprocessing.OneHotEncoder()
lr = LogisticRegression()

In [19]:
gb.fit(x_train, y_train)

GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=None, max_features=None, max_leaf_nodes=4,
              min_samples_leaf=1, min_samples_split=5,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=2, subsample=0.5, verbose=0,
              warm_start=False)

In [20]:
gb_encoder.fit(gb.apply(x_train)[:, :, 0])

OneHotEncoder(categorical_features='all', dtype=<type 'float'>,
       handle_unknown='error', n_values='auto', sparse=True)

In [21]:
lr.fit(gb_encoder.transform(gb.apply(x_train_lr)[:, :, 0]), y_train_lr)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [22]:
yhat = lr.predict_proba(gb_encoder.transform(gb.apply(x_test)[:, :, 0]))[:,1]

In [23]:
yhat2 = gb.predict_proba(x_test)[:,1]

In [24]:
rf = RandomForestClassifier(n_estimators=100)
rf.fit(np.array(x_train),np.array(y_train))

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [25]:
yhat3 = rf.predict_proba(x_test)[:,1]

In [26]:
fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, yhat)

In [27]:
fpr_grd_lm2, tpr_grd_lm2, _ = roc_curve(y_test, yhat2)

In [28]:
fpr_grd_lm3, tpr_grd_lm3, _ = roc_curve(y_test, yhat3)

In [51]:
plt.xlim(0, 0.5)
plt.ylim(0.9,1)
plt.plot(fpr_grd_lm, tpr_grd_lm, label='GB + LR')
plt.plot(fpr_grd_lm2, tpr_grd_lm2, '--',label='GB')
plt.plot(fpr_grd_lm3, tpr_grd_lm3, '-.',label='RF')
plt.legend(loc = 'best')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve')
#plt.show()
plt.savefig('/Users/itbwtw/Documents/Courses/CompSci 273A Machine Learning/project/latex/figure/roc.eps')

In [None]:
selected_feature = beat_over_fitting([])
x = x[selected_feature[0:23]]
xt = xt[selected_feature[0:23]]

In [None]:
pd.DataFrame(fpr_grd_lm).to_csv('data/roc.csv',index=False)