In [1]:
import json
import warnings
from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from scipy.stats import skew
from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, LassoCV, LassoLarsCV, LogisticRegression, LogisticRegressionCV
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import StratifiedShuffleSplit, validation_curve, cross_val_score
from sklearn.preprocessing import StandardScaler
warnings.filterwarnings('ignore')
%matplotlib inline

#### Load in Housing Data

In [65]:
# Load the train and test CSVs; the slice keeps every column except the first and the last.
keep_cols = slice(1, -1)
train = pd.read_csv('data-science-comp-f2020/train_final.csv').iloc[:, keep_cols]
test = pd.read_csv('data-science-comp-f2020/test_final.csv').iloc[:, keep_cols]
# Target column, captured here because a later cell deletes "Y" from `train`.
y = train['Y']

# Preview the first rows of each frame, then report their (rows, columns) shapes.
for frame in (train, test):
    print(frame.head())
for frame in (train, test):
    print(frame.shape)

   Y     f1  f2     f3      f4  f5  f6      f7      f8  f9  ...       f14  \
0  1  25884   1  33.63  118596   1   0  118595  125738   1  ... -2.266430   
1  1  34346   1  10.62  118041   1   0  117902  130913   1  ... -0.305612   
2  1  34923   1   1.77  118327   1   0  117961  124402   1  ...  2.015561   
3  1  80926   1  30.09  118300   1   0  117961  301218   1  ... -3.172501   
4  1   4674   1   1.77  119921   1   0  119920  302830   1  ...  0.573767   

     f15     f16     f17  f18     f19  f20  f21  f22  f23  
0   1945  118450  119184    1  121372    1    1    1    2  
1  15385  117945  292795    1  259173    1    1    1    1  
2   7547  118933  290919    1  118784    1    1    1    1  
3   4933  118458  118331    1  307024    1    1    1    2  
4  13836  142145    4673    1  128230    1    1    1  620  

[5 rows x 24 columns]
       f1  f2     f3      f4  f5  f6      f7      f8  f9  f10  ...       f14  \
0   37733   1   1.77  118603   1   0  118602  118097   1    0  ...  2.4537

In [3]:
#Samples from both train and test, restricted to the same feature columns.
# Columns are selected by name rather than with a shared positional slice: `train`
# leads with the target "Y" while `test` leads with "f1", so slicing both with
# iloc[:, 1:-1] dropped "f1" from the test rows and left it as NaN after concat.
# NOTE(review): the last feature column is still excluded, as in the original
# slice -- confirm that this is intended.
feature_cols = train.columns.drop('Y', errors='ignore')[:-1]
all_data = pd.concat((train[feature_cols],
                      test[feature_cols]))

#### Helper functions used below

In [33]:
def export_test(preds, name, test_path='data-science-comp-f2020/test_final.csv', out_dir='results'):
    """Write a submission CSV that pairs the test-set ids with predictions.

    Parameters
    ----------
    preds : pandas.Series or pandas.DataFrame
        One prediction row per row of the test file, in the same order.
    name : str
        Base name of the output file; ``<out_dir>/<name>.csv`` is written.
    test_path : str or path-like, optional
        CSV whose first column holds the row ids.
    out_dir : str or path-like, optional
        Directory for the output file; created if it does not exist.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of ids.
    """
    # Only the id column is needed, so skip parsing the feature columns.
    ids = pd.read_csv(test_path, usecols=[0]).iloc[:, 0]
    if len(preds) != len(ids):
        raise ValueError(
            f"got {len(preds)} predictions for {len(ids)} test rows"
        )

    # concat(axis=1) aligns on index labels; resetting the index pairs the
    # predictions with the ids by position even if `preds` carries another index.
    preds = preds.reset_index(drop=True)

    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    submission = pd.concat([ids, preds], axis=1)
    submission.to_csv(out_dir / f'{name}.csv', index=False, header=True)

#### Brief Data Exploration

In [None]:
# Per-column summary statistics of the training frame.
train.describe()

In [None]:
# Pairwise correlation matrix of the columns in `train`, drawn as a heatmap.
corr = train.corr()
sns.heatmap(corr)

In [None]:
# Distribution of the target column.
# `sns.distplot` is deprecated; `histplot` with a KDE overlay and density scaling
# is the replacement suggested by seaborn for the same kind of figure.
sns.histplot(train.Y, kde=True, stat="density")
plt.show()

#### Feature Engineering

In [34]:
# Flag a column as likely categorical when its distinct values make up only a
# small share of its non-null entries.
CATEGORICAL_RATIO_THRESHOLD = 0.05  # or some other threshold
unique_ratio = train.nunique() / train.count()
# The target is not a feature; errors='ignore' keeps this cell re-runnable after
# "Y" has already been deleted from `train` by the XGBoost cell.
likely_categorical = (
    (unique_ratio < CATEGORICAL_RATIO_THRESHOLD)
    .drop(labels='Y', errors='ignore')
    .to_dict()
)

### Training Models

#### Trying XGBoost

In [58]:
# `data` is the same object as `train`, so removing "Y" here also removes it
# from `train` for every cell that runs afterwards.
data = train
if "Y" in data:
    data.pop("Y")

# Wrap the features (and the training labels) in XGBoost's DMatrix container.
dtrain = xgb.DMatrix(data, label=y)
dtest = xgb.DMatrix(test)

# Booster settings: softprob makes predict() return one probability per class.
params = dict(
    eta=0.1,
    max_depth=20,
    objective='multi:softprob',
    num_class=2,
    verbosity=1,
)

# `steps` is what the training cell passes to xgb.train as the round count.
steps, num_round = 20, 10
# Incremented each time a model is saved, to keep file names distinct.
xgb_model_counter = 0

In [59]:
# Fit the booster for `steps` boosting rounds.
model = xgb.train(params=params, dtrain=dtrain, num_boost_round=steps)

In [60]:
# Per-row class probabilities for the test set (one column per class, as printed below).
soft_preds = model.predict(dtest)
print(soft_preds)

[[0.08859605 0.911404  ]
 [0.2567079  0.7432921 ]
 [0.06564732 0.9343527 ]
 ...
 [0.06564732 0.9343527 ]
 [0.09559052 0.9044095 ]
 [0.18103012 0.8189699 ]]


In [61]:
# Turn the class probabilities into labels by taking the most probable class
# of each row in a single vectorised call.
hard_preds = pd.DataFrame({"Y": np.argmax(soft_preds, axis=1)})
print(hard_preds)

       Y
0      1
1      1
2      1
3      1
4      1
...   ..
16380  1
16381  1
16382  1
16383  1
16384  1

[16385 rows x 1 columns]


In [62]:
# Persist the trained booster together with the parameters that produced it.
xgb_model_counter += 1
model_name = datetime.today().strftime('%Y_%m_%d') + "xgboost_" + str(xgb_model_counter)

# Make sure the output directory exists so a fresh checkout does not fail here.
Path('models').mkdir(parents=True, exist_ok=True)

# NOTE(review): newer XGBoost releases recommend a .json/.ubj file extension for
# saved models -- confirm against the installed version before changing the name.
model.save_model(f'models/{model_name}.model')
model_params = f"models/{model_name}_params.json"
with open(model_params, 'w') as fp:
    json.dump(params, fp)

In [64]:
# Write the hard predictions next to the test ids as results/<model_name>.csv,
# then show how many models have been saved so far in this session.
export_test(hard_preds, model_name)
print(xgb_model_counter)

1


#### Pt 2 - Logistic Regression

In [None]:
def rmse_cv(model):
    """Return the per-fold RMSE of `model` under 5-fold cross validation.

    The model is scored on the notebook-level `train` frame and `y` target.
    RMSE is just the RMS error in the model. Info on what cross validation is:
    https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation
    """
    # The scorer reports negated MSE, so flip the sign before taking the root.
    neg_mse = cross_val_score(model, train, y, scoring="neg_mean_squared_error", cv=5)
    return np.sqrt(-neg_mse)

In [None]:
###Ridge Regression###
model_ridge = Ridge()

# Candidate regularisation strengths; each is scored by its mean cross-validated RMSE.
alphas = [0.05, 0.1, 0.3, 1, 3, 5, 10, 15, 30, 50, 75]
cv_ridge = pd.Series(
    [rmse_cv(Ridge(alpha=a)).mean() for a in alphas],
    index=alphas,
)

###Plotting Ridge Regression###
cv_ridge.plot()
plt.xlabel("alpha")
plt.ylabel("rmse")

In [None]:
# Candidate regularisation strengths for the cross-validated Lasso.
lasso_alphas = [1, 0.1, 0.001, 0.0005]
# Pass the named list instead of repeating the literal so the two cannot drift apart.
model_lasso = LassoCV(alphas=lasso_alphas).fit(train, y)

In [None]:
# Lasso coefficients labelled with the column they belong to.
coef = pd.Series(model_lasso.coef_, index=train.columns)
n_kept = int((coef != 0).sum())
n_dropped = int((coef == 0).sum())
print("Lasso picked " + str(n_kept) + " variables and eliminated the other " +  str(n_dropped) + " variables")

# Ten smallest followed by ten largest coefficients.
coef_sorted = coef.sort_values()
imp_coef = pd.concat([coef_sorted.head(10), coef_sorted.tail(10)])

imp_coef.plot(kind="barh")
plt.title("Coefficients in the Lasso Model")

In [None]:
# Number of candidate C values; the splitter produces one split per candidate.
n_values = 11
accuracies = []
non_zero = []
# Inverse regularisation strengths, log-spaced from 1e-5 to 1e5.
Cs = np.logspace(-5, 5, num=n_values)
# A fixed random_state makes the shuffled splits, and so the reported
# accuracies, reproducible across runs.
cv = StratifiedShuffleSplit(n_splits=n_values, test_size=0.25, random_state=0)

In [None]:
# Fit one L1-penalised logistic regression per split, using a different C each time.
for i, (train_index, test_index) in enumerate(cv.split(train,y)):
    print("running training for C =", Cs[i])
    # cv.split yields positional row indices, so rows must be taken with .iloc;
    # plain `train[train_index]` would be a column lookup, and the name `X`
    # used for the held-out rows was never defined.
    X_train, X_test = train.iloc[train_index], train.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    logreg_l1 = LogisticRegression(C=Cs[i], penalty='l1',solver='saga', multi_class='multinomial', tol=0.1)
    logreg_l1.fit(X_train, y_train)
    
    # Score the fitted model on the held-out quarter of the rows.
    y_pred = logreg_l1.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    accuracies.append(acc)
    print("Accuracy:", acc)