In [None]:
import os
import sys
import warnings
warnings.filterwarnings('ignore')

## Import required modules
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA

In [None]:
# Load the training data and report its dimensions.
df_train = pd.read_csv('train.csv')
print('Size of training set: {} rows and {} columns'
      .format(df_train.shape[0], df_train.shape[1]))

# Show the first few rows to get a feel for what the data looks like.
df_train.head()

In [None]:
# Separate the target column y from the data; it is the value the model
# learns to predict.
y_train = df_train.loc[:, 'y'].values

In [None]:
# Feature columns are the ones whose name contains the letter 'X'.
cols = list(filter(lambda name: 'X' in name, df_train.columns))
print('Number of features: {}'.format(len(cols)))

# Summarise how many feature columns there are of each dtype.
print('Feature types:')
df_train.loc[:, cols].dtypes.value_counts()

In [None]:
# Bucket every feature by the number of distinct values it takes in the
# training set:
#   counts[0] -> constant (a single distinct value)
#   counts[1] -> binary (exactly two distinct values in an int64 column)
#   counts[2] -> everything else, reported as categorical
counts = [[], [], []]
for feature in cols:
    values = df_train[feature]
    n_distinct = len(np.unique(values))
    if n_distinct == 1:
        bucket = 0
    elif n_distinct == 2 and values.dtype == np.int64:
        bucket = 1
    else:
        bucket = 2
    counts[bucket].append(feature)

print('Constant features: {} Binary features: {} Categorical features: {}\n'
      .format(*map(len, counts)))
# One line per bucket, listing the column names that fell into it.
for label, names in zip(('Constant features:',
                         'Binary features:',
                         'Categorical features:'), counts):
    print(label, names)

In [None]:
df_test = pd.read_csv('test.csv')

# Keep every column except the row identifier 'ID' and the target 'y'; neither
# is used as a model input. The columns are filtered in their original CSV
# order rather than via a set difference: set iteration order for strings
# changes between interpreter runs, which would shuffle the feature matrix
# layout on every kernel restart.
usable_columns = [c for c in df_train.columns if c not in ('ID', 'y')]
y_train = df_train['y'].values
id_test = df_test['ID'].values

# Explicit copies, so that later cells can modify the feature frames without
# touching df_train / df_test or triggering chained-assignment warnings.
X_train = df_train[usable_columns].copy()
X_test = df_test[usable_columns].copy()

In [None]:
def check_missing_values(df):
    """Print whether the dataframe ``df`` contains at least one null entry."""
    has_missing = df.isnull().values.any()
    if not has_missing:
        print("There are no missing values in the dataframe")
        return
    print("There are missing values in the dataframe")
# Run the null check on the training features first, then on the test features.
for frame in (X_train, X_test):
    check_missing_values(frame)

In [None]:
# A column with a single value in the training set carries no information, so
# it is removed from both frames. The `in X_train.columns` guard and the
# non-mutating drop keep this cell safe to re-run after the columns are gone.
constant_columns = [
    column for column in usable_columns
    if column in X_train.columns
    and X_train[column].nunique(dropna=False) == 1
]
X_train = X_train.drop(columns=constant_columns)
X_test = X_test.drop(columns=constant_columns)

# Every remaining non-numeric column is categorical and must become numeric.
# Selecting by dtype (instead of "more than two distinct values") leaves
# multi-valued integer columns alone, also covers two-valued string columns,
# and makes a second run of this cell a no-op.
categorical_columns = [
    column for column in X_train.columns
    if not pd.api.types.is_numeric_dtype(X_train[column])
]
for column in categorical_columns:
    # Give each distinct label its own integer code. Summing character codes
    # is not injective ('ab' and 'ba' both sum to 195) and would silently
    # merge different categories. The code table is built from the labels of
    # both frames so train and test share one mapping and labels that only
    # occur in the test set still receive a code.
    categories = sorted(set(X_train[column]) | set(X_test[column]))
    code_of = {category: code for code, category in enumerate(categories)}
    X_train[column] = X_train[column].map(code_of)
    X_test[column] = X_test[column].map(code_of)

In [None]:
# Dtype summary of the feature columns that were not classified as constant.
print('Feature types after dropping constant features:')
X_train.loc[:, [name for name in cols if name not in counts[0]]].dtypes.value_counts()

In [None]:
# Reduce the feature matrix to n_comp principal components.
n_comp = 12
pca = PCA(n_components=n_comp, random_state=42)
# Fit the projection on the training features only, then apply that same
# fitted projection to the test features so both end up in one component space.
pca2_results_train = pca.fit_transform(X_train)
pca2_results_test = pca.transform(X_test)

In [None]:
import xgboost as xgb
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

# Hold out 20% of the PCA-projected training rows for validation.
# The labels are read from df_train rather than from the current y_train:
# this cell rebinds y_train to the training portion of the split, so splitting
# the previous y_train again would fail with a length mismatch as soon as the
# cell is executed a second time. On a first run the values are identical.
# Note that X_train is rebound here from the feature frame to the training
# portion of the PCA projection.
X_train, X_valid, y_train, y_valid = train_test_split(
        pca2_results_train,
        df_train['y'].values, test_size=0.2,
        random_state=42)

d_train = xgb.DMatrix(X_train, label=y_train)
d_valid = xgb.DMatrix(X_valid, label=y_valid)
d_test = xgb.DMatrix(pca2_results_test)

params = {
    'objective': 'reg:squarederror',
    'eta': 0.02,
    'max_depth': 4,
}


def xgb_r2_score(preds, dtrain):
    """Custom evaluation metric: R^2 of ``preds`` against the DMatrix labels.

    Returns the ``(name, value)`` pair handed to ``xgb.train`` via ``feval``.
    """
    labels = dtrain.get_label()
    return 'r2', r2_score(labels, preds)


watchlist = [(d_train, 'train'), (d_valid, 'valid')]

# Train for at most 1000 rounds, logging every 10th round and stopping once
# the metric has not improved for 50 rounds. maximize=True because a higher
# R^2 is better.
# NOTE(review): recent XGBoost releases deprecate `feval` in favour of
# `custom_metric` - confirm against the installed version before upgrading.
clf = xgb.train(params, d_train,
                1000, watchlist, early_stopping_rounds=50,
                feval=xgb_r2_score, maximize=True, verbose_eval=10)

In [None]:
# With early stopping, the booster returned by xgb.train still contains the
# rounds trained after the best one. Restrict prediction to the trees up to
# and including the best iteration so the test predictions come from the
# model that scored best on the validation set.
# (`iteration_range` requires XGBoost 1.4 or later.)
p_test = clf.predict(d_test, iteration_range=(0, clf.best_iteration + 1))

# Submission table: one row per test ID with its predicted y.
sub = pd.DataFrame({'ID': id_test, 'y': p_test})

print(sub)