In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.preprocessing import StandardScaler
import joblib
from sklearn.decomposition import PCA

In [2]:
train = pd.read_csv('../data/train_test_data/train.csv')
test = pd.read_csv('../data/train_test_data/test.csv')

In [3]:
to_drop=['Id','Soil_Type7','Soil_Type15']
train.drop(to_drop,axis=1,inplace=True)
test.drop(to_drop,axis=1,inplace=True)

In [4]:
num_col = []
for col in train.columns:
    if train[col].nunique()>2 and col!='Cover_Type':
        num_col.append(col)
        
num_col

['Elevation',
 'Aspect',
 'Slope',
 'Horizontal_Distance_To_Hydrology',
 'Vertical_Distance_To_Hydrology',
 'Horizontal_Distance_To_Roadways',
 'Hillshade_9am',
 'Hillshade_Noon',
 'Hillshade_3pm',
 'Horizontal_Distance_To_Fire_Points']

In [5]:
def find_outliers_tukey(feature):
    q1 = np.percentile(feature, 25)
    q3 = np.percentile(feature, 75)
    iqr = q3-q1
    upbound = q3+1.5*iqr
    downbound = q1 - 1.5*iqr
    outlier_indices = list(
        feature.index[(feature > upbound) | (feature < downbound)])
    outlier_values = list(feature[outlier_indices])
    return outlier_indices, outlier_values

# find outlier by 1.5IQR, then impute with median in train set, do not touch test set
for col in num_col:
    outlier_indices_train, outlier_values_train = find_outliers_tukey(train[col])
    train.loc[outlier_indices_train, col] = train[col].median()

In [6]:
pos_num_col=[] # choose features whose data all >= 0, then do skewness test, log1p
for col in num_col:
    if (train[train[col]<0].shape[0])==0 and (test[test[col]<0].shape[0])==0:
        pos_num_col.append(col)
print(pos_num_col)
print(num_col)

['Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways', 'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm', 'Horizontal_Distance_To_Fire_Points']
['Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways', 'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm', 'Horizontal_Distance_To_Fire_Points']


In [7]:
# Handle skewness

for col in pos_num_col:
    if abs(train[col].skew()) > 0.5:
        print("Skewness for "+str(col)+": " + str(train[col].skew()))        
print('---------------------------')            
for col in pos_num_col:        # log(1+x)
    if abs(train[col].skew()) > 0.5 and abs(np.log1p(train[col]).skew())<abs(train[col].skew()):
        print("Skewness for "+str(col)+": " + str(train[col].skew()))
        train[col] = np.log1p(train[col])
        test[col] = np.log1p(test[col])
        print("Skewness for "+str(col)+": " + str(train[col].skew()))
        pos_num_col.remove(col)          
print('---------------------------')   
for col in pos_num_col:
    if abs(train[col].skew()) > 0.5:
        print("Skewness for "+str(col)+": " + str(train[col].skew()))      
print('---------------------------')   
for col in pos_num_col:     # x square
    if abs(train[col].skew()) > 0.5 and abs(np.square(train[col]).skew())<abs(train[col].skew()):
        print("Skewness for "+str(col)+": " + str(train[col].skew()))
        train[col] = np.square(train[col])
        test[col] = np.square(test[col])
        print("Skewness for "+str(col)+": " + str(train[col].skew()))
        pos_num_col.remove(col)   
print('---------------------------')   
for col in pos_num_col:
    if abs(train[col].skew()) > 0.5:
        print("Skewness for "+str(col)+": " + str(train[col].skew()))
print('---------------------------')   
for col in pos_num_col:        # x square root
    if abs(train[col].skew()) > 0.5 and abs(np.sqrt(train[col]).skew())<abs(train[col].skew()):
        print("Skewness for "+str(col)+": " + str(train[col].skew()))
        train[col] = np.sqrt(train[col])
        test[col] = np.sqrt(test[col])
        print("Skewness for "+str(col)+": " + str(train[col].skew()))
        pos_num_col.remove(col)   
print('---------------------------')   
for col in pos_num_col:
    if abs(train[col].skew()) > 0.5:
        print("Skewness for "+str(col)+": " + str(train[col].skew()))

Skewness for Horizontal_Distance_To_Hydrology: 0.8324195210682085
Skewness for Horizontal_Distance_To_Roadways: 1.0004715097062213
Skewness for Hillshade_9am: -0.7393861768856743
Skewness for Hillshade_Noon: -0.5496178397165092
Skewness for Horizontal_Distance_To_Fire_Points: 0.8027994190907131
---------------------------
Skewness for Horizontal_Distance_To_Roadways: 1.0004715097062213
Skewness for Horizontal_Distance_To_Roadways: -0.8998026883390026
---------------------------
Skewness for Horizontal_Distance_To_Hydrology: 0.8324195210682085
Skewness for Hillshade_9am: -0.7393861768856743
Skewness for Hillshade_Noon: -0.5496178397165092
Skewness for Horizontal_Distance_To_Fire_Points: 0.8027994190907131
---------------------------
Skewness for Hillshade_9am: -0.7393861768856743
Skewness for Hillshade_9am: -0.4801426168943799
---------------------------
Skewness for Horizontal_Distance_To_Hydrology: 0.8324195210682085
Skewness for Hillshade_Noon: -0.5496178397165092
Skewness for Horizo

In [8]:
cat_col = [] 
for col in train.columns:
    if train[col].nunique()==2:
        cat_col.append(col)

In [9]:
# scaling numerical features
ss = StandardScaler()
train_num = ss.fit_transform(train[num_col])
train_num = pd.DataFrame(train_num,columns=num_col)
test_num = ss.transform(test[num_col])
test_num = pd.DataFrame(test_num,columns=num_col)
joblib.dump(ss, '../models/scaler.m') # save scaler model as scaler.m

['../models/scaler.m']

In [10]:
X_train = pd.concat([train_num,train[cat_col]],axis=1)
X_test = pd.concat([test_num,test[cat_col]],axis=1)
X_train.shape,X_test.shape

((12096, 52), (3024, 52))

In [11]:
y_train = train['Cover_Type']
y_test = test['Cover_Type']

In [12]:
pca = PCA(n_components=0.99)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)
joblib.dump(pca, '../models/pca.m') # save pca model as pca.m
X_train_pca.shape,X_test_pca.shape

((12096, 30), (3024, 30))

In [13]:
pd.DataFrame(X_train_pca).to_csv('../data/featured_data_PCA/X_train.csv',index=False)
pd.DataFrame(X_test_pca).to_csv('../data/featured_data_PCA/X_test.csv',index=False)
pd.DataFrame(y_train).to_csv('../data/featured_data_PCA/y_train.csv',index=False)
pd.DataFrame(y_test).to_csv('../data/featured_data_PCA/y_test.csv',index=False)

In [14]:
pd.DataFrame(X_train).to_csv('../data/featured_data/X_train.csv',index=False)
pd.DataFrame(X_test).to_csv('../data/featured_data/X_test.csv',index=False)
pd.DataFrame(y_train).to_csv('../data/featured_data/y_train.csv',index=False)
pd.DataFrame(y_test).to_csv('../data/featured_data/y_test.csv',index=False)