In [None]:
import time
import datetime
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt

import sklearn.preprocessing as prep
from sklearn.decomposition import PCA

In [None]:
# Visualisations

def correlation(data):
    """Display an annotated heatmap of the pairwise column correlations of ``data``.

    ``data`` is a DataFrame; the matrix comes from ``data.corr()`` and every
    cell of the heatmap is labelled with its coefficient. Returns None.
    """
    sn.heatmap(data.corr(), annot=True)
    plt.show()

def scatterplot(data):
    """Display a pairwise scatter-plot matrix of every column in ``data``.

    ``data`` is a DataFrame. The seaborn default theme is activated first
    (this is a global setting and stays active for later plots). Returns None.
    """
    sn.set()
    # `height` is the documented name for the per-facet size; `size` is only
    # kept by seaborn as a deprecated alias that emits a warning.
    sn.pairplot(data, height=2.5)
    plt.show()
    return

In [None]:
# Generic Preprocessing

def preprocessor(data, targetname, labelEncoding=True):
    """Robust-scale the features, optionally label-encode the target, and apply PCA.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the target column and the feature columns. It is left
        unmodified.
    targetname : str
        Name of the target column in ``data``.
    labelEncoding : bool, default True
        When True the target is integer-encoded with ``LabelEncoder`` and
        returned in a column named ``'target'``. When False the target is
        passed through unchanged under its original name.

    Returns
    -------
    pandas.DataFrame
        The target column followed by the PCA components that together
        explain 95% of the variance of the scaled features. Rows keep the
        input order and carry a fresh 0..n-1 index.
    """
    # Select rather than pop(): popping removed the target from the caller's
    # frame, so calling this twice on the same frame raised KeyError.
    # The index is reset so the target lines up row-for-row with the PCA
    # frame below, which is always built on a fresh 0..n-1 index.
    y = data[targetname].reset_index(drop=True)
    X = data.drop(columns=targetname)

    X = prep.RobustScaler().fit_transform(X)

    if labelEncoding:
        encodedTarget = prep.LabelEncoder().fit_transform(y)
        df_y = pd.DataFrame(encodedTarget, columns=['target'])
    else:
        df_y = y

    # A float n_components asks PCA for as many components as are needed to
    # reach that fraction of explained variance.
    pca = PCA(.95)
    df_SVD = pd.DataFrame(pca.fit_transform(X))

    return pd.concat([df_y, df_SVD], axis=1)

In [None]:
def preprocessorSeoul(data, targetname, labelEncoding=True):
    """Preprocess the Seoul bike-sharing frame and reduce it with PCA.

    The numeric weather columns and the (already numeric) ``Date`` column are
    robust-scaled, ``Seasons`` is one-hot encoded, ``Holiday`` and
    ``Functioning Day`` are mapped to 0./1., and the resulting features are
    projected with PCA keeping 90% of the variance.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the target column and the feature columns named in the
        body. It is left unmodified.
    targetname : str
        Name of the target column in ``data``.
    labelEncoding : bool, default True
        When True the target is integer-encoded with ``LabelEncoder`` and
        returned in a column named ``'target'`` (the same name ``preprocessor``
        uses). When False it is passed through under its original name.

    Returns
    -------
    pandas.DataFrame
        The target column followed by the PCA components, on a fresh 0..n-1
        index in input row order.
    """

    def robustScaler(X, columnlabels):
        """Replace the listed columns of ``X`` with their RobustScaler-scaled values."""
        # dict.fromkeys removes repeated labels while keeping their order; a
        # repeated label would select the column twice and make the
        # per-column assignment below receive a two-column frame.
        columnlabels = list(dict.fromkeys(columnlabels))
        scaled = pd.DataFrame(
            prep.RobustScaler().fit_transform(X[columnlabels]),
            columns=columnlabels,
            index=X.index,
        )
        for label in columnlabels:
            X[label] = scaled[label]
        return X

    def oneHotEnc(X, columnlabels):
        """Replace the listed columns of ``X`` with dense one-hot indicator columns."""
        # The dense-output keyword is `sparse_output` in current scikit-learn
        # and `sparse` in older releases; likewise get_feature_names_out
        # replaced get_feature_names. Support both.
        try:
            encoder = prep.OneHotEncoder(sparse_output=False)
        except TypeError:
            encoder = prep.OneHotEncoder(sparse=False)
        encoded = encoder.fit_transform(X[columnlabels])

        if hasattr(encoder, 'get_feature_names_out'):
            newFeatureNames = encoder.get_feature_names_out(columnlabels)
        else:
            newFeatureNames = encoder.get_feature_names(columnlabels)

        encodedFrame = pd.DataFrame(encoded, columns=newFeatureNames, index=X.index)
        return pd.concat([X.drop(columns=columnlabels), encodedFrame], axis=1)

    # Work on a fresh frame with a 0..n-1 index: the caller's frame stays
    # intact, and every frame built from a NumPy result lines up row-for-row.
    y = data[targetname].reset_index(drop=True)
    X = data.drop(columns=targetname).reset_index(drop=True)

    X = robustScaler(X, ['Hour', 'Temperature(degC)', 'Humidity(%)',
        'Wind speed (m/s)', 'Visibility (10m)', 'Dew point temperature(degC)',
        'Solar Radiation (MJ/m2)', 'Rainfall(mm)', 'Snowfall (cm)'])

    # Date is expected to hold numeric timestamps already, so it gets the
    # same robust scaling as the other numeric columns.
    X = robustScaler(X, ['Date'])
    X = oneHotEnc(X, ['Seasons'])
    X = (X.replace({'Holiday': {'No Holiday': 0., 'Holiday': 1.},
                    'Functioning Day': {'No': 0., 'Yes': 1.}}))

    if labelEncoding:
        encodedTarget = prep.LabelEncoder().fit_transform(y)
        # Named column: an unnamed one would be labelled 0 and collide with
        # the first PCA component's label.
        df_y = pd.DataFrame(encodedTarget, columns=['target'])
    else:
        df_y = pd.DataFrame(y)

    # A float n_components asks PCA for as many components as are needed to
    # reach that fraction of explained variance.
    pca = PCA(.9)
    df_SVD = pd.DataFrame(pca.fit_transform(X))

    return pd.concat([df_y, df_SVD], axis=1)

In [None]:
def dateConversion(data=None, outputPath='seoulbike/SeoulBikeData_dateconv.csv'):
    """Convert the ``Date`` column from ``dd/mm/YYYY`` strings to numeric timestamps and save the frame.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame with a ``Date`` column; the column is replaced in place.
        When omitted, the notebook-level ``seoulbike`` frame is used, which
        is what the zero-argument call has always done.
    outputPath : str or None, optional
        Destination CSV. Pass None to skip writing.

    Returns
    -------
    None
    """
    if data is None:
        data = seoulbike

    # time.mktime reads the parsed date as local midnight, so the absolute
    # values depend on the machine's time zone.
    # Building the whole column at once avoids label-based element assignment,
    # which only worked when the index happened to be 0..n-1.
    data['Date'] = [
        time.mktime(datetime.datetime.strptime(str(day), "%d/%m/%Y").timetuple())
        for day in data['Date']
    ]

    if outputPath is not None:
        # index=False: otherwise the row index is stored as an extra unnamed
        # column that read_csv later returns as a spurious 'Unnamed: 0' feature.
        data.to_csv(outputPath, index=False)
    return

In [None]:
# ********************************************
# Breastcancer 
# ********************************************

# Load the training split and drop the ID column before preprocessing.
breastcancerRaw = (
    pd.read_csv('breastcancer/breast-cancer-diagnostic.shuf.lrn.csv')
    .drop(columns='ID')
)
breastcancerProcessed = preprocessor(breastcancerRaw, 'class')

# Saving the processed frame is currently switched off:
# breastcancerProcessed.to_csv('preprocessed/breast-cancer-diagnostic.shuf.lrn_processed.csv')

# Inspect the processed frame: correlation heatmap first, then the pair plot.
for plot in (correlation, scatterplot):
    plot(breastcancerProcessed)

In [None]:
# ********************************************
# Concrete 
# ********************************************

concreteRaw = pd.read_csv('concrete/concrete_data.csv')

# The 'Strength' target is passed through without label encoding.
concreteProcessed = preprocessor(concreteRaw, 'Strength', labelEncoding=False)
concreteProcessed.to_csv('preprocessed/concrete_data_processed.csv')

# Inspect the processed frame: correlation heatmap first, then the pair plot.
for plot in (correlation, scatterplot):
    plot(concreteProcessed)

In [None]:
# ********************************************
# Seoul Bike Sharing Demand 
# ********************************************

# To rebuild the date-converted CSV from the original download, run:
# seoulbike = pd.read_csv('seoulbike/SeoulBikeData.csv')
# dateConversion()

seoulbike = pd.read_csv('seoulbike/SeoulBikeData_dateconv.csv')

# A CSV written by to_csv with default settings stores the row index as an
# unnamed first column, which read_csv returns as 'Unnamed: 0'. Drop any such
# column: it is a plain row counter, and left in place it would reach PCA
# unscaled and dominate the explained variance.
seoulbike = seoulbike.loc[:, ~seoulbike.columns.str.startswith('Unnamed:')]

seoulbikeProcessed = preprocessorSeoul(seoulbike, 'Rented Bike Count', False)

correlation(pd.DataFrame(seoulbikeProcessed))
# scatterplot(pd.DataFrame(seoulbikeProcessed))

seoulbikeProcessed.to_csv('preprocessed/SeoulBikeData_processed.csv')
