In [None]:
import datetime
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sn
import sklearn.preprocessing as prep
from sklearn.decomposition import TruncatedSVD

In [None]:
# Visualisations

def correlation(data):
    """Show an annotated heatmap of the pairwise correlations of ``data``.

    ``data`` is anything exposing ``.corr()`` (a pandas DataFrame in this
    notebook). The figure is displayed immediately; nothing is returned.
    """
    sn.heatmap(data.corr(), annot=True)
    plt.show()

def scatterplot(data):
    """Show a seaborn pair plot (scatter-plot matrix) of all columns of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are plotted against each other.

    Returns
    -------
    None
        The figure is displayed immediately via ``plt.show()``.
    """
    sn.set()
    # `height` is the current name of the per-facet size keyword; the old
    # `size` keyword was renamed in seaborn 0.9 and is rejected by newer releases.
    sn.pairplot(data, height=2.5)
    plt.show()

In [None]:
# Generic Preprocessing

def _select_svd_components(X, variance_ratio=.99):
    """Return the smallest TruncatedSVD component count that keeps enough variance.

    The reference variance is the summed ``explained_variance_`` of a
    TruncatedSVD fitted with ``n_features - 1`` components. The component count
    is then lowered one at a time; the search stops at the first count whose
    summed explained variance is no longer above ``variance_ratio`` times the
    reference, and the last count that still passed is returned.

    Raises
    ------
    ValueError
        If ``X`` has fewer than two feature columns (no reduction is possible).
    """
    n_features = X.shape[1]
    if n_features < 2:
        raise ValueError(
            "SVD reduction needs at least 2 feature columns, got %d" % n_features
        )

    def explained_variance(n_components):
        svd = TruncatedSVD(n_components=n_components, random_state=42)
        svd.fit(X)
        return np.sum(svd.explained_variance_)

    selected = n_features - 1
    max_variance = explained_variance(selected)

    for candidate in range(n_features - 2, 0, -1):
        variance = explained_variance(candidate)
        print(variance, candidate)
        # Written as "not >" so that a NaN ratio also ends the search.
        if not variance / max_variance > variance_ratio:
            break
        selected = candidate

    return selected


def preprocessor(data, targetname, labelEncoding=True):
    """Scale the features, optionally label-encode the target, and reduce with SVD.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the target column and the feature columns. It is not
        modified; the features are taken from a copy without the target.
    targetname : str
        Name of the target column in ``data``.
    labelEncoding : bool, default True
        When true, the target is encoded with ``LabelEncoder``; otherwise it is
        passed through unchanged.

    Returns
    -------
    pandas.DataFrame
        First column is the target, named ``targetname``; the remaining columns
        are the TruncatedSVD components (labelled 0, 1, ...). The frame carries
        the index of ``data`` so target and components stay row-aligned.

    Raises
    ------
    KeyError
        If ``targetname`` is not a column of ``data``.
    ValueError
        If fewer than two feature columns remain after removing the target.
    """
    y = data[targetname]
    # drop() returns a new frame, so the caller's DataFrame keeps its target column.
    X = data.drop(columns=targetname)

    X_scaled = prep.RobustScaler().fit_transform(X)

    if labelEncoding:
        y = prep.LabelEncoder().fit_transform(y)

    n_components = _select_svd_components(X_scaled)
    svd = TruncatedSVD(n_components=n_components, random_state=42)
    dataSVD = svd.fit_transform(X_scaled)

    # Name the target explicitly: an unnamed encoded array would be labelled 0
    # and collide with the first SVD component's column label.
    df_y = pd.DataFrame({targetname: y}, index=data.index)
    df_SVD = pd.DataFrame(dataSVD, index=data.index)
    processedData = pd.concat([df_y, df_SVD], axis=1)

    return processedData

In [None]:
def preprocessorSeoul(data, targetname, labelEncoding=True, SVD_components=5):
    """Preprocess the Seoul bike sharing frame and reduce its features with SVD.

    Steps, in order:

    1. the numeric columns (including ``'Date'``) are scaled with ``RobustScaler``;
    2. ``'Seasons'`` is one-hot encoded and its indicator columns are appended;
    3. ``'Holiday'`` and ``'Functioning Day'`` are mapped to 0.0 / 1.0;
    4. the target is optionally label-encoded;
    5. the features are reduced with TruncatedSVD, keeping the smallest number
       of components whose summed explained variance stays above 99% of the
       variance explained by ``n_features - 1`` components.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with the target column and the feature columns named below.
        ``'Date'`` must already be numeric (or convertible to float). The frame
        is not modified.
    targetname : str
        Name of the target column in ``data``.
    labelEncoding : bool, default True
        When true, the target is encoded with ``LabelEncoder``.
    SVD_components : int, default 5
        Ignored. The component count is always chosen automatically; the
        parameter is kept only so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        First column is the target, named ``targetname``; the remaining columns
        are the SVD components (labelled 0, 1, ...), indexed like ``data``.

    Raises
    ------
    KeyError
        If ``targetname`` or one of the expected feature columns is missing.
    ValueError
        If a binary column holds a value outside its known categories.
    """
    # Every column is listed exactly once: a duplicated label makes pandas return
    # two columns for it and breaks the assignment back into the frame.
    scaledColumns = ['Date', 'Hour', 'Temperature(degC)', 'Humidity(%)',
        'Wind speed (m/s)', 'Visibility (10m)', 'Dew point temperature(degC)',
        'Solar Radiation (MJ/m2)', 'Rainfall(mm)', 'Snowfall (cm)']

    binaryColumns = {'Holiday': {'No Holiday': 0., 'Holiday': 1.},
                     'Functioning Day': {'No': 0., 'Yes': 1.}}

    def explainedVariance(X, nComponents):
        svd = TruncatedSVD(n_components=nComponents, random_state=42)
        svd.fit(X)
        return np.sum(svd.explained_variance_)

    def selectComponents(X, varianceRatio=.99):
        # Lower the component count one step at a time and keep the last count
        # whose explained variance is still above the requested share.
        nFeatures = X.shape[1]
        selected = nFeatures - 1
        maxVariance = explainedVariance(X, selected)
        for candidate in range(nFeatures - 2, 0, -1):
            variance = explainedVariance(X, candidate)
            print(variance, candidate)
            # Written as "not >" so that a NaN ratio also ends the search.
            if not variance / maxVariance > varianceRatio:
                break
            selected = candidate
        return selected

    y = data[targetname]
    # drop() returns a new frame, so the caller's DataFrame is left untouched.
    X = data.drop(columns=targetname)

    # RobustScaler works column by column, so scaling 'Date' together with the
    # other numeric columns gives the same result as scaling it separately.
    X[scaledColumns] = prep.RobustScaler().fit_transform(X[scaledColumns].astype(float))

    # pandas one-hot encoding avoids the OneHotEncoder keywords/methods that
    # differ between scikit-learn releases; indicator columns are appended last.
    X = pd.get_dummies(X, columns=['Seasons'], dtype=float)

    for column, mapping in binaryColumns.items():
        encoded = X[column].map(mapping)
        if encoded.isna().any():
            raise ValueError(
                "Column %r contains values outside %s" % (column, sorted(mapping))
            )
        X[column] = encoded

    if labelEncoding:
        y = prep.LabelEncoder().fit_transform(y)

    nComponents = selectComponents(X)
    svd = TruncatedSVD(n_components=nComponents, random_state=42)
    dataSVD = svd.fit_transform(X)

    # Name the target explicitly so it cannot collide with SVD column label 0,
    # and reuse the input index so both parts stay row-aligned.
    df_y = pd.DataFrame({targetname: y}, index=data.index)
    df_SVD = pd.DataFrame(dataSVD, index=data.index)
    processedData = pd.concat([df_y, df_SVD], axis=1)

    return processedData


In [None]:
# ********************************************
# Breastcancer 
# ********************************************

# DataFrame.drop returns a new frame, so its result has to be kept; otherwise
# the ID column would be passed on to the preprocessor as a feature.
breastcancerRaw = pd.read_csv('breastcancer/breast-cancer-diagnostic.shuf.lrn.csv').drop(columns='ID')
breastcancerProcessed = preprocessor(breastcancerRaw, 'class')

# correlation(pd.DataFrame(breastcancerProcessed))
# scatterplot(pd.DataFrame(breastcancerProcessed.drop('class')))


In [None]:
# ********************************************
# Concrete 
# ********************************************

# Load the concrete data and preprocess it with 'Strength' as the target.
# Label encoding is switched off for this target (presumably because Strength
# is a continuous value -- confirm against the data).
concreteRaw = pd.read_csv('concrete/concrete_data.csv')
concreteProcessed = preprocessor(concreteRaw, targetname='Strength', labelEncoding=False)

# concrete.hist()
# scatterplot(concrete)



In [None]:
# ********************************************
# Seoul Bike Sharing Demand 
# ********************************************

seoulbike = pd.read_csv('seoulbike/SeoulBikeData.csv')

# Convert the day/month/year strings to seconds since 1970-01-01 in one
# vectorised step. Unlike time.mktime, this does not depend on the timezone of
# the machine running the notebook, so the values are reproducible.
seoulbikeDates = pd.to_datetime(seoulbike['Date'], format="%d/%m/%Y")
seoulbike['Date'] = (seoulbikeDates - pd.Timestamp("1970-01-01")) / pd.Timedelta(seconds=1)

seoulbikeProcessed = preprocessorSeoul(seoulbike, 'Rented Bike Count', False)
