# In [None]:
import time
import datetime
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt

import sklearn.preprocessing as prep
from sklearn.decomposition import TruncatedSVD


# Visualisations

def correlation(data):
    """Show an annotated heatmap of the pairwise correlations of ``data``.

    The correlation matrix comes from ``data.corr()``; every cell of the
    heatmap is annotated with its value. The figure is displayed with
    ``plt.show()`` and nothing is returned.
    """
    pairwise_corr = data.corr()
    sn.heatmap(pairwise_corr, annot=True)
    plt.show()

def scatterplot(data):
    """Show a grid of pairwise scatter plots for every column of ``data``.

    Applies seaborn's default theme, draws the pair plot with each facet
    2.5 inches high, displays it with ``plt.show()`` and returns nothing.
    """
    sn.set()
    # seaborn renamed pairplot's ``size`` keyword to ``height``; the old
    # spelling is deprecated, so use the current name.
    sn.pairplot(data, height=2.5)
    plt.show()


# Generic Preprocessing

def preprocessor(data, targetname, labelEncoding=True, SVD_components=5):
    """Scale the features of ``data``, reduce them with SVD and re-attach the target.

    Args:
        data: DataFrame holding the feature columns and the target column.
            It is not modified.
        targetname: Label of the target column in ``data``.
        labelEncoding: When true, the target values are replaced by the
            integer codes produced by ``LabelEncoder``; otherwise they are
            passed through unchanged.
        SVD_components: Number of components kept by ``TruncatedSVD``.

    Returns:
        A new DataFrame with a fresh positional index. Its first column is
        the target, labelled ``targetname``; the remaining columns are the
        SVD components, labelled ``0 .. SVD_components - 1``.

    Raises:
        KeyError: If ``targetname`` is not a column of ``data``.
    """
    # Select instead of pop so the caller's frame keeps its target column.
    y = data[targetname]
    X = data.drop(columns=targetname)

    scaledX = prep.RobustScaler().fit_transform(X)

    if labelEncoding:
        target_values = prep.LabelEncoder().fit_transform(y)
    else:
        # Strip the index: the SVD output below is purely positional, and
        # concat would otherwise misalign rows for a non-default index.
        target_values = y.to_numpy()

    svd = TruncatedSVD(n_components=SVD_components, random_state=42)
    dataSVD = svd.fit_transform(scaledX)

    # Always label the target by name so it cannot collide with SVD column 0.
    df_y = pd.DataFrame({targetname: target_values})
    df_SVD = pd.DataFrame(dataSVD)

    return pd.concat([df_y, df_SVD], axis=1)


# ********************************************
# Breastcancer 
# ********************************************

breastcancerRaw = pd.read_csv('./breastcancer/breast-cancer-diagnostic.shuf.lrn.csv')

# DataFrame.drop returns a new frame rather than editing in place; rebind the
# result so the ID column is really removed before scaling and SVD.
breastcancerRaw = breastcancerRaw.drop('ID', axis=1)

breastcancerProcessed = preprocessor(breastcancerRaw, 'class')

print(breastcancerProcessed)

# correlation(pd.DataFrame(breastcancerProcessed))
# scatterplot(pd.DataFrame(breastcancerProcessed.drop('class')))


# ********************************************
# Concrete 
# ********************************************

# Load the concrete data and preprocess it with label encoding switched off
# for the 'Strength' target.
concreteRaw = pd.read_csv('concrete/concrete_data.csv')
concreteProcessed = preprocessor(
    concreteRaw,
    targetname='Strength',
    labelEncoding=False,
)

print(concreteProcessed)

# concrete.hist()
# scatterplot(concrete)


# ********************************************
# Seoul Bike Sharing Demand 
# ********************************************

seoulbike = pd.read_csv('seoulbike/SeoulBikeData.csv')

# Convert each "dd/mm/YYYY" date to seconds since the epoch (local time, via
# time.mktime). A single map builds a new numeric Series, instead of writing
# floats one by one into an alias of the column, which is chained assignment
# and leaves the column with object dtype.
dates = seoulbike['Date'].map(
    lambda day: time.mktime(
        datetime.datetime.strptime(str(day), "%d/%m/%Y").timetuple()
    )
)

seoulbike['Date'] = dates
 