In [1]:
# import required packages
import numpy as np
import matplotlib.pyplot as plt  # For creating plots

from sklearn.preprocessing import scale
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC
from sklearn.svm import LinearSVR
from sklearn.linear_model import LinearRegression

from sklearn.feature_selection import r_regression
from sklearn.feature_selection import SelectKBest, f_classif, chi2

from sklearn.model_selection import train_test_split

from sklearn.svm import SVC
from sklearn.svm import SVR

from sklearn.feature_selection import VarianceThreshold

from sklearn.metrics import mean_squared_error

In [2]:
def getGarmentInitial(path="gwp_assessment.csv"):
    '''Load the raw garment dataset from a CSV file, split into column groups.

    The first line of the file is skipped as a header. Every group is read with
    the same comma-delimited parsing options; only the columns and dtype differ.

    Parameters
    ----------
    path : str, optional
        Location of the CSV file. Defaults to "gwp_assessment.csv", so existing
        calls without arguments behave as before.

    Returns
    -------
    tuple
        (initDatasetDate, initDatasetCategorical, initDatasetNumerical, target)
        - initDatasetDate: column 0 read as strings
        - initDatasetCategorical: columns 1-3 read as strings
        - initDatasetNumerical: columns 4-13 read as floats
        - target: column 14 read as floats
    '''
    import numpy as np

    def readColumns(columns, dtype):
        #single place for the parsing options shared by all four column groups
        return np.genfromtxt(path, delimiter=",", dtype=dtype, encoding=None, missing_values=None, filling_values=None, skip_header=1, usecols=columns)

    #get data from file
    initDatasetDate = readColumns(0, "U25")
    initDatasetCategorical = readColumns((1, 2, 3), "U25")
    initDatasetNumerical = readColumns(range(4, 14), "f8")
    target = readColumns(14, "f8")
    return initDatasetDate, initDatasetCategorical, initDatasetNumerical, target

In [3]:
def imputeAboveMultiple(dataToImpute):
    '''Forward-fill blank entries in every column of a 2D string array.

    Each entry equal to '' is replaced with the value directly above it in the
    same column. Rows are processed top to bottom, so a run of blanks is filled
    with the last non-blank value above the run.

    The first row has nothing above it, so a blank there is left unchanged
    rather than being filled from the last row of the array.

    Parameters
    ----------
    dataToImpute : numpy.ndarray
        2D array indexed as [row, column]. It is modified in place.

    Returns
    -------
    numpy.ndarray
        The same array object that was passed in.
    '''
    #take the size from the shape so arrays with zero or one row are handled
    rowCount, columnCount = dataToImpute.shape
    #for each column
    for column in range(columnCount):
        #for each row below the first (row 0 has no value above it)
        for row in range(1, rowCount):
            if dataToImpute[row, column] == '':
                dataToImpute[row, column] = dataToImpute[row - 1, column]
    return dataToImpute


    

In [4]:
def dateImputeAbove(dataToImpute):
    '''Forward-fill blank entries in the 1D date column.

    Each entry equal to '' is replaced with the entry directly above it. Rows
    are processed top to bottom, so consecutive blanks all receive the last
    non-blank value above them.

    The first entry has nothing above it, so a blank there is left unchanged
    rather than being filled from the last entry of the column.

    Parameters
    ----------
    dataToImpute : numpy.ndarray
        1D array of date strings. It is modified in place.

    Returns
    -------
    numpy.ndarray
        The same array object that was passed in.
    '''
    #for each row below the first (row 0 has no value above it)
    for row in range(1, len(dataToImpute)):
        if dataToImpute[row] == '':
            dataToImpute[row] = dataToImpute[row - 1]
    return dataToImpute

In [5]:
def garmentDetectAndEncodeDate(dataToEncode):
    '''Split month/day/year date strings into separate columns and encode them as floats.

    Each date string is split on '/'. The day is ordinal encoded into a single
    column; the month and the year are each one-hot encoded.

    Parameters
    ----------
    dataToEncode : sequence of str
        Date strings written as month/day/year.

    Returns
    -------
    numpy.ndarray
        2D float array whose columns are, in order: the ordinal-encoded day,
        the one-hot month columns, the one-hot year columns.
    '''
    from sklearn.preprocessing import OneHotEncoder
    from sklearn.preprocessing import OrdinalEncoder
    import numpy as np

    def makeOneHotEncoder():
        #scikit-learn 1.2 renamed `sparse` to `sparse_output` and later releases
        #dropped the old name, so try the new keyword first and fall back for older versions
        try:
            return OneHotEncoder(categories='auto', dtype=float, sparse_output=False)
        except TypeError:
            return OneHotEncoder(categories='auto', dtype=float, sparse=False)

    #generate day, month and year columns from date
    #month/day/year
    dateParts = [dateString.split('/') for dateString in dataToEncode]
    day = np.array([[parts[1]] for parts in dateParts])
    month = np.array([[parts[0]] for parts in dateParts])
    year = np.array([[parts[2]] for parts in dateParts])

    #ordinal encoding of the day
    #NOTE(review): the categories are inferred from the day *strings*, so the
    #ordinal order is presumably lexicographic ('10' before '2') rather than
    #calendar order - confirm this is intended before relying on the ordering
    dayEncoded = OrdinalEncoder(categories='auto', dtype=float).fit_transform(day)

    #one hot encoding of the month and the year
    monthEncoded = makeOneHotEncoder().fit_transform(month)
    yearEncoded = makeOneHotEncoder().fit_transform(year)

    #the encoded blocks are already float, so they are joined directly instead
    #of being written back into a string array and converted at the end
    datasetDate = np.concatenate([dayEncoded, monthEncoded, yearEncoded], axis=1)
    return datasetDate

In [6]:
def garmentEncodeCategorical(categoricalToEncode):
    '''One-hot encode the first three categorical columns of the garment dataset.

    Parameters
    ----------
    categoricalToEncode : numpy.ndarray
        2D array indexed as [row, column]. Columns 0, 1 and 2 are one-hot
        encoded. Any columns after those are kept and converted to float.
        The input array is not modified.

    Returns
    -------
    numpy.ndarray
        2D float array: the one-hot columns for column 0, then column 1, then
        column 2, followed by any remaining input columns.
    '''
    from sklearn.preprocessing import OneHotEncoder
    import numpy as np

    def makeOneHotEncoder():
        #scikit-learn 1.2 renamed `sparse` to `sparse_output` and later releases
        #dropped the old name, so try the new keyword first and fall back for older versions
        try:
            return OneHotEncoder(categories='auto', dtype=float, sparse_output=False)
        except TypeError:
            return OneHotEncoder(categories='auto', dtype=float, sparse=False)

    #do one hot encoding for categorical data, one block of columns per input column
    encodedBlocks = [makeOneHotEncoder().fit_transform(categoricalToEncode[:, [column]]) for column in (0, 1, 2)]

    #columns beyond the three categorical ones are passed through as floats
    encodedBlocks.append(categoricalToEncode[:, 3:].astype(float))

    #every block is already float, so no string/float concatenation is needed
    return np.concatenate(encodedBlocks, axis=1)

In [7]:
def concatenateAndImputeNumericalGarment(datasetDate, datasetCategorical, datasetNumerical):
    '''Join the garment column groups side by side and fill missing values by iterative imputation.

    The groups are concatenated along the column axis in the order date,
    categorical, numerical, and the combined array is passed through an
    IterativeImputer created with random_state=17.

    Returns the imputed array produced by the imputer.
    '''
    from sklearn.experimental import enable_iterative_imputer
    from sklearn.impute import IterativeImputer
    import numpy as np

    #column order of the result: date, then categorical, then numerical
    columnGroups = (datasetDate, datasetCategorical, datasetNumerical)
    combined = np.concatenate(columnGroups, axis=1)

    #impute missing numerical data using iterative imputing
    return IterativeImputer(random_state=17).fit_transform(combined)

In [8]:
#final preprocessing for garment dataset
def pruneGarment(dataset, target, k=13, useAbsoluteCorrelation=False):
    '''Drop constant features from the garment dataset, then keep the k best by Pearson correlation.

    Parameters
    ----------
    dataset : array-like
        2D feature matrix.
    target : array-like
        Target values, one per row of dataset.
    k : int, optional
        Number of features to keep. Defaults to 13, the amount the notebook's
        own testing found to be optimal.
    useAbsoluteCorrelation : bool, optional
        When False (the default) features are ranked by the signed Pearson
        coefficient, which is the original behaviour. When True they are
        ranked by its absolute value, so strongly negatively correlated
        features are treated as relevant too.

    Returns
    -------
    numpy.ndarray
        The dataset restricted to the selected features.
    '''
    import numpy as np
    from sklearn.feature_selection import SelectKBest
    from sklearn.feature_selection import r_regression
    from sklearn.feature_selection import VarianceThreshold

    #remove constant features
    dataset = VarianceThreshold().fit_transform(dataset)

    #NOTE(review): SelectKBest keeps the k highest scores and r_regression is
    #signed, so with the default ranking a feature with a strong negative
    #correlation scores lowest. The value of k was tuned with that ranking,
    #so re-run the feature-count test before switching the default.
    if useAbsoluteCorrelation:
        def scoreFunction(X, y):
            return np.abs(r_regression(X, y))
    else:
        scoreFunction = r_regression

    #select best k features using pearson coefficient
    skb = SelectKBest(scoreFunction, k=k)
    datasetSelected = skb.fit_transform(dataset, target) # dataset with selected features
    return datasetSelected

In [9]:
#test number of features for garment
def testGarment(dataset=None, target=None, randomState=17):
    '''Chart how a linear regression scores against the number of selected garment features.

    For every feature count from 1 to the number of columns, the best features
    are chosen with the Pearson correlation coefficient, a LinearRegression is
    trained on the min-max normalised training split, and the value returned
    by model.score on each split is printed. The test scores are then plotted
    against the feature count.

    One seeded train/test split is made up front and reused for every feature
    count, so the points on the chart are comparable and a re-run reproduces
    them. Feature selection and scaling are fitted on the training split only,
    so no information from the test rows reaches the model.

    Parameters
    ----------
    dataset : array-like, optional
        Feature matrix. Defaults to the notebook-level `garmentDataset`.
    target : array-like, optional
        Target values. Defaults to the notebook-level `garmentTarget`.
    randomState : int, optional
        Seed for the train/test split.
    '''
    #fall back to the arrays defined at notebook level, as the original did
    if dataset is None:
        dataset = garmentDataset
    if target is None:
        target = garmentTarget

    # split the dataset into training and testing once, with a fixed seed
    x_train, x_test, y_train, y_test = train_test_split(dataset, target, test_size=0.2, random_state=randomState)

    featuresPlot = range(1, dataset.shape[1]+1)
    accuracyPlot = []
    for f in featuresPlot:
        # feature selection, fitted on the training rows only
        skb = SelectKBest(r_regression, k=f)
        x_train_selected = skb.fit_transform(x_train, y_train)
        x_test_selected = skb.transform(x_test)

        # normalize using the range seen in the training data
        mms = MinMaxScaler(feature_range=(-1, 1))
        x_train_norm = mms.fit_transform(x_train_selected)
        x_test_norm = mms.transform(x_test_selected)

        # train a regression model using the training data
        model = LinearRegression()
        model.fit(x_train_norm, y_train)
        train_acc = model.score(x_train_norm, y_train) # score on the training data

        # testing
        test_acc = model.score(x_test_norm, y_test) # score on the held-out data
        accuracyPlot.append(test_acc)

        print(f'Using {f} features:')
        print('Training Accuracy: ', train_acc) # Accuracy on the training data
        print('Testing Accuracy: ', test_acc) # Accuracy on the testing data

    #plot a chart
    plt.plot(featuresPlot, accuracyPlot)
    plt.title("Accuracy against features for garment")
    plt.xlabel('Number of features')
    plt.ylabel('Accuracy')
    plt.show()


In [10]:
def getStarsInitial(path="star_assessment.csv"):
    '''Load the raw stars dataset from a CSV file as a feature matrix and a label column.

    The first line of the file is skipped as a header.

    Parameters
    ----------
    path : str, optional
        Location of the CSV file. Defaults to "star_assessment.csv", so
        existing calls without arguments behave as before.

    Returns
    -------
    tuple
        (initFeatures, target)
        - initFeatures: columns 0-16 read as floats, with missing entries
          filled with NaN
        - target: column 17 read as strings
    '''
    import numpy as np
    #get dataset and create arrays of features and labels
    #convert all features to float in order to create 2d array
    #get array of features
    initFeatures = np.genfromtxt(path, delimiter=",", dtype="f8", encoding=None, missing_values='', filling_values=np.nan, usecols=range(0,17), skip_header=1)
    #get array of labels
    target = np.genfromtxt(path, delimiter=",", dtype="U25", encoding=None, missing_values='', filling_values=None, usecols=17, skip_header=1)
    #the header row was previously read into an unused variable; that extra
    #pass over the file has been dropped
    return initFeatures, target

In [11]:
def imputeStars(features):
    '''Fill missing values in the stars feature matrix by iterative imputation.

    The features are passed through an IterativeImputer created with
    random_state=17 and the imputed array is returned.
    '''
    import numpy as np
    from sklearn.experimental import enable_iterative_imputer
    from sklearn.impute import IterativeImputer

    #impute missing data using iterative imputing
    starsImputer = IterativeImputer(random_state=17)
    imputedFeatures = starsImputer.fit_transform(features)
    return imputedFeatures

In [12]:
def encodeStars(labels):
    '''Turn the stars label column into the integer codes produced by a LabelEncoder.

    A fresh LabelEncoder is fitted on the given labels and the same labels are
    then transformed with it; the encoded labels are returned.
    '''
    from sklearn.preprocessing import LabelEncoder

    #fit the encoder and encode the labels in one step
    labelEncoder = LabelEncoder()
    encodedLabels = labelEncoder.fit_transform(labels)
    return encodedLabels

In [13]:
#final preprocessing for stars dataset
def pruneStars(dataset, target, k=15, useAbsoluteCorrelation=False):
    '''Drop constant features from the stars dataset, then keep the k best by Pearson correlation.

    Parameters
    ----------
    dataset : array-like
        2D feature matrix.
    target : array-like
        Numeric target values, one per row of dataset.
    k : int, optional
        Number of features to keep. Defaults to 15, the amount the notebook's
        own testing found to be optimal.
    useAbsoluteCorrelation : bool, optional
        When False (the default) features are ranked by the signed Pearson
        coefficient, which is the original behaviour. When True they are
        ranked by its absolute value, so strongly negatively correlated
        features are treated as relevant too.

    Returns
    -------
    numpy.ndarray
        The dataset restricted to the selected features.
    '''
    import numpy as np
    from sklearn.feature_selection import SelectKBest
    from sklearn.feature_selection import r_regression
    from sklearn.feature_selection import VarianceThreshold

    #remove constant features
    dataset = VarianceThreshold().fit_transform(dataset)

    #NOTE(review): SelectKBest keeps the k highest scores and r_regression is
    #signed, so with the default ranking a feature with a strong negative
    #correlation scores lowest. The value of k was tuned with that ranking,
    #so re-run the feature-count test before switching the default.
    #NOTE(review): if target holds encoded class labels, a linear correlation
    #with the arbitrary label codes may not be meaningful - confirm with the caller.
    if useAbsoluteCorrelation:
        def scoreFunction(X, y):
            return np.abs(r_regression(X, y))
    else:
        scoreFunction = r_regression

    #select best k features using pearson coefficient
    skb = SelectKBest(scoreFunction, k=k)
    datasetSelected = skb.fit_transform(dataset, target) # dataset with selected features
    return datasetSelected

In [14]:
def testStars(features=None, target=None, randomState=17):
    '''Chart how a linear SVM classifier scores against the number of selected stars features.

    For every feature count from 1 to the number of columns, the best features
    are chosen with the Pearson correlation coefficient, a LinearSVC is
    trained on the min-max normalised training split, and the value returned
    by model.score on each split is printed. The test scores are then plotted
    against the feature count.

    One seeded train/test split is made up front and reused for every feature
    count, and the classifier is seeded too, so the points on the chart are
    comparable and a re-run reproduces them. Feature selection and scaling are
    fitted on the training split only, so no information from the test rows
    reaches the model.

    Parameters
    ----------
    features : array-like, optional
        Feature matrix. Defaults to the notebook-level `starsFeatures`.
    target : array-like, optional
        Class labels. Defaults to the notebook-level `starsTarget`.
    randomState : int, optional
        Seed for the train/test split and for the classifier.
    '''
    #fall back to the arrays defined at notebook level, as the original did
    if features is None:
        features = starsFeatures
    if target is None:
        target = starsTarget

    # split the dataset into training and testing once, with a fixed seed
    x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=randomState)

    featuresPlot = range(1, features.shape[1]+1)
    accuracyPlot = []
    for f in featuresPlot:
        # feature selection, fitted on the training rows only
        skb = SelectKBest(r_regression, k=f)
        x_train_selected = skb.fit_transform(x_train, y_train)
        x_test_selected = skb.transform(x_test)

        # normalize using the range seen in the training data
        mms = MinMaxScaler(feature_range=(0, 1))
        x_train_norm = mms.fit_transform(x_train_selected)
        x_test_norm = mms.transform(x_test_selected)

        # train a classifier using the training data
        model = LinearSVC(random_state=randomState)
        model.fit(x_train_norm, y_train)
        train_acc = model.score(x_train_norm, y_train) # estimate training accuracy

        # testing
        test_acc = model.score(x_test_norm, y_test) # estimate testing accuracy
        accuracyPlot.append(test_acc)

        print(f'Using {f} features:')
        print('Training Accuracy: ', train_acc) # Accuracy on the training data
        print('Testing Accuracy: ', test_acc) # Accuracy on the testing data

    #plot a chart
    plt.plot(featuresPlot, accuracyPlot)
    plt.title("Accuracy against features for stars")
    plt.xlabel('Number of features')
    plt.ylabel('Accuracy')
    plt.show()

Markdown question for 3_2:
The Pearson correlation coefficient measures the strength of the linear correlation between two variables and ranges from -1 to 1. The closer the absolute value of the coefficient between a feature and the label is to 1, the greater the relevance of that feature in predicting the label. The Pearson correlation coefficient can therefore also be used to find and weed out features that appear to be unrelated to the label, as they will have a value close to 0. It is worth noting that a large negative value indicates a strong negative correlation, which is just as useful for predicting the label as a strong positive one.

Therefore, the Pearson correlation coefficient can be used to select only the features that are relevant to the label, in order to reduce noise in the dataset. This reduction in noise is likely to bring better results when the dataset is used to train a machine learning algorithm, and it also reduces the processing time needed to fit the model, since fewer features have to be fitted.