In [5]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score

In [6]:
def loadData(trainingFile, testingFile, n_features=100000):
    """
    Read the training and testing files into dense 0/1 feature frames.

    Every training line is expected to look like ``<label><whitespace><idx> <idx> ...``
    and every testing line like ``<idx> <idx> ...``, where each ``idx`` is a
    1-based feature index: index ``idx`` sets column ``idx - 1`` of that row to 1.

    Args:
    trainingFile (str): Path to the labelled training file.
    testingFile (str): Path to the unlabelled testing file.
    n_features (int): Number of feature columns in the returned frames.
        Defaults to 100000, the value that used to be hard-coded.

    Returns:
    pandas.DataFrame, pandas.DataFrame, numpy.ndarray: Training features,
    testing features and the training labels (kept as strings, e.g. '0'/'1').

    Raises:
    IndexError: If a feature index lies outside ``1..n_features``.
    ValueError: If a feature token is not an integer.
    """

    def convertDataframe(inputFile):
        # Allocate the whole matrix once. Appending rows to a DataFrame with
        # ``.loc[i] = ...`` re-allocates the frame for every record.
        matrix = np.zeros((len(inputFile), n_features), dtype=np.int64)

        for row, text in enumerate(inputFile):
            tokens = text.split()
            if not tokens:
                # A record without features stays an all-zero row.
                continue

            indices = np.asarray([int(token) for token in tokens], dtype=np.int64)
            # Reject out-of-range indices explicitly: an index of 0 would
            # otherwise wrap around to the last column via position -1.
            if indices.min() < 1 or indices.max() > n_features:
                raise IndexError(
                    "feature index out of range 1..%d in record %d" % (n_features, row)
                )
            matrix[row, indices - 1] = 1

        return pd.DataFrame(matrix, columns=range(n_features))

    with open(trainingFile, "r") as fr1:
        trainFile = fr1.readlines()

    # Split each training line into its label and its feature indices.
    train_data_list = []
    train_labels_list = []

    for inputData in trainFile:
        # Split on the first whitespace run only, so the label is removed
        # without touching any of the feature tokens that follow it.
        parts = inputData.split(None, 1)
        if not parts:
            # Blank line: there is no label to learn from, so skip it.
            continue
        train_labels_list.append(parts[0])
        train_data_list.append(parts[1] if len(parts) > 1 else "")

    train_labels = np.asarray(train_labels_list)
    train_data = convertDataframe(train_data_list)

    with open(testingFile, "r") as fr2:
        testFile = fr2.readlines()

    # Every test line yields exactly one row, so predictions stay aligned
    # with the lines of the input file.
    test_data = convertDataframe(testFile)

    return train_data, test_data, train_labels

In [7]:
def pca(train_data, test_data, k, random_state=42):
    """
    Perform PCA on training and testing data.

    The projection is fitted on the training data only and then applied
    unchanged to the testing data, so no test information enters the fit.

    Args:
    train_data (pandas.DataFrame): Training data.
    test_data (pandas.DataFrame): Testing data.
    k (int): Number of components for PCA.
    random_state (int or None): Seed forwarded to PCA so that repeated runs
        give the same projection when a randomized solver is selected.
        Pass None to restore unseeded behaviour.

    Returns:
    numpy.ndarray, numpy.ndarray: Projected training data, projected testing data.
    """
    # Named `reducer` so the local object does not shadow this function's name.
    reducer = PCA(n_components=k, random_state=random_state)
    PCA_projected_trainData = reducer.fit_transform(train_data)
    PCA_projected_testData = reducer.transform(test_data)
    return PCA_projected_trainData, PCA_projected_testData

In [8]:
def classifier(PCA_projected_trainData, PCA_projected_testData, train_labels):
    """
    Fit a Bernoulli Naive Bayes model and label the testing data with it.

    A new model is trained on every call; nothing is cached between calls.

    Args:
    PCA_projected_trainData (numpy.ndarray): Projected training data used for fitting.
    PCA_projected_testData (numpy.ndarray): Projected data to predict labels for.
    train_labels (numpy.ndarray): Labels matching the rows of the training data.

    Returns:
    numpy.ndarray: Predicted label for each row of PCA_projected_testData.
    """
    # NOTE(review): BernoulliNB models binary features; check how it treats
    # the continuous PCA scores (its `binarize` setting) and confirm that
    # this is the intended behaviour.
    model = BernoulliNB().fit(PCA_projected_trainData, train_labels)
    return model.predict(PCA_projected_testData)

In [9]:
def writeOutput(predictions, output_file):
    """
    Save predictions to a text file, one prediction per line.

    The file is opened in write mode, so existing content is replaced.

    Args:
    predictions (numpy.ndarray): Predictions to write, in order.
    output_file (str): Path to the output file.
    """
    with open(output_file, 'w') as sink:
        for label in predictions:
            sink.write("%s\n" % label)

In [33]:
# Load data: parse both files into dense 0/1 feature frames plus the training labels.
training_data, testing_data, training_labels = loadData('train.dat', 'test.dat')

In [11]:
# Display the training feature frame (one row per record, one column per feature).
training_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,99990,99991,99992,99993,99994,99995,99996,99997,99998,99999
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
796,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
797,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
798,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Show the first rows of the training frame.
training_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,99990,99991,99992,99993,99994,99995,99996,99997,99998,99999
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [13]:
# Summarize the training frame: row/column counts, dtypes and memory usage.
training_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 800 entries, 0 to 799
Columns: 100000 entries, 0 to 99999
dtypes: int64(100000)
memory usage: 610.4 MB


In [14]:
# (rows, columns) of the training frame.
training_data.shape

(800, 100000)

In [15]:
# Per-column summary statistics of the training features.
training_data.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,99990,99991,99992,99993,99994,99995,99996,99997,99998,99999
count,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,...,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0
mean,0.0125,0.0,0.00125,0.0,0.02,0.00375,0.0175,0.00125,0.00625,0.015,...,0.01375,0.0075,0.0,0.00125,0.0,0.00625,0.0075,0.0125,0.00625,0.035
std,0.111172,0.0,0.035355,0.0,0.140088,0.061161,0.131207,0.035355,0.078859,0.121628,...,0.116524,0.086331,0.0,0.035355,0.0,0.078859,0.086331,0.111172,0.078859,0.183895
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0


In [16]:
# Count missing values in each training column.
training_data.isnull().sum()

0        0
1        0
2        0
3        0
4        0
        ..
99995    0
99996    0
99997    0
99998    0
99999    0
Length: 100000, dtype: int64

In [17]:
# Display the testing feature frame (one row per record, one column per feature).
testing_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,99990,99991,99992,99993,99994,99995,99996,99997,99998,99999
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
345,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
346,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
347,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
348,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0


In [18]:
# Show the first rows of the testing frame.
testing_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,99990,99991,99992,99993,99994,99995,99996,99997,99998,99999
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# Summarize the testing frame: row/column counts, dtypes and memory usage.
testing_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 350 entries, 0 to 349
Columns: 100000 entries, 0 to 99999
dtypes: int64(100000)
memory usage: 267.0 MB


In [20]:
# (rows, columns) of the testing frame.
testing_data.shape

(350, 100000)

In [21]:
# Per-column summary statistics of the testing features.
testing_data.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,99990,99991,99992,99993,99994,99995,99996,99997,99998,99999
count,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,...,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0
mean,0.014286,0.0,0.0,0.0,0.011429,0.0,0.02,0.0,0.014286,0.005714,...,0.022857,0.008571,0.0,0.005714,0.0,0.011429,0.008571,0.014286,0.002857,0.04
std,0.118836,0.0,0.0,0.0,0.106444,0.0,0.1402,0.0,0.118836,0.075485,...,0.149662,0.092316,0.0,0.075485,0.0,0.106444,0.092316,0.118836,0.053452,0.19624
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0


In [22]:
# Count missing values in each testing column.
testing_data.isnull().sum()

0        0
1        0
2        0
3        0
4        0
        ..
99995    0
99996    0
99997    0
99998    0
99999    0
Length: 100000, dtype: int64

In [23]:
# Perform PCA: fit on the training frame and project both sets onto 500 components.
# NOTE(review): the output filenames written further down say "k-100" while k is 500 here — confirm which is intended.
PCA_projected_trainData, PCA_projected_testData = pca(training_data, testing_data, 500)

In [24]:
# Inspect the PCA-projected training array.
PCA_projected_trainData

array([[-1.12588063, -0.12843588, -0.10290311, ...,  1.26843016,
         1.50248048, -0.04614513],
       [-1.3084465 ,  0.19359076, -0.40511008, ..., -0.65187923,
        -0.74560428, -2.27833238],
       [-1.40500663,  0.09790089, -0.3454316 , ..., -0.67012797,
        -0.87244407, -0.11917497],
       ...,
       [ 4.84998229, -4.85770264,  3.79228264, ...,  0.44838165,
        -0.04382407,  0.76465553],
       [-1.22658018,  0.28182164, -0.44175987, ..., -0.81315085,
        -1.14776027, -1.07920566],
       [-1.41125448,  0.39474357, -0.20262831, ...,  1.28076139,
        -0.33352255,  0.15523199]])

In [25]:
# Inspect the PCA-projected testing array.
PCA_projected_testData

array([[-0.58553161, -0.24295881, -0.09959482, ...,  0.16469614,
        -0.12746058, -0.07626548],
       [-1.15338059,  0.05895854, -0.1234527 , ..., -0.0302093 ,
         0.41637239, -0.00139884],
       [-0.74393648,  0.78502356,  0.60948551, ..., -0.15521366,
         0.16724566, -0.06503108],
       ...,
       [-0.97419831,  0.20911034, -0.45770536, ...,  0.06428891,
         0.11087054, -0.23850642],
       [-1.1214388 ,  0.73908004,  0.26961088, ..., -0.1664028 ,
        -0.02395061,  0.01619082],
       [-1.27863347,  0.06716937, -0.27188503, ...,  0.19630595,
         0.13068362, -0.10288748]])

In [26]:
# Fit the Naive Bayes classifier on the projected training data and predict labels for the projected testing data.
predictions = classifier(PCA_projected_trainData, PCA_projected_testData, training_labels)

In [27]:
# Write the testing-data predictions to their own file, one label per line.
# NOTE(review): the filename says "k-100" but PCA above was run with 500 components — confirm and rename if needed.
writeOutput(predictions, 'output-k-100-PCA-BNBC.dat')

In [28]:
# Inspect the predicted labels for the testing data.
predictions

array(['0', '0', '1', '0', '0', '0', '0', '1', '0', '0', '1', '1', '0',
       '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '0',
       '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0',
       '0', '0', '0', '0', '0', '0', '0', '0', '1', '0', '0', '1', '1',
       '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '0', '0',
       '0', '0', '0', '0', '0', '0', '0', '0', '1', '1', '0', '0', '0',
       '0', '0', '0', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0',
       '0', '0', '0', '0', '0', '0', '0', '1', '0', '0', '0', '0', '0',
       '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1',
       '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0',
       '0', '0', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0',
       '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0',
       '0', '0', '0', '0', '1', '0', '0', '1', '0', '0', '1', '0', '0',
       '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '1', '0

In [29]:
# Classify the training data itself to obtain predicted labels for the training rows.
# The classifier is fitted again on the same projected training data inside classifier().
training_predictions = classifier(PCA_projected_trainData, PCA_projected_trainData, training_labels)

In [30]:
# Write the training-data predictions to their own file, one label per line.
# NOTE(review): the filename says "k-100" but PCA above was run with 500 components — confirm and rename if needed.
writeOutput(training_predictions, 'output-k-100-PCA-BNBC-train.dat')

In [31]:
# Inspect the predicted labels for the training data.
training_predictions

array(['0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0',
       '0', '0', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0',
       '0', '1', '0', '0', '0', '1', '1', '1', '0', '0', '0', '0', '0',
       '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0',
       '0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '0', '0', '0',
       '0', '0', '0', '0', '0', '0', '1', '0', '0', '0', '0', '0', '1',
       '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0',
       '0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '0', '0', '0',
       '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0',
       '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0',
       '1', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0',
       '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0',
       '0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '0', '0', '0',
       '0', '1', '0', '0', '0', '0', '0', '0', '0', '1', '0', '0

In [32]:
# Calculate accuracy on the training data: the predictions are for the same rows the
# classifier was fitted on, so this is an in-sample figure, not a testing accuracy.

accuracy = accuracy_score(training_labels, training_predictions)
print("Training Accuracy:", accuracy)

Training Accuracy: 0.95375
