In [1]:
# Example Feature Extraction from XML Files
# We count the number of specific system calls made by the programs, and use
# these as our features.

# This code requires that the unzipped training set is in a folder called "train". 

import os
from collections import Counter
try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET
import numpy as np
from scipy import sparse

import util

TRAIN_DIR = "train"

call_set = set([])

def add_to_set(tree):
    for el in tree.iter():
        call = el.tag
        call_set.add(call)

def create_data_matrix(start_index, end_index, direc="train"):
    X = None
    classes = []
    ids = [] 
    i = -1
    for datafile in os.listdir(direc):
        if datafile == '.DS_Store':
            continue

        i += 1
        if i < start_index:
            continue 
        if i >= end_index:
            break

        # extract id and true class (if available) from filename
        id_str, clazz = datafile.split('.')[:2]
        ids.append(id_str)
        # add target class if this is training data
        try:
            classes.append(util.malware_classes.index(clazz))

        except ValueError:
            # we should only fail to find the label in our list of malware classes
            # if this is test data, which always has an "X" label
            assert clazz == "X"
            classes.append(-1)

        # parse file as an xml document
        tree = ET.parse(os.path.join(direc,datafile))
        if direc=="train":
            add_to_set(tree)
        this_row = call_feats(tree)
        if X is None:
            X = this_row 
        else:
            X = np.vstack((X, this_row))

    return X, np.array(classes), ids

def call_feats(tree):
    good_calls = ['sleep', 'dump_line']

    call_counter = {}
    for el in tree.iter():
        call = el.tag
        if call not in call_counter:
            call_counter[call] = 1
        else:
            call_counter[call] += 1

    call_feat_array = np.zeros(len(good_calls))
    for i in range(len(good_calls)):
        call = good_calls[i]
        call_feat_array[i] = 0
        if call in call_counter:
            call_feat_array[i] = call_counter[call]

    return call_counter

# # Feature extraction
# def main():
#     X_train, t_train, train_ids = create_data_matrix(0, 5, TRAIN_DIR)
#     X_valid, t_valid, valid_ids = create_data_matrix(10, 15, TRAIN_DIR)

#     print 'Data matrix (training set):'
#     print X_train
#     print 'Classes (training set):'
#     print t_train

#     # From here, you can train models (eg by importing sklearn and inputting X_train, t_train).

# if __name__ == "__main__":
#     main()
    

In [2]:
X_train, t_train, train_ids = create_data_matrix(0, 3086, TRAIN_DIR)

In [3]:
X_test, t_test, test_ids = create_data_matrix(0, 3728, 'test')

In [4]:
def convertMatrix(X, call_set):
    features = dict()
    for i in call_set:
        features[i] = []
    
    for i in X:
        for j in call_set:
            if j in i[0].keys():
                features[j].append(i[0][j])
            else:
                features[j].append(0)
    
    return features

In [5]:
import pandas as pd

In [6]:
train_set = pd.DataFrame(convertMatrix(X_train,call_set))
test_set= pd.DataFrame(convertMatrix(X_test,call_set))
# train_set['id'] = train_ids
# train_set['class'] = t_train

In [7]:
#train_set

In [None]:
os.listdir('train/')[0].split('.')[0]

In [None]:
feature_eng_dir = 'train/'
id_list_feat_eng = []
hash_error_list_feat_eng = []
security_anony_list_feat_eng = []
class_list_feat_eng = []
for file_name in os.listdir('train/'):
    if file_name == '.DS_Store':
        continue
    content = open(feature_eng_dir + file_name, 'r').read()
    num_of_line = content.count('<')
    num_of_hash_error = content.count('hash_error')
    num_of_security_anony = content.count('SECURITY_ANONYMOUS')
    class_list_feat_eng.append(file_name.split('.')[1])
    
    id_list_feat_eng.append(file_name.split('.')[0])
    hash_error_list_feat_eng.append(1.0*num_of_hash_error/num_of_line)
    security_anony_list_feat_eng.append(1.0*num_of_security_anony/num_of_line)

In [None]:
import copy
train_set_2=copy.deepcopy(train_set)
train_set_2['id'] =train_ids
feat_eng_df = pd.DataFrame({'id':id_list_feat_eng, 'hash_error':hash_error_list_feat_eng, 'security_anonymous':security_anony_list_feat_eng})
new=pd.merge(train_set_2, feat_eng_df, on='id')
new=new.drop('id',axis=1)

In [None]:
from scipy.stats.stats import pearsonr   
b=new['security_anonymous'].values
type(t_train)
pearsonr(t_train,b)

In [15]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

h = .02  # step size in the mesh

names = ["Nearest Neighbors", "Linear SVM","Poly SVM","Sigmoid SVM","RBF SVM", "Decision Tree",
         "Random Forest",  "Linear Discriminant Analysis",
         "Quadratic Discriminant Analysis"]
classifiers = [
    KNeighborsClassifier(5),
    SVC(kernel="linear", C=0.025),
    SVC(kernel="poly", C=0.025),
    SVC(kernel="sigmoid", C=0.025),
    SVC(kernel="rbf",C=0.025),
    #SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_depth=10),
    RandomForestClassifier(max_depth=10, n_estimators=10, max_features=1),
    #AdaBoostClassifier(),
    #GaussianNB(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis()]

In [None]:
y_train = t_train[:2000]
X_train = train_set[:2000]
y_valid = t_train[2000:]
X_valid = train_set[2000:]
y_test = t_test
X_test = test_set

In [None]:
for name, clf in zip(names, classifiers):
#         ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
        clf.fit(X_train, y_train)
        score = clf.score(X_valid, y_valid)
        print name, score


In [None]:
clf=RandomForestClassifier(max_depth=10, n_estimators=10, max_features=1)
clf.fit(train_set, t_train)
t_test=clf.predict(X_test)

In [None]:
t_test

In [7]:
# these are the fifteen malware classes we're looking for
malware_classes = ["Agent", "AutoRun", "FraudLoad", "FraudPack", "Hupigon", "Krap",
           "Lipler", "Magania", "None", "Poison", "Swizzor", "Tdss",
           "VB", "Virut", "Zbot"]

# a function for writing predictions in the required format
def write_predictions(predictions, ids, outfile):
    """
    assumes len(predictions) == len(ids), and that predictions[i] is the
    index of the predicted class with the malware_classes list above for 
    the executable corresponding to ids[i].
    outfile will be overwritten
    """
    with open(outfile,"w+") as f:
        # write header
        f.write("Id,Prediction\n")
        for i, history_id in enumerate(ids):
            f.write("%s,%d\n" % (history_id, predictions[i]))

In [None]:
write_predictions(t_test,test_ids,"randomforrest.csv")

# Exploration

In [None]:
%matplotlib inline

In [None]:

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


sns.set(style="white")

# Generate a large random dataset

# Compute the correlation matrix
corr = train_set.corr()

# Generate a mask for the upper triangle
mask = np.zeros_like(corr, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3,
            square=True, xticklabels=5, yticklabels=5,
            linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)

In [None]:
plt.show()

In [8]:
from sklearn.decomposition import PCA

pca=PCA(n_components=20)
pca.fit_transform(train_set,t_train)
pca.explained_variance_ratio_

array([  3.91315276e-01,   2.63259786e-01,   1.45155855e-01,
         9.41070436e-02,   4.22026597e-02,   1.50369668e-02,
         1.35901802e-02,   1.08299298e-02,   9.02474844e-03,
         4.79322777e-03,   3.51131656e-03,   1.46616478e-03,
         1.30781114e-03,   1.09285478e-03,   8.78713852e-04,
         4.77860237e-04,   4.44813602e-04,   3.01532820e-04,
         2.17952089e-04,   1.66558047e-04])

In [9]:
pca_train=pca.fit_transform(train_set,t_train)
pca_test=pca.transform(test_set)

# Systemized Model Selection

In [24]:
from sklearn.cross_validation import train_test_split
itrain, itest = train_test_split(xrange(train_set.shape[0]), train_size=0.7,random_state=123)
mask=np.ones(train_set.shape[0], dtype='int')
mask[itrain]=1
mask[itest]=0
mask = (mask==1)

In [11]:
from sklearn.grid_search import GridSearchCV

"""
Function
--------
cv_optimize

Inputs
------
clf : an instance of a scikit-learn classifier
parameters: a parameter grid dictionary thats passed to GridSearchCV (see above)
X: a samples-features matrix in the scikit-learn style
y: the response vectors of 1s and 0s (+ives and -ives)
n_folds: the number of cross-validation folds (default 5)
score_func: a score function we might want to pass (default python None)
   
Returns
-------
The best estimator from the GridSearchCV, after the GridSearchCV has been used to
fit the model.
     
Notes
-----
see do_classify and the code below for an example of how this is used
"""
#your code here

def cv_optimize(clf,parameters,X,y,n_folds,score_func=None):
    if score_func:
        fitmodel = GridSearchCV(clf, param_grid=parameters, cv=n_folds, scoring=score_func)
    else:
        fitmodel = GridSearchCV(clf, param_grid=parameters, cv=n_folds)
    fitmodel.fit(X,y)
    print "BEST", fitmodel.best_params_, fitmodel.best_score_, fitmodel.grid_scores_
    best = fitmodel.best_estimator_
    return best

In [12]:
from sklearn.metrics import confusion_matrix
def do_classify(clf, parameters, train, test, mask=None, reuse_split=None, score_func=None, n_folds=5):
    
    X=train
    y=test
    if mask !=None:
        print "using mask"
        Xtrain, Xtest, ytrain, ytest = X[mask], X[~mask], y[mask], y[~mask]
    if reuse_split !=None:
        print "using reuse split"
        Xtrain, Xtest, ytrain, ytest = reuse_split['Xtrain'], reuse_split['Xtest'], reuse_split['ytrain'], reuse_split['ytest']
    if parameters:
        clf = cv_optimize(clf, parameters, Xtrain, ytrain, n_folds=n_folds, score_func=score_func)
    clf=clf.fit(Xtrain, ytrain)
    training_accuracy = clf.score(Xtrain, ytrain)
    test_accuracy = clf.score(Xtest, ytest)
    print "############# based on standard predict ################"
    print "Accuracy on training data: %0.2f" % (training_accuracy)
    print "Accuracy on test data:     %0.2f" % (test_accuracy)
    print confusion_matrix(ytest, clf.predict(Xtest))
    print "########################################################"
    return clf, Xtrain, ytrain, Xtest, ytest

In [None]:
%%time
clfrf, Xtrain, ytrain, Xtest, ytest = do_classify(RandomForestClassifier(),
                                                   {"max_depth": [10,20,50,100,200,500]}, 
                                                   train_set,t_train,mask=mask)

In [None]:
t_test=clfrf.predict(X_test)
write_predictions(t_test,test_ids,"randomforrest2.csv")

In [16]:
%%time
clfrf, Xtrain, ytrain, Xtest, ytest = do_classify(RandomForestClassifier(random_state=123),
                                                   {"max_depth": [10,20,50],
                                                    "n_estimators":[1,5,10,20,50]}, 
                                                   train_set,t_train,mask=mask)

using mask
BEST {'n_estimators': 50, 'max_depth': 20} 0.883333333333 [mean: 0.83380, std: 0.01164, params: {'n_estimators': 1, 'max_depth': 10}, mean: 0.87083, std: 0.01103, params: {'n_estimators': 5, 'max_depth': 10}, mean: 0.87454, std: 0.01100, params: {'n_estimators': 10, 'max_depth': 10}, mean: 0.87593, std: 0.01088, params: {'n_estimators': 20, 'max_depth': 10}, mean: 0.87917, std: 0.01683, params: {'n_estimators': 50, 'max_depth': 10}, mean: 0.84074, std: 0.00589, params: {'n_estimators': 1, 'max_depth': 20}, mean: 0.86620, std: 0.00570, params: {'n_estimators': 5, 'max_depth': 20}, mean: 0.87824, std: 0.01279, params: {'n_estimators': 10, 'max_depth': 20}, mean: 0.87546, std: 0.00940, params: {'n_estimators': 20, 'max_depth': 20}, mean: 0.88333, std: 0.01391, params: {'n_estimators': 50, 'max_depth': 20}, mean: 0.82870, std: 0.00936, params: {'n_estimators': 1, 'max_depth': 50}, mean: 0.86713, std: 0.00693, params: {'n_estimators': 5, 'max_depth': 50}, mean: 0.87639, std: 0.01



In [17]:
reuse_split=dict(Xtrain=Xtrain, Xtest=Xtest, ytrain=ytrain, ytest=ytest)

In [18]:
%%time
from sklearn.linear_model import LogisticRegression 

clflr, Xtrain, ytrain, Xtest, ytest = do_classify(LogisticRegression(penalty="l2",random_state=12,\
                                                                     multi_class="ovr"), \
                                                   {"C": [0.01, 0.1, 1.0, 10.0],"solver":["sag","newton-cg"]},
                                                   train_set,t_train,reuse_split=reuse_split)



using reuse split
BEST {'C': 1.0, 'solver': 'newton-cg'} 0.856018518519 [mean: 0.66389, std: 0.00949, params: {'C': 0.01, 'solver': 'sag'}, mean: 0.84954, std: 0.01880, params: {'C': 0.01, 'solver': 'newton-cg'}, mean: 0.66389, std: 0.00949, params: {'C': 0.1, 'solver': 'sag'}, mean: 0.85370, std: 0.01900, params: {'C': 0.1, 'solver': 'newton-cg'}, mean: 0.66389, std: 0.00949, params: {'C': 1.0, 'solver': 'sag'}, mean: 0.85602, std: 0.01845, params: {'C': 1.0, 'solver': 'newton-cg'}, mean: 0.66389, std: 0.00949, params: {'C': 10.0, 'solver': 'sag'}, mean: 0.85370, std: 0.02148, params: {'C': 10.0, 'solver': 'newton-cg'}]
############# based on standard predict ################
Accuracy on training data: 0.92
Accuracy on test data:     0.86
[[ 14   0   1   0   0   1   0   1  13   1   0   0   3   0   1]
 [  0   9   0   0   0   0   0   0   0   2   0   0   5   0   0]
 [  0   0   4   3   0   2   0   0   2   0   0   0   0   0   0]
 [  0   0   2   6   0   3   0   0   2   0   0   0   0   0   0

In [20]:
t_test=clflr.predict(test_set)
write_predictions(t_test,test_ids,"logistic1.csv")

#### Kaggle Score: 0.76684

In [21]:
%%time
from sklearn.linear_model import LogisticRegression 

clflr, Xtrain, ytrain, Xtest, ytest = do_classify(LogisticRegression(penalty="l2",random_state=12,\
                                                                     multi_class="multinomial"), \
                                                   {"C": [0.01, 0.1, 1.0, 10.0],"solver":["newton-cg"]},
                                                   train_set,t_train,reuse_split=reuse_split)


using reuse split
BEST {'C': 0.01, 'solver': 'newton-cg'} 0.859722222222 [mean: 0.85972, std: 0.01729, params: {'C': 0.01, 'solver': 'newton-cg'}, mean: 0.85926, std: 0.01533, params: {'C': 0.1, 'solver': 'newton-cg'}, mean: 0.85694, std: 0.01795, params: {'C': 1.0, 'solver': 'newton-cg'}, mean: 0.85509, std: 0.01898, params: {'C': 10.0, 'solver': 'newton-cg'}]
############# based on standard predict ################
Accuracy on training data: 0.91
Accuracy on test data:     0.85
[[ 12   0   0   0   0   3   0   2  17   0   1   0   0   0   0]
 [  0   8   0   0   0   0   0   0   1   1   1   0   5   0   0]
 [  0   0   4   3   0   1   0   0   3   0   0   0   0   0   0]
 [  0   0   1   7   0   3   0   0   2   0   0   0   0   0   0]
 [  0   0   0   0   4   0   1   0   4   0   0   0   4   0   0]
 [  0   0   0   0   0   5   0   0   2   0   0   0   1   0   0]
 [  0   0   0   0   0   0  10   0   2   0   0   0   0   0   0]
 [  1   0   0   0   0   0   0   4   5   0   0   0   0   0   0]
 [  5   0  

In [23]:
t_test=clflr.predict(test_set)
write_predictions(t_test,test_ids,"logistic2.csv")

In [None]:
%%time
# from sklearn.neural_network import MLPClassifier

# clfnn, Xtrain, ytrain, Xtest, ytest = do_classify(MLPClassifier(algorithm="sgd",activation="logistic",
#                                                                random_state=1),
#                                                    {"alpha": [1e-5,1e-4,1e-3,1e-2,1],
#                                                     "n_estimators":[1,5,10]}, 
#                                                    train_set,t_train,reuse_split=reuse_split)

# PCA transformation

In [None]:
from sklearn.cross_validation import train_test_split
itrain, itest = train_test_split(xrange(pca_train.shape[0]), train_size=0.7)
mask=np.ones(pca_train.shape[0], dtype='int')
mask[itrain]=1
mask[itest]=0
mask = (mask==1)

In [25]:
%%time
clfrf, Xtrain, ytrain, Xtest, ytest = do_classify(RandomForestClassifier(random_state=123),
                                                   {"max_depth": [10,20,50],
                                                    "n_estimators":[1,5,10,20,50]}, 
                                                   pca_train,t_train,mask=mask)

using mask
BEST {'n_estimators': 50, 'max_depth': 20} 0.869907407407 [mean: 0.81898, std: 0.01532, params: {'n_estimators': 1, 'max_depth': 10}, mean: 0.85324, std: 0.00982, params: {'n_estimators': 5, 'max_depth': 10}, mean: 0.85972, std: 0.00693, params: {'n_estimators': 10, 'max_depth': 10}, mean: 0.86157, std: 0.00596, params: {'n_estimators': 20, 'max_depth': 10}, mean: 0.86944, std: 0.01028, params: {'n_estimators': 50, 'max_depth': 10}, mean: 0.82269, std: 0.02251, params: {'n_estimators': 1, 'max_depth': 20}, mean: 0.84769, std: 0.01220, params: {'n_estimators': 5, 'max_depth': 20}, mean: 0.85741, std: 0.01079, params: {'n_estimators': 10, 'max_depth': 20}, mean: 0.86435, std: 0.01150, params: {'n_estimators': 20, 'max_depth': 20}, mean: 0.86991, std: 0.01455, params: {'n_estimators': 50, 'max_depth': 20}, mean: 0.82269, std: 0.02251, params: {'n_estimators': 1, 'max_depth': 50}, mean: 0.84722, std: 0.01164, params: {'n_estimators': 5, 'max_depth': 50}, mean: 0.85648, std: 0.01



In [26]:
%%time
from sklearn.linear_model import LogisticRegression 

clflr, Xtrain, ytrain, Xtest, ytest = do_classify(LogisticRegression(penalty="l2",random_state=12,\
                                                                     multi_class="ovr"), \
                                                   {"C": [0.01, 0.1, 1.0, 10.0],"solver":["sag","newton-cg"]},
                                                   pca_train,t_train,reuse_split=reuse_split)

using reuse split
BEST {'C': 1.0, 'solver': 'newton-cg'} 0.855555555556 [mean: 0.66389, std: 0.00949, params: {'C': 0.01, 'solver': 'sag'}, mean: 0.84954, std: 0.01880, params: {'C': 0.01, 'solver': 'newton-cg'}, mean: 0.66389, std: 0.00949, params: {'C': 0.1, 'solver': 'sag'}, mean: 0.85370, std: 0.01900, params: {'C': 0.1, 'solver': 'newton-cg'}, mean: 0.66389, std: 0.00949, params: {'C': 1.0, 'solver': 'sag'}, mean: 0.85556, std: 0.01823, params: {'C': 1.0, 'solver': 'newton-cg'}, mean: 0.66389, std: 0.00949, params: {'C': 10.0, 'solver': 'sag'}, mean: 0.85324, std: 0.02000, params: {'C': 10.0, 'solver': 'newton-cg'}]
############# based on standard predict ################
Accuracy on training data: 0.92
Accuracy on test data:     0.86
[[ 14   0   0   0   0   2   0   1  13   1   0   0   3   0   1]
 [  0   9   0   0   0   0   0   0   0   2   0   0   5   0   0]
 [  0   0   4   3   0   2   0   0   2   0   0   0   0   0   0]
 [  0   0   2   5   0   4   0   0   2   0   0   0   0   0   0

In [None]:
len(call_set)

In [7]:
from GaussianGenerativeModel import GaussianGenerativeModel

nb1 = GaussianGenerativeModel(isSharedCovariance=False)
nb1.fit(train_set.values,t_train)
call_list=list(call_set)
t_test=nb1.predict(test_set.values,call_list)



[8]
[10]
[14]
[8]
[13]
[10]
[5]
[9]
[12]
[1]
[3]
[9]
[9]
[10]
[10]
[3]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14]
[10]
[11]
[9]
[8]
[12]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14]
[3]
[8]
[12]
[13]
[3]
[8]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14]
[12]
[13]
[8]
[1]
[8]
[8]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14]
[12]
[9]
[12]
[9]
[8]
[14]
[9]
[10]
[10]
[6]
[11]
[8]
[8]
[10]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14]
[13]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14]
[3]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14]
[12]
[1]
[14]
[8]
[3]
[12]
[10]
[10]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14]
[9]
[8]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14]
[11]
[8]
[9]
[2]
[10]
[12]
[1]
[8]
[0]
[12]
[0]
[8]
[8]
[8]
[13]
[9]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14]
[7]
[7]
[10]
[3]
[13]
[8]
[8]
[13]
[3]
[12]
[9]
[3]
[12]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14]
[3]
[1]
[8]
[12]
[13]
[12]
[8]
[9]
[10]
[10]
[10]
[5]
[13]
[8]
[8]
[10]
[8]
[12]
[10]
[9]
[8]


In [10]:
write_predictions(t_test,test_ids,"GMM2.csv")

In [None]:
nb1 = GaussianGenerativeModel(isSharedCovariance=False)
nb1.fit(pca_train,t_train)
call_list=list(call_set)
t_test=nb1.predict(pca_test,call_list)

In [11]:
# Kaggle score 0.65158

In [None]:
%%time

from sklearn.mixture import GMM

# clflr, Xtrain, ytrain, Xtest, ytest = do_classify(GMM(covariance_type='diag',random_state=None,
#                                                       n_components=15,n_iter=100), \
#                                                   {},pca_train,t_train,reuse_split=reuse_split)

clfgmm=GMM(n_components=15)
clfgmm.fit(pca_train[:2000],y=t_train[:2000])
print np.average(clfgmm.score(pca_train[2000:],y=t_train[2000:]))

clfgmm.fit(pca_train,y=t_train)
t_test=clfgmm.predict(pca_test)
write_predictions(t_test,test_ids,"GMM.csv")