In [20]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from itertools import product

# Load the training set from disk.
df = pd.read_csv("data/npf_train.csv")

# Turn the "class4" labels into integer codes stored in a new "event" column;
# the array of unique labels returned by factorize is discarded.
df["event"], _ = pd.factorize(df["class4"])
df = df.drop(columns=["class4", "partlybad"])
y = df["event"].to_numpy()

# Remove columns that we don't want to use as features.
remove = ["id", "date", "event"]
columns = [name for name in df.columns.values.tolist() if name not in remove]

# Force every remaining column to a numeric dtype; values that cannot be
# parsed become NaN because of errors='coerce'.
df = df.loc[:, columns].apply(pd.to_numeric, errors='coerce')

def pairs(row):
    """Return the products of all ordered pairs of entries in ``row``.

    The pairs are enumerated with ``itertools.product(row, repeat=2)``, so for
    a row of length ``n`` the result has ``n * n`` entries and the product of
    entries ``i`` and ``j`` sits at position ``i * n + j``. Both ``(i, j)`` and
    ``(j, i)`` are included, as is every entry multiplied by itself.

    Parameters
    ----------
    row : iterable of numbers
        The feature values of a single sample.

    Returns
    -------
    list
        The pairwise products, in the order described above.
    """
    return [left * right for left, right in product(row, repeat=2)]


# One column per ordered feature pair. The width is derived from the data
# rather than hard-coded to 10000, so this keeps working when the number of
# feature columns is not exactly 100.
n_pair_features = df.shape[1] ** 2

# Build the frame in one go from the per-row product lists. This avoids
# pre-allocating an object-dtype frame and filling it row by row, and it no
# longer uses the index labels of ``df`` as positional row numbers.
df_pairs = pd.DataFrame(
    [pairs(values) for values in df.to_numpy()],
    columns=np.arange(n_pair_features),
    index=np.arange(df.shape[0]),
)



# Sanity check on the first sample: print the number of original features and
# the number of pairwise products generated from them.
row = pairs(df.iloc[0].values)
print(len(df.iloc[0]))
print(len(row))
df_pairs

# Min-max scaling followed by a multinomial naive Bayes classifier.
model = Pipeline(steps=[
    ("minmax", MinMaxScaler()),
    ("model", MultinomialNB()),
])

X = df_pairs.to_numpy()

# Hold out 20% of the samples as a test set, then keep 75% of the remainder
# for training and 25% for validation (60/20/20 overall). X and y are rebound
# to the non-test portion by the first split.
X, X_test, y, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)
X_train, X_validate, y_train, y_validate = train_test_split(
    X, y, random_state=42, train_size=0.75
)

# Fit on the training part and predict the validation part.
y_pred = model.fit(X_train, y_train).predict(X_validate)

print("accuracy: ", accuracy_score(y_validate, y_pred))
# cross_validate re-fits the pipeline on 3 folds of the training part.
print("cross validate score: ", cross_validate(model, X_train, y_train, cv=3))

100
10000


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
0,135992.574485,114.433196,135953.465274,112.52205,136213.64342,123.024611,135935.172256,111.503425,15359.380664,11042.012908,...,-0.000051,0.000009,-0.000031,0.000009,0.000086,0.000045,0.000001,0.000001,0.0,0.0
1,143033.194007,378.761993,142990.001739,387.83089,143212.465725,384.705351,142969.737461,386.615193,1315.688674,819.064169,...,0.00139,0.000295,0.001911,0.000255,0.000278,0.000167,0.000005,0.000003,0.000003,0.000001
2,139161.197652,279.73194,139119.009416,271.728619,139357.633675,311.580732,139096.552875,258.454748,111149.72469,70262.060946,...,-0.000052,0.000389,0.000149,0.000445,0.002753,0.001645,0.000067,0.000052,0.0,0.0
3,141107.677624,209.845225,141072.521762,203.209454,141258.997972,218.147133,141056.212198,199.079712,121827.937545,72576.965201,...,0.002618,0.001169,0.003043,0.001255,0.007487,0.00465,0.000265,0.00021,0.000001,0.0
4,142627.85381,154.244864,142608.421433,160.114172,142800.106457,157.576428,142598.098698,160.301997,63250.655605,51095.6092,...,0.000774,0.000976,0.001449,0.001096,0.006592,0.004789,0.00023,0.000198,0.000003,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
459,145173.667289,1680.881707,145179.085643,1547.733499,145891.928797,1928.702544,145194.796393,1463.343809,46189.799435,50093.894026,...,0.006936,0.000463,0.007339,0.000556,0.003693,0.003536,0.000197,0.000211,0.000001,0.0
460,147224.26701,3230.105706,147360.282586,3119.615016,148275.083588,3799.201133,147380.603442,3051.526749,127673.43905,93515.926801,...,0.014277,0.002589,0.014757,0.002757,0.016673,0.011586,0.000798,0.000656,0.000002,0.000001
461,143852.65657,4568.714976,143884.500744,4374.359878,144607.218394,5326.409434,143808.687682,4146.641775,43351.157341,37769.985925,...,0.006591,0.000776,0.006899,0.000798,0.003702,0.002675,0.000191,0.000166,0.000002,0.0
462,147797.003364,2465.552183,147766.49141,2222.483386,148543.210051,2946.074987,147698.252643,2116.846954,69374.635449,69791.164901,...,0.01629,0.001049,0.017401,0.001296,0.012626,0.010892,0.000674,0.000644,0.000005,0.000001
