In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [2]:
# Read training set file, then do data cleaning
df = pd.read_csv('TrainOnMe-6.csv')

# delete all N/A rows but keep those in x6
df.dropna(subset=['x1'], inplace=True)
df.dropna(subset=['x2'], inplace=True)
df.dropna(subset=['x3'], inplace=True)
df.dropna(subset=['x4'], inplace=True)
df.dropna(subset=['x5'], inplace=True)
df.dropna(subset=['x7'], inplace=True)
df.dropna(subset=['x8'], inplace=True)
df.dropna(subset=['x9'], inplace=True)
df.dropna(subset=['x10'], inplace=True)
df.dropna(subset=['x11'], inplace=True)
df.dropna(subset=['x12'], inplace=True)
df.dropna(subset=['x13'], inplace=True)

# correct typo in x6
df.loc[df['x6'] == 'Ostra stationen', 'x6'] = 'Östra stationen'

# apply one-hot encoding for x6, encode N/A as well
one_hot_encoded_x6 = pd.get_dummies(df['x6'], prefix='x6', dummy_na=True)
df = pd.concat([df.drop('x6', axis=1), one_hot_encoded_x6], axis=1)

# unify true and false expression
df.loc[df['x11'] == 'Tru', 'x11'] = 'True'
df.loc[df['x11'] == 'F', 'x11'] = 'False'
df.loc[df['x12'] == 'Flase', 'x12'] = 'False'
df.loc[df['x12'] == 'F', 'x12'] = 'False'

# encode true and false
df['x11'] = df['x11'].replace({'True': 1, 'False': 0}).astype(float)
df['x12'] = df['x12'].replace({'True': 1, 'False': 0}).astype(float)

# delete a row with x1 = "?"
df = df[df.iloc[:, 1] != '?']

# encode x5
df.loc[df['x5'] == '0.00000', 'x5'] = 1.0
df.loc[df['x5'] == '0.00001', 'x5'] = 1.0
df.loc[df['x5'] == '-0.00000', 'x5'] = 0.0
df.loc[df['x5'] == '-0.00001', 'x5'] = 0.0

# change all types to numerical types
df['x1'] = df['x1'].astype("float")
df['x2'] = df['x2'].astype("float")
df['x3'] = df['x3'].astype("float")
df['x4'] = df['x4'].astype("float")
df['x5'] = df['x5'].astype("float")

# standardization for x3
standard_scaler_x3 = StandardScaler()
standard_scaler_x3.fit(df[['x3']])
df['x3'] = standard_scaler_x3.transform(df[['x3']])

# standardization for x9
standard_scaler_x9 = StandardScaler()
standard_scaler_x9.fit(df[['x9']])
df['x9'] = standard_scaler_x9.transform(df[['x9']])

# standardization for x10
standard_scaler_x10 = StandardScaler()
standard_scaler_x10.fit(df[['x10']])
df['x10'] = standard_scaler_x10.transform(df[['x10']])

In [3]:
df

Unnamed: 0,y,x1,x2,x3,x4,x5,x7,x8,x9,x10,x11,x12,x13,x6_Brinnelvägen 8,x6_Entrée,x6_KTH Biblioteket,x6_Lindstedsvägen 24,x6_Slussen,x6_Östra stationen,x6_nan
0,Dragspel,1.42038,735.17784,1.661789,-141.68616,1.0,0.95499,-1.69662,-0.595352,-1.444570,1.0,0.0,-2.84076,0,0,0,0,1,0,0
1,Serpent,-0.30564,-1172.52474,0.119391,30.72446,0.0,-2.29523,-0.28491,0.370825,0.325214,0.0,0.0,0.61127,1,0,0,0,0,0,0
2,Dragspel,1.41124,-2270.39585,-1.192848,-140.91191,0.0,0.90352,-0.55290,0.250363,-1.192266,0.0,0.0,-2.82248,0,0,1,0,0,0,0
3,Dragspel,0.40752,-1691.50839,0.088214,-40.61188,0.0,3.56674,-1.05227,0.147193,0.511700,0.0,0.0,-0.81505,0,0,0,0,0,1,0
4,Serpent,-0.33494,188.67147,0.140014,33.90150,1.0,1.33713,-0.85736,-0.143364,0.407697,0.0,0.0,0.66987,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1000,Dragspel,0.86303,-1865.67818,0.360426,-85.75314,0.0,2.42699,-0.02280,-0.311290,0.499998,0.0,0.0,-1.72606,0,0,0,0,0,1,0
1001,Nyckelharpa,2.17891,-727.50379,-0.572578,-217.70070,0.0,1.18831,-1.64841,-0.638917,0.719286,1.0,0.0,-4.35782,0,1,0,0,0,0,0
1002,Dragspel,-0.09707,-1224.01073,-0.816915,10.03523,0.0,0.26424,-0.03609,-0.296906,-1.607816,0.0,0.0,0.19413,1,0,0,0,0,0,0
1003,Serpent,1.56273,-759.44721,-0.405537,-155.92072,0.0,1.35845,-0.98182,0.050071,2.233647,0.0,0.0,-3.12545,0,1,0,0,0,0,0


In [4]:
X_train = df.iloc[:, 1:]
y_train = df.iloc[:, 0]

In [5]:
# Read evaluation set file, then do data cleaning
dfe = pd.read_csv('EvaluateOnMe-6.csv')
dfe

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13
0,-0.19528,1813.16939,1000.47742,19.92780,0.0,KTH Biblioteket,-2.11119,1.07467,5475.23204,-89185.68603,True,False,0.39055
1,0.96711,1039.24998,1000.59127,-96.24958,0.0,Brinnelvägen 8,0.29538,0.28703,5474.36969,-89187.04831,True,False,-1.93421
2,-0.36744,791.84564,1000.87490,36.95219,0.0,Entrée,-2.22641,2.15474,5473.33639,-89184.93092,False,False,0.73489
3,-1.72245,-92.98303,1000.48905,172.32768,-0.0,,-3.72120,2.82907,5473.57009,-89186.46529,False,False,3.44490
4,2.33596,-1265.86780,1000.51694,-233.36179,-0.0,Lindstedsvägen 24,4.64681,-3.01512,5475.74198,-89186.14419,True,False,-4.67193
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,2.00175,808.61995,1000.46742,-200.02277,0.0,Entrée,1.82118,-3.51946,5476.95866,-89185.86543,True,False,-4.00351
9996,0.87403,312.75680,1000.24830,-87.29552,0.0,Slussen,0.42565,-1.09671,5475.10608,-89187.62275,True,False,-1.74806
9997,2.82473,-460.43325,1000.50988,-282.11081,-0.0,Slussen,2.96395,-2.91809,5472.56315,-89184.80599,True,False,-5.64946
9998,2.10735,745.59292,1000.44494,-210.47444,0.0,Slussen,3.32706,-1.74554,5475.45232,-89184.43963,True,False,-4.21470


In [6]:
dfe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 13 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   x1      10000 non-null  float64
 1   x2      10000 non-null  float64
 2   x3      10000 non-null  float64
 3   x4      10000 non-null  float64
 4   x5      10000 non-null  float64
 5   x6      7716 non-null   object 
 6   x7      10000 non-null  float64
 7   x8      10000 non-null  float64
 8   x9      10000 non-null  float64
 9   x10     10000 non-null  float64
 10  x11     10000 non-null  bool   
 11  x12     10000 non-null  bool   
 12  x13     10000 non-null  float64
dtypes: bool(2), float64(10), object(1)
memory usage: 879.0+ KB


In [7]:
dfe['x6'].unique()

array(['KTH Biblioteket', 'Brinnelvägen 8', 'Entrée', nan,
       'Lindstedsvägen 24', 'Slussen', 'Östra stationen'], dtype=object)

In [8]:
# apply one-hot encoding for x6, encode N/A as well
one_hot_encoded_x6e = pd.get_dummies(dfe['x6'], prefix='x6', dummy_na=True)
dfe = pd.concat([dfe.drop('x6', axis=1), one_hot_encoded_x6e], axis=1)

In [9]:
dfe

Unnamed: 0,x1,x2,x3,x4,x5,x7,x8,x9,x10,x11,x12,x13,x6_Brinnelvägen 8,x6_Entrée,x6_KTH Biblioteket,x6_Lindstedsvägen 24,x6_Slussen,x6_Östra stationen,x6_nan
0,-0.19528,1813.16939,1000.47742,19.92780,0.0,-2.11119,1.07467,5475.23204,-89185.68603,True,False,0.39055,0,0,1,0,0,0,0
1,0.96711,1039.24998,1000.59127,-96.24958,0.0,0.29538,0.28703,5474.36969,-89187.04831,True,False,-1.93421,1,0,0,0,0,0,0
2,-0.36744,791.84564,1000.87490,36.95219,0.0,-2.22641,2.15474,5473.33639,-89184.93092,False,False,0.73489,0,1,0,0,0,0,0
3,-1.72245,-92.98303,1000.48905,172.32768,-0.0,-3.72120,2.82907,5473.57009,-89186.46529,False,False,3.44490,0,0,0,0,0,0,1
4,2.33596,-1265.86780,1000.51694,-233.36179,-0.0,4.64681,-3.01512,5475.74198,-89186.14419,True,False,-4.67193,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,2.00175,808.61995,1000.46742,-200.02277,0.0,1.82118,-3.51946,5476.95866,-89185.86543,True,False,-4.00351,0,1,0,0,0,0,0
9996,0.87403,312.75680,1000.24830,-87.29552,0.0,0.42565,-1.09671,5475.10608,-89187.62275,True,False,-1.74806,0,0,0,0,1,0,0
9997,2.82473,-460.43325,1000.50988,-282.11081,-0.0,2.96395,-2.91809,5472.56315,-89184.80599,True,False,-5.64946,0,0,0,0,1,0,0
9998,2.10735,745.59292,1000.44494,-210.47444,0.0,3.32706,-1.74554,5475.45232,-89184.43963,True,False,-4.21470,0,0,0,0,1,0,0


In [10]:
# encode true and false
dfe['x11'] = dfe['x11'].replace({True: 1, False: 0}).astype(float)
dfe['x12'] = dfe['x12'].replace({True: 1, False: 0}).astype(float)

In [11]:
# change dfe x5 to string
dfe['x5'] = dfe['x5'].astype(str)

In [12]:
dfe['x5'].unique()

array(['0.0', '-0.0', '-1e-05', '1e-05'], dtype=object)

In [13]:
# encode x5
dfe.loc[dfe['x5'] == '0.0', 'x5'] = 1.0
dfe.loc[dfe['x5'] == '1e-05', 'x5'] = 1.0
dfe.loc[dfe['x5'] == '-0.0', 'x5'] = 0.0
dfe.loc[dfe['x5'] == '-1e-05', 'x5'] = 0.0

In [14]:
# change all types to numerical types
dfe['x5'] = dfe['x5'].astype("float")

In [15]:
dfe

Unnamed: 0,x1,x2,x3,x4,x5,x7,x8,x9,x10,x11,x12,x13,x6_Brinnelvägen 8,x6_Entrée,x6_KTH Biblioteket,x6_Lindstedsvägen 24,x6_Slussen,x6_Östra stationen,x6_nan
0,-0.19528,1813.16939,1000.47742,19.92780,1.0,-2.11119,1.07467,5475.23204,-89185.68603,1.0,0.0,0.39055,0,0,1,0,0,0,0
1,0.96711,1039.24998,1000.59127,-96.24958,1.0,0.29538,0.28703,5474.36969,-89187.04831,1.0,0.0,-1.93421,1,0,0,0,0,0,0
2,-0.36744,791.84564,1000.87490,36.95219,1.0,-2.22641,2.15474,5473.33639,-89184.93092,0.0,0.0,0.73489,0,1,0,0,0,0,0
3,-1.72245,-92.98303,1000.48905,172.32768,0.0,-3.72120,2.82907,5473.57009,-89186.46529,0.0,0.0,3.44490,0,0,0,0,0,0,1
4,2.33596,-1265.86780,1000.51694,-233.36179,0.0,4.64681,-3.01512,5475.74198,-89186.14419,1.0,0.0,-4.67193,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,2.00175,808.61995,1000.46742,-200.02277,1.0,1.82118,-3.51946,5476.95866,-89185.86543,1.0,0.0,-4.00351,0,1,0,0,0,0,0
9996,0.87403,312.75680,1000.24830,-87.29552,1.0,0.42565,-1.09671,5475.10608,-89187.62275,1.0,0.0,-1.74806,0,0,0,0,1,0,0
9997,2.82473,-460.43325,1000.50988,-282.11081,0.0,2.96395,-2.91809,5472.56315,-89184.80599,1.0,0.0,-5.64946,0,0,0,0,1,0,0
9998,2.10735,745.59292,1000.44494,-210.47444,1.0,3.32706,-1.74554,5475.45232,-89184.43963,1.0,0.0,-4.21470,0,0,0,0,1,0,0


In [16]:
dfe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   x1                    10000 non-null  float64
 1   x2                    10000 non-null  float64
 2   x3                    10000 non-null  float64
 3   x4                    10000 non-null  float64
 4   x5                    10000 non-null  float64
 5   x7                    10000 non-null  float64
 6   x8                    10000 non-null  float64
 7   x9                    10000 non-null  float64
 8   x10                   10000 non-null  float64
 9   x11                   10000 non-null  float64
 10  x12                   10000 non-null  float64
 11  x13                   10000 non-null  float64
 12  x6_Brinnelvägen 8     10000 non-null  uint8  
 13  x6_Entrée             10000 non-null  uint8  
 14  x6_KTH Biblioteket    10000 non-null  uint8  
 15  x6_Lindstedsvägen 24

In [17]:
# standardization for x3
standard_scaler_x3e = StandardScaler()
standard_scaler_x3e.fit(dfe[['x3']])
dfe['x3'] = standard_scaler_x3e.transform(dfe[['x3']])

In [18]:
# standardization for x9
standard_scaler_x9e = StandardScaler()
standard_scaler_x9e.fit(dfe[['x9']])
dfe['x9'] = standard_scaler_x9e.transform(dfe[['x9']])

In [19]:
# standardization for x10
standard_scaler_x10e = StandardScaler()
standard_scaler_x10e.fit(dfe[['x10']])
dfe['x10'] = standard_scaler_x10e.transform(dfe[['x10']])

In [20]:
dfe

Unnamed: 0,x1,x2,x3,x4,x5,x7,x8,x9,x10,x11,x12,x13,x6_Brinnelvägen 8,x6_Entrée,x6_KTH Biblioteket,x6_Lindstedsvägen 24,x6_Slussen,x6_Östra stationen,x6_nan
0,-0.19528,1813.16939,-0.504709,19.92780,1.0,-2.11119,1.07467,0.188911,0.209360,1.0,0.0,0.39055,0,0,1,0,0,0,0
1,0.96711,1039.24998,-0.042720,-96.24958,1.0,0.29538,0.28703,-0.255426,-0.845276,1.0,0.0,-1.93421,1,0,0,0,0,0,0
2,-0.36744,791.84564,1.108216,36.95219,1.0,-2.22641,2.15474,-0.787847,0.793943,0.0,0.0,0.73489,0,1,0,0,0,0,0
3,-1.72245,-92.98303,-0.457516,172.32768,0.0,-3.72120,2.82907,-0.667430,-0.393920,0.0,0.0,3.44490,0,0,0,0,0,0,1
4,2.33596,-1265.86780,-0.344342,-233.36179,0.0,4.64681,-3.01512,0.451665,-0.145334,1.0,0.0,-4.67193,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,2.00175,808.61995,-0.545288,-200.02277,1.0,1.82118,-3.51946,1.078575,0.070474,1.0,0.0,-4.00351,0,1,0,0,0,0,0
9996,0.87403,312.75680,-1.434450,-87.29552,1.0,0.42565,-1.09671,0.124009,-1.289991,1.0,0.0,-1.74806,0,0,0,0,1,0,0
9997,2.82473,-460.43325,-0.372991,-282.11081,0.0,2.96395,-2.91809,-1.186269,0.890660,1.0,0.0,-5.64946,0,0,0,0,1,0,0
9998,2.10735,745.59292,-0.636509,-210.47444,1.0,3.32706,-1.74554,0.302414,1.174285,1.0,0.0,-4.21470,0,0,0,0,1,0,0


In [21]:
X_eval = dfe

In [22]:
X_eval

Unnamed: 0,x1,x2,x3,x4,x5,x7,x8,x9,x10,x11,x12,x13,x6_Brinnelvägen 8,x6_Entrée,x6_KTH Biblioteket,x6_Lindstedsvägen 24,x6_Slussen,x6_Östra stationen,x6_nan
0,-0.19528,1813.16939,-0.504709,19.92780,1.0,-2.11119,1.07467,0.188911,0.209360,1.0,0.0,0.39055,0,0,1,0,0,0,0
1,0.96711,1039.24998,-0.042720,-96.24958,1.0,0.29538,0.28703,-0.255426,-0.845276,1.0,0.0,-1.93421,1,0,0,0,0,0,0
2,-0.36744,791.84564,1.108216,36.95219,1.0,-2.22641,2.15474,-0.787847,0.793943,0.0,0.0,0.73489,0,1,0,0,0,0,0
3,-1.72245,-92.98303,-0.457516,172.32768,0.0,-3.72120,2.82907,-0.667430,-0.393920,0.0,0.0,3.44490,0,0,0,0,0,0,1
4,2.33596,-1265.86780,-0.344342,-233.36179,0.0,4.64681,-3.01512,0.451665,-0.145334,1.0,0.0,-4.67193,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,2.00175,808.61995,-0.545288,-200.02277,1.0,1.82118,-3.51946,1.078575,0.070474,1.0,0.0,-4.00351,0,1,0,0,0,0,0
9996,0.87403,312.75680,-1.434450,-87.29552,1.0,0.42565,-1.09671,0.124009,-1.289991,1.0,0.0,-1.74806,0,0,0,0,1,0,0
9997,2.82473,-460.43325,-0.372991,-282.11081,0.0,2.96395,-2.91809,-1.186269,0.890660,1.0,0.0,-5.64946,0,0,0,0,1,0,0
9998,2.10735,745.59292,-0.636509,-210.47444,1.0,3.32706,-1.74554,0.302414,1.174285,1.0,0.0,-4.21470,0,0,0,0,1,0,0


In [23]:
# build classifier
dt = DecisionTreeClassifier(max_depth=5, random_state=42)
classifier = AdaBoostClassifier(base_estimator=dt, n_estimators=700, random_state=42)

In [24]:
classifier.fit(X_train, y_train)
y_eval = classifier.predict(X_eval)

In [25]:
y_eval

array(['Dragspel', 'Nyckelharpa', 'Dragspel', ..., 'Dragspel', 'Serpent',
       'Dragspel'], dtype=object)

In [26]:
with open('evaluation_prediction.txt', 'w') as f:
    for element in y_eval:
        f.write(element+'\n')