In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the feature table and discard the 'IMF Spot' column in one chained step.
# NOTE(review): the df.head() output below shows a leading 'Unnamed: 0' header;
# if that is a real CSV column (a saved index) rather than the rendered index,
# it is carried into the feature matrix -- confirm.
frame = pd.read_csv('imputed_interpolations.csv').drop(columns='IMF Spot')

# Every object-typed column is converted to a pandas categorical; all other
# columns keep their dtype.
text_columns = [name for name in frame.columns if frame[name].dtype == 'object']
df = frame.astype({name: 'category' for name in text_columns})

# Palette and label names; neither is used by the cells visible in this section.
colors = ['r', 'g', 'b', 'black', 'purple', 'orange', 'yellow']

# Independent working copy, plus the distinct class labels as plain strings.
to_work = df.copy()
interpolations = to_work['Interpolation Method'].unique().astype(str)

# Data without categorical features and without normalizing

In [3]:
df.head()

Unnamed: 0,Mean Frequency,Variance Frequency,Rows,Cols,Median Frequency,Skewness Frequency,Kurtosis Frequency,Mean Color,Variance Color,Median Color,Skewness Color,Kurtosis Color,Interpolation Method
0,65818.750159,64236760000.0,480.0,640.0,20776.810283,1.935814e+17,7.920839e+23,19.100767,222841.121472,-0.016074,798554700.0,9956162000000.0,Bicubic
1,87890.692769,113236700000.0,480.0,640.0,27737.539819,4.293371e+17,2.131216e+24,-27.426445,393002.70896,-0.005735,-1713181000.0,26839470000000.0,Lanczos4
2,47013.060751,29677970000.0,480.0,640.0,15362.985279,5.294232e+16,1.195883e+23,-4.069003,103786.167805,-0.037379,72072960.0,1569154000000.0,Lanczos4
3,51967.313185,38667420000.0,480.0,640.0,16893.375866,8.079948e+16,2.138331e+23,-0.529587,134661.234589,-0.025672,-132843900.0,3802050000000.0,Bicubic
4,33656.254725,15467720000.0,480.0,640.0,10946.170528,1.98606e+16,3.28482e+22,13.369099,53859.235616,0.72854,246869900.0,2023371000000.0,Bicubic


# Defining Models + Train-Test Splitting

In [4]:
# Separate the class labels from the feature matrix.
# Both arrays are derived from `df` (of which `to_work` was an unmodified copy)
# rather than from `to_work` itself. The earlier version replaced the `to_work`
# DataFrame with an ndarray and then failed when the cell was run a second
# time, because an ndarray cannot be indexed by column name.
target = df['Interpolation Method'].to_numpy()
to_work = df.drop(columns='Interpolation Method').to_numpy()

# Features are deliberately left unscaled in this section. If scaling is ever
# enabled, apply it to the features only: `target` holds class-label strings
# and cannot be min-max scaled.

In [5]:
# Hold out a test set using scikit-learn's default split proportion.
# The shuffle is seeded so the same rows land in train/test on every
# Restart & Run All; 42 matches the seed used for the tuned forest below.
x_train, x_test, y_train, y_test = train_test_split(to_work, target, random_state=42)

# Defining and training the models

In [6]:
# Baseline random forest with default hyper-parameters.
# Bootstrapping and feature sub-sampling are random, so the estimator is seeded
# to give the same fitted model for a given training split.
random_forest = RandomForestClassifier(random_state=42)
random_forest.fit(x_train, y_train)

RandomForestClassifier()

In [7]:
# k-nearest-neighbours baseline with scikit-learn's default settings.
# `fit` returns the estimator itself, so construction and training are chained.
# Features are unscaled here, as stated in the section heading above.
knn = KNeighborsClassifier().fit(x_train, y_train)
knn

KNeighborsClassifier()

In [8]:
# Single decision-tree baseline with default hyper-parameters.
# The tree breaks ties between equally good splits at random, so it is seeded
# to give the same fitted model for a given training split.
# The variable name (sic) is kept because the scoring cell refers to it.
desicion_tree = DecisionTreeClassifier(random_state=42)
desicion_tree.fit(x_train, y_train)

DecisionTreeClassifier()

# Testing the models

In [9]:
# Mean accuracy of the k-NN baseline on the held-out split.
knn.score(X=x_test, y=y_test)

0.7945205479452054

In [10]:
# Mean accuracy of the decision-tree baseline on the held-out split.
desicion_tree.score(X=x_test, y=y_test)

0.8697469236127234

In [11]:
# Mean accuracy of the baseline random forest on the held-out split.
random_forest.score(X=x_test, y=y_test)

0.9022521476665892

It's clear that the Random Forest model achieves the best accuracy of the three models, and the results themselves are great.

In [12]:
# Saving the model for further use
#import pickle
#filename = 'random_forest_model.pkl'
#pickle.dump(random_forest, open(filename, 'wb'))

Let's try to optimize our results:

In [13]:
# Refit the forest with 110 trees and the entropy split criterion, seeded for
# repeatability. This rebinds `random_forest`, replacing the baseline model.
random_forest = RandomForestClassifier(
    n_estimators=110,
    criterion='entropy',
    random_state=42,
)
random_forest.fit(x_train, y_train)

RandomForestClassifier(criterion='entropy', n_estimators=110, random_state=42)

In [14]:
# Accuracy computed by hand: the mean of the element-wise match between
# predicted and true labels is the fraction of correct predictions.
predictions = random_forest.predict(x_test)
(predictions == y_test).mean()

0.9045739493847226