In [1]:
import numpy as np
import pandas as pd
import sklearn
import csv
import matplotlib.pyplot as plt
from sklearn import metrics

In [2]:
# Load the gzip-compressed CSV used for feature selection; the first CSV
# column becomes the row index.
feature_selection = pd.read_csv(
    'Data/feature_selection_incl_features.csv',
    compression='gzip',
    index_col=0,
)
feature_selection.head(n=2)  # preview the first two rows

Unnamed: 0,word,garbage_score,word_length,vowels,consonants,vowelandconsonant,punctuation,dutch_char,upper,lower,...,word_numbers_ratio,word_alpha_ratio,word_accents_ratio,word_other_characters_ratio,word_vowels_consonant_ratio,word_consonant_vowels_ratio,max_count_same_char,max_count_strip_same_char,max_consecutive_consonants,max_consecutive_vowels
0,fttïsietm,1,9,3,6,9,0,9,0,9,...,0.0,1.0,0.111111,0.0,0.5,2.0,2,2,3,2
1,jjatcgfcs»,1,10,1,8,9,1,9,0,9,...,0.0,0.9,0.0,0.0,0.125,8.0,2,2,6,1


In [3]:
# Load the gzip-compressed training CSV; the first CSV column becomes the
# row index.
train = pd.read_csv(
    'Data/train_incl_feat.csv',
    compression='gzip',
    index_col=0,
)
train.head(n=2)  # preview the first two rows

Unnamed: 0,word,garbage_score,word_length,vowels,consonants,vowelandconsonant,punctuation,dutch_char,upper,lower,...,word_numbers_ratio,word_alpha_ratio,word_accents_ratio,word_other_characters_ratio,word_vowels_consonant_ratio,word_consonant_vowels_ratio,max_count_same_char,max_count_strip_same_char,max_consecutive_consonants,max_consecutive_vowels
0,Ba)cile)'t,1,10,3,4,7,3,8,0,6,...,0.0,0.7,0.0,0.0,0.75,1.333333,1,1,1,1
1,téycltènéoi,1,11,6,5,11,0,11,0,11,...,0.0,1.0,0.272727,0.0,1.2,0.833333,1,1,3,3


In [4]:
# Load the gzip-compressed test CSV; the first CSV column becomes the row
# index.
test = pd.read_csv(
    'Data/test_incl_features.csv',
    compression='gzip',
    index_col=0,
)
test.head(n=2)  # preview the first two rows

Unnamed: 0,word,garbage_score,word_length,vowels,consonants,vowelandconsonant,punctuation,dutch_char,upper,lower,...,word_numbers_ratio,word_alpha_ratio,word_accents_ratio,word_other_characters_ratio,word_vowels_consonant_ratio,word_consonant_vowels_ratio,max_count_same_char,max_count_strip_same_char,max_consecutive_consonants,max_consecutive_vowels
0,"Vcêlét:evangeaeß,.gtrtpeuflcert",1,31,10,17,27,3,27,0,27,...,0.0,0.903226,0.064516,0.0,0.588235,1.7,1,1,5,3
1,heeftalorrigëaffigeërt,1,22,10,12,22,0,22,0,22,...,0.0,1.0,0.090909,0.0,0.833333,1.2,2,2,2,2


In [5]:
# Candidate predictor columns handed to the feature-selection step.
# The order is significant: later cells map importance arrays and boolean
# masks back onto these names by position.
features = [
    # word size
    "word_length",
    # share of the word made up by each character class
    "word_vowel_ratio",
    "word_consonant_ratio",
    "word_vowelandconsonant_ratio",
    "word_punctuation_ratio",
    "word_dutch_char_ratio",
    "word_uppercase_ratio",
    "word_lowercase_ratio",
    "word_numbers_ratio",
    "word_other_characters_ratio",
    # vowel/consonant balance
    "word_vowels_consonant_ratio",
    "word_consonant_vowels_ratio",
    # repetition and run-length counts
    "max_count_same_char",
    "max_count_strip_same_char",
    "max_consecutive_consonants",
    "max_consecutive_vowels",
    # accented characters
    "word_accents_ratio",
]

In [6]:
# Split the feature-selection frame into the label column and the
# candidate predictor columns.
y_feature_selection = feature_selection['garbage_score']  # Labels
X_feature_selection = feature_selection[features]  # Features

### Select features with Boruta

In [8]:
# Build "shadow" features: every real column is shuffled independently,
# which keeps its value distribution but breaks any link with the label.
np.random.seed(42)  # fixed seed so the permutations are repeatable
X_shadow = X_feature_selection.apply(np.random.permutation).add_prefix('shadow_')

# Real columns first, shadow columns after them, side by side.
X_boruta = pd.concat([X_feature_selection, X_shadow], axis=1)

In [9]:
# Preview the first two rows of the combined real + shadow feature matrix.
X_boruta.head(2)

Unnamed: 0,word_length,word_vowel_ratio,word_consonant_ratio,word_vowelandconsonant_ratio,word_punctuation_ratio,word_dutch_char_ratio,word_uppercase_ratio,word_lowercase_ratio,word_numbers_ratio,word_other_characters_ratio,...,shadow_word_lowercase_ratio,shadow_word_numbers_ratio,shadow_word_other_characters_ratio,shadow_word_vowels_consonant_ratio,shadow_word_consonant_vowels_ratio,shadow_max_count_same_char,shadow_max_count_strip_same_char,shadow_max_consecutive_consonants,shadow_max_consecutive_vowels,shadow_word_accents_ratio
0,9,0.333333,0.666667,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,2.0,1,1,2,2,0.0
1,10,0.1,0.8,0.9,0.1,0.9,0.0,0.9,0.0,0.0,...,0.6,0.0,0.0,0.25,1.5,1,2,2,1,0.0


In [10]:
from sklearn.ensemble import RandomForestRegressor

# One manual Boruta-style trial: fit a shallow forest on the real + shadow
# columns (suggested max_depth between 3 and 7).
forest = RandomForestRegressor(max_depth=5, random_state=42)
forest.fit(X_boruta, y_feature_selection)

# X_boruta lists the real columns first and the shadow columns after them,
# so splitting the importances at the number of real columns separates them.
n_real_features = len(X_feature_selection.columns)
feat_imp_X, feat_imp_shadow = np.split(forest.feature_importances_, [n_real_features])

# A real feature scores a "hit" when it beats the best shadow feature.
hits = feat_imp_X > feat_imp_shadow.max()

In [11]:
from boruta import BorutaPy
from sklearn.ensemble import RandomForestRegressor
import numpy as np

# Base estimator for Boruta: a shallow random forest using all CPU cores.
# NOTE(review): garbage_score is used as a classification target further down
# (LinearSVC, RandomForestClassifier), yet a regressor drives the selection
# here -- confirm this is intended.
forest = RandomForestRegressor(
    n_jobs=-1,
    max_depth=5,
    random_state=42,
)

# random_state is pinned on both objects so that the selected feature set is
# the same on every run; without it the green/red areas change between runs.
boruta = BorutaPy(
    estimator=forest,
    n_estimators='auto',
    max_iter=100,  # number of trials to perform
    random_state=42,
)

# Fit Boruta (it accepts np.array, not pd.DataFrame).
boruta.fit(np.array(X_feature_selection), np.array(y_feature_selection))

# Map Boruta's boolean masks back onto the column names:
#   support_       -> green area
#   support_weak_  -> blue area
#   neither        -> red area
green_area = X_feature_selection.columns[boruta.support_].to_list()
blue_area = X_feature_selection.columns[boruta.support_weak_].to_list()
red_area = X_feature_selection.columns[~(boruta.support_ | boruta.support_weak_)]
print('features in the green area:', green_area)
print('features in the blue area:', blue_area)
# red_area stays a pandas Index; it is printed as a plain list so all three
# lines share one format.
print('features in the red area:', red_area.to_list())

features in the green area: ['word_length', 'word_vowel_ratio', 'word_vowelandconsonant_ratio', 'word_punctuation_ratio', 'word_dutch_char_ratio', 'word_uppercase_ratio', 'word_lowercase_ratio', 'word_numbers_ratio', 'word_vowels_consonant_ratio', 'word_consonant_vowels_ratio', 'max_count_same_char', 'max_count_strip_same_char', 'max_consecutive_consonants', 'word_accents_ratio']
features in the blue area: []
features in the red area: Index(['word_consonant_ratio', 'word_other_characters_ratio',
       'max_consecutive_vowels'],
      dtype='object')


features in the green area: ['word_length', 'word_vowel_ratio', 'word_vowelandconsonant_ratio', 'word_punctuation_ratio', 'word_dutch_letters_ratio', 'word_uppercase_ratio', 'word_lowercase_ratio', 'word_numbers_ratio', 'word_other_characters_ratio', 'word_vowels_consonant_ratio', 'max_consecutive_consonants', 'word_accents_ratio']
features in the blue area: []
features in the red area: Index(['word_consonant_ratio', 'word_consonant_vowels_ratio',
       'max_count_same_char', 'max_count_strip_same_char',
       'max_consecutive_vowels'],
      dtype='object')


### Select features from Boruta output

In [13]:
# Model only on the features Boruta placed in the green area.
selected_features = green_area

# Predictors and labels for the training split ...
X_train, y_train = train[selected_features], train['garbage_score']

# ... and for the test split.
X_test, y_test = test[selected_features], test['garbage_score']

# Column names are captured here, before later cells replace X_train.
feature_names = X_train.columns.tolist()

### Scale features 
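Min-max scaling (fitted on the training data) is used to keep the features non-negative. The original motivation was that the SVM cannot handle negative numbers; note that the Naive Bayes model further down (MultinomialNB) is the one that strictly requires non-negative input.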

In [14]:
from sklearn.preprocessing import MinMaxScaler

# Learn the per-feature minimum and maximum on the training split only, then
# apply that same mapping to both splits.
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

### Support Vector Machine
The linear version (`LinearSVC`) is used, as the other one took too long to train.

In [26]:
from sklearn.svm import LinearSVC

# Linear SVM with default hyper-parameters. random_state is pinned (42, as
# elsewhere in this notebook) so that the fitted model, which is pickled
# later, is the same on every run.
svm = LinearSVC(random_state=42)

# Left as the last expression so the notebook displays the fitted estimator.
svm.fit(X_train, y_train)

LinearSVC()

In [27]:
# Predict labels for the test split with the fitted linear SVM.
y_pred_svm = svm.predict(X_test)

In [28]:
# Test-set metrics for the linear SVM, each rounded to three decimals.
svm_scorers = (
    ("Accuracy:", metrics.accuracy_score),
    ("Precision:", metrics.precision_score),
    ("Recall:", metrics.recall_score),
)
for label, scorer in svm_scorers:
    print(label, round(scorer(y_test, y_pred_svm), 3))

# F1 is always printed with exactly three decimals.
f1_svm = round(metrics.f1_score(y_test, y_pred_svm), 3)
print('F1 Score: %.3f' % f1_svm)

Accuracy: 0.859
Precision: 0.889
Recall: 0.793
F1 Score: 0.838


In [28]:
import pickle

# Persist the fitted SVM. The context manager closes the file handle (and so
# flushes the pickle to disk) even if serialisation fails part-way; the bare
# open(...) used before left the handle open.
with open('garbage_model_SVM_fs.pkl', 'wb') as model_file:
    pickle.dump(svm, model_file)

### Random Forest

In [29]:
# Import the random forest model
from sklearn.ensemble import RandomForestClassifier

# Random forest classifier with default hyper-parameters. random_state is
# pinned (42, as elsewhere in this notebook) so that the fitted model, which
# is pickled later, and its scores are the same on every run.
clf = RandomForestClassifier(random_state=42)

# Train the model on the training split
clf.fit(X_train, y_train)

# Predict labels for the test split
y_pred_rf = clf.predict(X_test)

In [30]:
# Test-set metrics for the random forest, each rounded to three decimals.
rf_scorers = (
    ("Accuracy:", metrics.accuracy_score),
    ("Precision:", metrics.precision_score),
    ("Recall:", metrics.recall_score),
)
for label, scorer in rf_scorers:
    print(label, round(scorer(y_test, y_pred_rf), 3))

# F1 is always printed with exactly three decimals.
f1_rf = round(metrics.f1_score(y_test, y_pred_rf), 3)
print('F1 Score: %.3f' % f1_rf)

Accuracy: 0.92
Precision: 0.95
Recall: 0.872
F1 Score: 0.909


In [30]:
import pickle

# Persist the fitted random forest. The context manager closes the file
# handle (and so flushes the pickle to disk) even if serialisation fails
# part-way; the bare open(...) used before left the handle open.
with open('garbage_model_RF_fs.pkl', 'wb') as model_file:
    pickle.dump(clf, model_file)

### KNN

In [1]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with default settings, fitted on the
# training split (fit returns the estimator itself, so the call is chained).
knn = KNeighborsClassifier().fit(X_train, y_train)

# Predicted labels for the test split.
y_pred_knn = knn.predict(X_test)

In [37]:
# Test-set metrics for the KNN classifier, each rounded to three decimals.
print("Accuracy:", round(metrics.accuracy_score(y_test, y_pred_knn), 3))
print("Precision:", round(metrics.precision_score(y_test, y_pred_knn), 3))
print("Recall:", round(metrics.recall_score(y_test, y_pred_knn), 3))
# F1 uses the same fixed three-decimal format as the SVM, random forest and
# Naive Bayes cells, so the reports line up (e.g. 0.650 instead of 0.65).
print('F1 Score: %.3f' % round(metrics.f1_score(y_test, y_pred_knn), 3))

Accuracy: 0.514
Precision: 0.486
Recall: 0.98
F1 Score: 0.65


### Naive Bayes

In [34]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes with default settings, fitted on the training split.
mnb = MultinomialNB()
mnb.fit(X_train, y_train)

# Predicted labels for the test split.
y_pred_nb = mnb.predict(X_test)

In [35]:
# Test-set metrics for Naive Bayes, each rounded to three decimals.
nb_scorers = (
    ("Accuracy:", metrics.accuracy_score),
    ("Precision:", metrics.precision_score),
    ("Recall:", metrics.recall_score),
)
for label, scorer in nb_scorers:
    print(label, round(scorer(y_test, y_pred_nb), 3))

# F1 is always printed with exactly three decimals.
f1_nb = round(metrics.f1_score(y_test, y_pred_nb), 3)
print('F1 Score: %.3f' % f1_nb)

Accuracy: 0.777
Precision: 0.922
Recall: 0.563
F1 Score: 0.699
