In [None]:
# Mount Google Drive inside the Colab runtime so that files under
# /content/drive (the dataset workbook read later) are accessible.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
pip install catboost

Collecting catboost
  Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl (98.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.5


In [None]:
pip install eli5

Collecting eli5
  Downloading eli5-0.13.0.tar.gz (216 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m216.2/216.2 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: eli5
  Building wheel for eli5 (setup.py) ... [?25l[?25hdone
  Created wheel for eli5: filename=eli5-0.13.0-py2.py3-none-any.whl size=107720 sha256=18edaeadc090a383f27cc929484dc08e078812cc1c91cdf00f46cb82324cdfdb
  Stored in directory: /root/.cache/pip/wheels/b8/58/ef/2cf4c306898c2338d51540e0922c8e0d6028e07007085c0004
Successfully built eli5
Installing collected packages: eli5
Successfully installed eli5-0.13.0


In [24]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from scipy.stats import uniform, randint
import sklearn.metrics as skm
import sklearn.model_selection as skms
import sklearn.preprocessing as skp
from sklearn.linear_model import LogisticRegression
import sklearn.ensemble as ske
import catboost as cb
import xgboost as xgb
from sklearn.neighbors import KNeighborsClassifier as knn
import pickle
from pprint import pprint
import random
import librosa, IPython
import librosa.display as lplt
import eli5
from eli5.sklearn import PermutationImportance
# Single seed shared by every stochastic step in this notebook.
seed = 12
# Seed both global generators: NumPy's legacy RNG and the stdlib `random`
# module, which is imported above but was previously left unseeded.
np.random.seed(seed)
random.seed(seed)

In [25]:
# Load the dataset workbook from the mounted Drive folder.
dataset_path = r'/content/drive/MyDrive/Machine_L/ML_DATA 2.xlsx'
df = pd.read_excel(dataset_path)

In [26]:
# Two-way lookup between class labels and integer ids; ids follow the order
# in which each label first appears in df.
unique_labels = df.label.unique()
index_label = dict(enumerate(unique_labels))
label_index = {label: idx for idx, label in index_label.items()}

In [27]:
# Shuffle every row reproducibly, then renumber the index 0..n-1.
shuffled_rows = df.sample(frac=1, random_state=seed)
df_shuffle = shuffled_rows.reset_index(drop=True)

In [28]:
# Remove the column that is irrelevant for modelling, then split off the target.
# NOTE: both steps mutate df_shuffle in place and df_X is the very same object,
# so re-running this cell alone fails until the shuffle cell is re-run first.
df_shuffle.drop(columns=['filename'], inplace=True)
df_y = df_shuffle.pop('label')
df_X = df_shuffle

In [29]:
# Stratified split: 70 % train; the held-out 30 % is then divided 66/34
# into dev and test. The same seed is used for both splits.
X_train, df_test_valid_X, y_train, df_test_valid_y = skms.train_test_split(
    df_X,
    df_y,
    train_size=0.7,
    random_state=seed,
    stratify=df_y,
)
X_dev, X_test, y_dev, y_test = skms.train_test_split(
    df_test_valid_X,
    df_test_valid_y,
    train_size=0.66,
    random_state=seed,
    stratify=df_test_valid_y,
)

In [30]:
# Standardise the features: statistics are learned from the training split
# only and then applied unchanged to dev and test.
# NOTE(review): the rebuilt frames get a fresh 0..n-1 index while y_train /
# y_dev / y_test keep their original index; estimator fits are positional, but
# index-aligned pandas operations between X and y would not line up.
# NOTE(review): X_train/X_dev/X_test are overwritten, so re-running this cell
# re-fits the scaler on already-scaled data.
scaler = skp.StandardScaler()
feature_names = X_train.columns
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=feature_names)
X_dev = pd.DataFrame(scaler.transform(X_dev), columns=feature_names)
X_test = pd.DataFrame(scaler.transform(X_test), columns=feature_names)


In [31]:
# Persist the fitted scaler and the scaled training frame.
# Context managers guarantee each file is flushed and closed, instead of
# leaving the handle returned by open() to be closed by garbage collection.
with open('scalar.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)
with open('xtrain.pkl', 'wb') as xtrain_file:
    pickle.dump(X_train, xtrain_file)

In [32]:
# Baseline logistic regression fitted on all scaled training features; this
# fitted model is the estimator handed to PermutationImportance in the next cell.
lr = LogisticRegression(random_state=seed)
lr.fit(X_train,y_train)

In [33]:
# Permutation Importance Feature Selection
# Rank every feature by how much shuffling it lowers the score of the
# already-fitted logistic regression on the training data.
# n_iter is a PermutationImportance constructor argument. Extra keyword
# arguments to .fit() are treated as estimator fit-params and are not used
# with the default cv='prefit', so n_iter=10 must be passed here to take effect.
perm = PermutationImportance(lr, random_state=seed, n_iter=10).fit(X_train, y_train)
# Feature positions ordered from most to least important.
perm_indices = np.argsort(perm.feature_importances_)[::-1]
column_names = X_train.columns.tolist()
perm_features = [column_names[idx] for idx in perm_indices]
# Save the ranked feature names; the with-block closes the file deterministically.
with open('perm_features.pkl', 'wb') as features_file:
    pickle.dump(perm_features, features_file)

In [34]:
# Model Scoring using Permutation Importances
# Keep only the 30 highest-ranked features from the permutation ordering.
top_features = perm_features[:30]
X_train_perm = X_train[top_features]
# Alias of the same frame; this is the name the model-training cells fit on.
X_train_rfe = X_train_perm

In [35]:
#Logistic Regression
# Refit on the selected feature subset. This rebinds `lr`, replacing the
# all-features model that was used for permutation importance.
# random_state is set for consistency with the other estimators in this notebook.
lr = LogisticRegression(random_state=seed)
lr.fit(X_train_rfe, y_train)
# The with-block closes the file deterministically after the model is written.
with open('Logistic Regression.pkl', 'wb') as model_file:
    pickle.dump(lr, model_file)

In [36]:
#Random Forest
# Default-sized forest trained on the selected features, using all CPU cores.
rfc = ske.RandomForestClassifier(random_state=seed, n_jobs=-1)
rfc.fit(X_train_rfe, y_train)
# The with-block closes the file deterministically after the model is written.
with open('Random Forest.pkl', 'wb') as model_file:
    pickle.dump(rfc, model_file)

In [37]:
#AdaBoost
# 150 boosting rounds on the selected features.
abc = ske.AdaBoostClassifier(n_estimators=150, random_state=seed)
abc.fit(X_train_rfe, y_train)
# The with-block closes the file deterministically after the model is written.
with open('AdaBoost.pkl', 'wb') as model_file:
    pickle.dump(abc, model_file)

In [38]:
#Gradient Boosting
# 100 boosting stages on the selected features.
gbc = ske.GradientBoostingClassifier(n_estimators=100, random_state=seed)
gbc.fit(X_train_rfe, y_train)
# The with-block closes the file deterministically after the model is written.
with open('Gradient Boosting.pkl', 'wb') as model_file:
    pickle.dump(gbc, model_file)


In [39]:
#XGBoost
xgbc = xgb.XGBClassifier(n_estimators=100, random_state=seed)
from sklearn.preprocessing import LabelEncoder
# The classifier is fitted on integer-encoded targets rather than raw labels.
# NOTE: LabelEncoder numbers classes in sorted label order, which is not
# necessarily the first-appearance order stored in label_index / index_label.
# Decode this model's predictions with label_encoder.inverse_transform.
# NOTE: label_encoder itself is not persisted in this cell.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
xgbc.fit(X_train_rfe, y_train_encoded)
# The with-block closes the file deterministically after the model is written.
with open('XGBoost.pkl', 'wb') as model_file:
    pickle.dump(xgbc, model_file)

In [40]:
#CatBoost
# Multi-class CatBoost with silent training, tracking accuracy as the eval metric.
cbc = cb.CatBoostClassifier(random_state=seed, verbose=0, eval_metric='Accuracy', loss_function='MultiClass')
cbc.fit(X_train_rfe, y_train)
# The with-block closes the file deterministically after the model is written.
with open('CatBoost.pkl', 'wb') as model_file:
    pickle.dump(cbc, model_file)

In [41]:
#KNN
# k-nearest-neighbours with library defaults; the constructor is called
# without a random_state argument.
cls = knn()
cls.fit(X_train_rfe, y_train)
# The with-block closes the file deterministically after the model is written.
with open('KNN.pkl', 'wb') as model_file:
    pickle.dump(cls, model_file)