In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

![Screenshot 2022-12-01 at 18.01.09.png](attachment:8716832e-1392-4cb8-bb10-4910f468bd3e.png)

# Import the Data


In [3]:
# Import the Data
music_data = pd.read_csv('Data/music.csv')
music_data

Unnamed: 0,age,gender,genre
0,20,1,HipHop
1,23,1,HipHop
2,25,1,HipHop
3,25,1,HipHop
4,26,1,Jazz
5,29,1,Jazz
6,30,1,Jazz
7,31,1,Classical
8,32,-,Classical
9,33,1,Classical


# Clean the Data

In [4]:
music_data.age.str.isnumeric()
music_data.gender.str.isnumeric()

0      True
1      True
2      True
3      True
4      True
5      True
6      True
7      True
8     False
9      True
10     True
11     True
12     True
13     True
14     True
15     True
16     True
17     True
18     True
19     True
20     True
21     True
Name: gender, dtype: bool

In [5]:
music_data.drop(music_data.index[music_data['age'] == '?'], inplace=True)
music_data.drop(music_data.index[music_data['gender'] == '-'], inplace=True)
music_data.reset_index(drop=True, inplace=True)
music_data

Unnamed: 0,age,gender,genre
0,20,1,HipHop
1,23,1,HipHop
2,25,1,HipHop
3,25,1,HipHop
4,26,1,Jazz
5,29,1,Jazz
6,30,1,Jazz
7,31,1,Classical
8,33,1,Classical
9,37,1,Classical


In [6]:
duplicatedRows = music_data[music_data.duplicated()]
duplicatedRows

Unnamed: 0,age,gender,genre
3,25,1,HipHop
16,30,0,Acoustic


In [7]:
music_data.drop_duplicates(inplace=True)
music_data.reset_index(drop=True, inplace=True)
music_data

Unnamed: 0,age,gender,genre
0,20,1,HipHop
1,23,1,HipHop
2,25,1,HipHop
3,26,1,Jazz
4,29,1,Jazz
5,30,1,Jazz
6,31,1,Classical
7,33,1,Classical
8,37,1,Classical
9,20,0,Dance


# Split the Data

In [8]:
X = music_data.drop(columns=['genre'])
X

Unnamed: 0,age,gender
0,20,1
1,23,1
2,25,1
3,26,1
4,29,1
5,30,1
6,31,1
7,33,1
8,37,1
9,20,0


In [9]:
y = music_data['genre']
y

0        HipHop
1        HipHop
2        HipHop
3          Jazz
4          Jazz
5          Jazz
6     Classical
7     Classical
8     Classical
9         Dance
10        Dance
11        Dance
12     Acoustic
13     Acoustic
14     Acoustic
15    Classical
16    Classical
17    Classical
Name: genre, dtype: object

In [10]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()

# Train & Predict the model

In [11]:
model.fit(X,y)
music_data

Unnamed: 0,age,gender,genre
0,20,1,HipHop
1,23,1,HipHop
2,25,1,HipHop
3,26,1,Jazz
4,29,1,Jazz
5,30,1,Jazz
6,31,1,Classical
7,33,1,Classical
8,37,1,Classical
9,20,0,Dance


In [12]:
predictions = model.predict([[21,1], [22, 0]])
predictions

array(['HipHop', 'Dance'], dtype=object)

In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [14]:
model.fit(X_train, y_train)
predictions = model.predict(X_test)
predictions

array(['Classical', 'Jazz', 'Jazz', 'Jazz'], dtype=object)

# Evaluate accuracy

In [15]:
from sklearn.metrics import accuracy_score
score = accuracy_score(y_test, predictions)
score

0.25

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

model = DecisionTreeClassifier()
model.fit(X_train, y_train)

predictions = model.predict(X_test)

score = accuracy_score(y_test, predictions)
score

1.0

# Perisiting Models

In [17]:
import joblib

X = music_data.drop(columns=['genre'])
y = music_data['genre']

model = DecisionTreeClassifier()
model.fit(X, y)

joblib.dump(model, 'music-recommender.joblib')

['music-recommender.joblib']

In [18]:
joblib.load('music-recommender.joblib')
predictions = model.predict([ [21, 1] ])
predictions

array(['HipHop'], dtype=object)

# <b> END OF EXAMPLE SESSION 1

# <b> START OF EXAMPLE SESSION 2

# <b> Ease of doing ML analysis with PyCaret

#### PyCaret
+ PyCaret yra atvirojo kodo, low-code mašininio mokymosi biblioteka parašyta Python'u, kuri leidžia greitai pereiti nuo duomenų paruošimo iki modelio įdiegimo per kelias sekundes ir sutaupo šimtus Python'o kodo eilučių.

+ Ji apima keletą populiarių ML ir duomenų mokslo bibliotekų, kai jos oficiali svetainė yra https://pycaret.org/

# <i> Jei galite labai rekomenduoju ją naudoti.

In [4]:
#### Installation
# !pip install pycaret

# Loading data

# First look at the data 

![Screenshot 2022-12-04 at 08.24.27.png](attachment:47e561b7-ebad-4768-8a5e-d9153d44655f.png)

# Cleaning data

# Using PyCaret for ML

## Import

## Initialization

### Confusion Matrix

![Screenshot 2022-12-04 at 16.58.13.png](attachment:a4d142a3-7bb3-4a44-8dcd-bbabc96e6986.png)

#### Compare Clasification Model with their Metrics
+ For Classification Problems
  - Accuracy is the number of correct predictions divided by the total number of predictions: Accuaracy = (TP+TN) / (TP+TN+FP+FN) 
  - Precision is the number of True Positives (TP) divided by the sum of TP & FP: Precision = TP / (TP+FP)
  - Recall uses the same principle as Precision, except the focus is now on the False Negatives instead of the False Recall: TP / (TP+FN) 
  - F1 takes into consideration both Precision and Recall: 2(Precision * Recall) / (Precision + Recall) 
  - AUC measures seperability between TN & TP
  - Kappa is frequently used to test interrater reliability. 
  - MCC takes into account all four values in the confusion matrix: (TP*TN - FP-*FN) / sqrt((TP+FP)(TP+FN)(TN+FP)(TN+FN)) | A high value means that both classes (trues and negatives) are predicted well.
+ For Regression Problems
  - MAE, MSE, RMSE, R2, RMSLE and MAPE
  
 
 

## Train (& Evaluate) Model 

### Overview of PyCaret
+ Pycaret builds a model using several algorithms and compares the best
+ It automatically sort them from the best accuracy to the worst 
+ It highlightx the best model according to the classification metrics


## Create Model


## Optimize Model

## Optimize The Model Parameter

## Analyze Model

## Optimize more


## Interpret Model

## Predict the Model


## Conclusion

+ PyCaret is an open-source automates machine learning workflows.
+ For data oreparation use the setup() function.
+ Now that data preparation is done, we start the training process by using compare_models functionality.
+ Create and optimize the selected algorithm through create_model and tune_model
+ Analyze model through evaluate_model 
+ Interpret model by interpet_model() through SHAP values that do not tell you anything about model performance but instead, interpret the impact of having a certain value for a given feature in comparison to the prediction we’d make if that feature took some baseline value.
+ Finalize model by finalize_model()
+ Predict model by predict_model() to apply the tuned model to new leads to generate the score.