In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
# Path of the Spotify track-features CSV, relative to the notebook's working directory
file_path = 'SpotifyFeatures.csv'
data = pd.read_csv(file_path)
# Preview the first rows to confirm the file loaded
data.head()

### Visualizing the data

In [None]:
# Count the rows per track_id; a track_id that occurs more than once is a "duplicate" song
# (same track_id = same song)
data['track_id'].value_counts()

In [None]:
# Distribution of duplication: how many track_ids occur once, twice, three times, ...
data['track_id'].value_counts().value_counts()

In [None]:
# Show every row of one sample track_id to understand why songs are duplicated
data[data['track_id'] == '6AIte2Iej1QKlaofpjCzW1']

In [None]:
# Number of rows (songs) per genre
data['genre'].value_counts()

In [None]:
# Summary statistics of the dataset's columns
data.describe()

In [None]:
# Count the songs strictly above 50 in popularity. Every other song (including those
# at exactly 50) is a miss, so hits + miss always equals the number of rows and the
# ratio below is a share of the whole dataset.
hits = int((data['popularity'] > 50).sum())
miss = len(data) - hits

# Visualize the distribution of popularity
sns.histplot(data['popularity'], kde=True)
plt.show()

hits_ratio = hits / (hits + miss) * 100
print(f"{hits_ratio:.3f}% of the songs are above 50 in popularity")
print(f"with an average popularity of {data['popularity'].mean():.3f}")

### Conclusions
- The data contains songs that are too long or too short; we want 8.3 minutes to be the maximum and 1 minute to be the minimum because we want to measure normal songs
- Some attributes need to be transformed to a numerical scale
- The A Capella genre is too small to be considered
- `Children's Music` and `Children’s Music` (straight vs. curly apostrophe) should be the same genre

### Data prepping steps
- Remove songs that are too long: duration above 500000 ms (500 seconds ≈ 8.3 minutes)
- Remove songs that are too short: duration below 60000 ms (60 seconds = 1 minute)
- Remove A Capella songs because the genre is too small
- Combine the two children's genres into one genre, Children's Music
- Split songs into their genres
- Convert songs to Miss or Hit based on popularity

### Create new attribute
- Genre count for each song (a song listed under several genres is multi-labeled)

In [None]:
# Keep songs strictly between 60000 ms (1 minute) and 500000 ms (~8.3 minutes)
# and drop the A Capella genre. The trailing .copy() makes the result an independent
# frame, so the column assignments below do not raise SettingWithCopyWarning.
data = data[
    (data['duration_ms'] < 500000)
    & (data['duration_ms'] > 60000)
    & (data['genre'] != 'A Capella')
].copy()

# Merge the curly-apostrophe spelling into the straight-apostrophe genre name
data['genre'] = data['genre'].replace('Children’s Music', 'Children\'s Music')

# New column: number of genre rows each track_id appears in
data['genre_count'] = data.groupby('track_id')['genre'].transform('count')

### Data Preprocessing
- Pitch preprocessing
- Time signature preprocessing
- Mode preprocessing (minor/major)
- Create datasets for each genre
- Flop or Bop labeling

In [None]:
# Convert a note name to its numeric pitch class
def pitch_to_number(pitch):
    """Return the pitch class (0-11, C = 0) for a note name such as 'C#' or 'Bb'.

    Names that are not in the table return None.
    """
    # One tuple per pitch class, ordered from 0 to 11; each tuple lists the
    # spellings that share that pitch class.
    spellings_by_class = (
        ('C',),
        ('C#', 'Db'),
        ('D',),
        ('D#', 'Eb'),
        ('E', 'Fb'),
        ('E#', 'F'),
        ('F#', 'Gb'),
        ('G',),
        ('G#', 'Ab'),
        ('A',),
        ('A#', 'Bb'),
        ('B', 'Cb'),
    )
    lookup = {
        name: number
        for number, names in enumerate(spellings_by_class)
        for name in names
    }
    return lookup.get(pitch)

# Replace the 'key' column (note names) with its numeric pitch-class representation.
# NOTE: not safe to re-run — pitch_to_number returns None for anything that is not a
# known note name, so applying it to the already-converted numbers yields None.
data['key'] = data['key'].apply(pitch_to_number)

In [None]:
# Keep only the part before the '/' of signatures such as '4/4' and store it as int.
# Casting to str first makes the cell safe to re-run: values that are already
# integers pass through unchanged instead of raising AttributeError on .split.
data['time_signature'] = data['time_signature'].astype(str).str.split('/').str[0].astype(int)

In [None]:
# Encode the mode column: Minor -> 0, Major -> 1
mode_codes = {"Minor": 0, "Major": 1}
data['mode'] = data['mode'].replace(mode_codes)

In [None]:
# Mean popularity per genre, ordered from the most to the least popular genre
grouped = data.groupby('genre')
mean_popularity = grouped['popularity'].mean().sort_values(ascending=False)

In [None]:
# Label a single row as 'bop' or 'flop' relative to its genre's mean popularity
def label_popularity(row, mean_popularity):
    """Return 'bop' when the row's popularity is at least its genre's mean, else 'flop'.

    row: mapping with 'popularity' and 'genre' entries.
    mean_popularity: mapping from genre to that genre's mean popularity.
    """
    popularity = row['popularity']
    genre_mean = mean_popularity[row['genre']]
    return 'bop' if popularity >= genre_mean else 'flop'

# Initial label: 'bop' when a song is at least as popular as its genre's mean, else 'flop'
data['popularity_label'] = data.apply(lambda row: label_popularity(row, mean_popularity), axis=1)

# Re-label so that each genre is split 50/50 into bop and flop.
# Every song gets a unique rank inside its genre (1 = most popular; ties are broken by
# row order via method='first'), so the top half and the bottom half can never overlap.
# Selecting the halves with nlargest/nsmallest instead would pick the same rows twice
# whenever several songs tie at the median popularity, leaving the split unbalanced.
# In a genre with an odd number of songs, the middle song keeps its mean-based label.
genre_popularity = data.groupby('genre')['popularity']
genre_rank = genre_popularity.rank(method='first', ascending=False)
genre_size = genre_popularity.transform('size')
half = genre_size // 2
data.loc[genre_rank <= half, 'popularity_label'] = 'bop'
data.loc[genre_rank > genre_size - half, 'popularity_label'] = 'flop'

### Visualization so far
- Visualize the distribution of the target variable in terms of genres
- Visualize, per genre, how many Spotify songs have a popularity over 50 versus under 50
- Visualize the distribution of the new 50/50 split to balance the dataset

In [None]:
# Bar chart of the mean popularity of each genre, with Children's Music highlighted in red.
# The bars are drawn once, on an explicit Axes, rather than plotting the series with
# pandas and then drawing a second set of bars over it with plt.bar.
colors = ['red' if genre == 'Children\'s Music' else 'skyblue' for genre in mean_popularity.index]
genre_fig, genre_ax = plt.subplots(figsize=(15, 10))
genre_ax.bar(mean_popularity.index, mean_popularity, color=colors)
genre_ax.set_title('Mean Popularity of Each Genre')
genre_ax.set_xlabel('Genre')
genre_ax.set_ylabel('Mean Popularity')
# Vertical tick labels keep the genre names readable
genre_ax.tick_params(axis='x', rotation=90)
plt.show()

In [None]:
# Flag the songs whose popularity is at least 50
data['popularity_over_50'] = data['popularity'].ge(50)

# Rows: genre, columns: the False/True flag, values: number of songs
popularity_over_50_df = (
    data.groupby(['genre', 'popularity_over_50'])
    .size()
    .unstack(fill_value=0)
)
popularity_over_50_df['total'] = popularity_over_50_df.sum(axis=1)

# Order the genres by total song count; the helper column is dropped before plotting
popularity_over_50_df_sorted = (
    popularity_over_50_df
    .sort_values(by='total', ascending=False)
    .drop(columns='total')
)

# Stacked bar chart per genre
popularity_over_50_df_sorted.plot(kind='bar', stacked=True, figsize=(20, 10))
plt.title('Popularity Over 50 by Genre')
plt.xlabel('Genre')
plt.ylabel('Count')
plt.show()

In [None]:
# Train one random forest per genre on the audio features, predicting the
# popularity_over_50 flag, and record each genre's test accuracy.
audio_features = ['acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness',
                  'loudness', 'speechiness', 'tempo', 'valence']
accuracy_dict_over50 = {}
for genre in data['genre'].unique():
    # Rows of this genre, without any row that has a missing value
    genre_data = data[data['genre'] == genre].dropna()
    X = genre_data[audio_features]
    y = genre_data['popularity_over_50']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    accuracy_dict_over50[genre] = accuracy
    print(f'Accuracy for {genre}: {accuracy}')


In [None]:
# Re-order the per-genre accuracies from best to worst
accuracy_dict_over50 = dict(
    sorted(accuracy_dict_over50.items(), key=lambda genre_score: genre_score[1], reverse=True)
)

accuracy_df = pd.DataFrame({
    'Genre': list(accuracy_dict_over50.keys()),
    'Accuracy': list(accuracy_dict_over50.values()),
})

# Bar chart of the accuracy per genre
accuracy_df.plot(kind='bar', x='Genre', y='Accuracy', figsize=(20, 10), legend=True)
plt.title('Model Accuracy by Genre')
plt.ylabel('Accuracy')
plt.xlabel('Genre')
plt.xticks(rotation=45)
plt.show()

In [None]:

# Rows: genre, columns: bop/flop label, values: number of songs
popularity_label_df = (
    data.groupby(['genre', 'popularity_label'])
    .size()
    .unstack(fill_value=0)
)
popularity_label_df['total'] = popularity_label_df.sum(axis=1)

# Order the genres by total song count; the helper column is dropped before plotting
popularity_label_df_sorted = (
    popularity_label_df
    .sort_values(by='total', ascending=False)
    .drop(columns='total')
)

# Stacked bar chart per genre
popularity_label_df_sorted.plot(kind='bar', stacked=True, figsize=(20, 10))
plt.title('Popularity Label by Genre')
plt.xlabel('Genre')
plt.ylabel('Count')
plt.show()


In [None]:
# Train one random forest per genre on the audio features, predicting the
# bop/flop popularity_label, and record each genre's test accuracy.
audio_features = ['acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness',
                  'loudness', 'speechiness', 'tempo', 'valence']
accuracy_dict_50percent = {}
for genre in data['genre'].unique():
    # Rows of this genre, without any row that has a missing value
    genre_data = data[data['genre'] == genre].dropna()
    X = genre_data[audio_features]
    y = genre_data['popularity_label']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    accuracy_dict_50percent[genre] = accuracy
    print(f'Accuracy for {genre}: {accuracy}')

    


In [None]:
# Re-order the per-genre accuracies from best to worst
accuracy_dict_50percent = dict(
    sorted(accuracy_dict_50percent.items(), key=lambda genre_score: genre_score[1], reverse=True)
)

accuracy_df = pd.DataFrame({
    'Genre': list(accuracy_dict_50percent.keys()),
    'Accuracy': list(accuracy_dict_50percent.values()),
})

# Bar chart of the accuracy per genre
accuracy_df.plot(kind='bar', x='Genre', y='Accuracy', figsize=(20, 10), legend=True)
plt.title('Model Accuracy by Genre')
plt.ylabel('Accuracy')
plt.xlabel('Genre')
plt.xticks(rotation=45)
plt.show()

### Comments on genre selection to further analyze
- By splitting the genres into their own column we can now analyze the data per genre, which results in 25 datasets to analyze. To make the analysis easier to follow, we will only analyze the biggest dataset: Children's Music.

### Next steps
- quick overview
  - check for duplicates
  - check avg popularity (to understand target variable)
  - find weird values
  - Analyze the artist distribution (to understand whether the artists with the most songs can make the dataset biased) # side note: each artist has a different style, so this could be a good thing
- Analyze 

In [None]:
# Children's Music subset. The .copy() makes it an independent frame: later cells
# assign into its columns, which would raise SettingWithCopyWarning on a plain slice.
childrens_music = data[data['genre'] == 'Children\'s Music'].copy()

# Plot the distribution of popularity for Children's Music
sns.histplot(childrens_music['popularity'])
plt.title('Popularity Distribution for Children\'s Music')
plt.xlabel('Popularity')
plt.ylabel('Count')
plt.show()

# Mean popularity of the subset
print(childrens_music['popularity'].mean())

In [None]:
# Print the rows whose track_id is flagged as a duplicate within the Children's Music subset
print(childrens_music[childrens_music['track_id'].duplicated()])

In [None]:
# Inspect one sample track that contains bad values
childrens_music[childrens_music['track_id'] == '7ARLbcqLgOrBI2JfzfKtHD']

In [None]:
# Replace the literal '$' in the track_name and artist_name columns with 's'.
# regex=False is required: when the pattern is treated as a regular expression
# (the default before pandas 2.0), '$' is the end-of-string anchor, so the call
# would append an 's' to every value instead of replacing the dollar signs.
childrens_music.loc[:, 'track_name'] = childrens_music['track_name'].str.replace('$', 's', regex=False)
childrens_music.loc[:, 'artist_name'] = childrens_music['artist_name'].str.replace('$', 's', regex=False)

# Re-inspect the sample track to confirm the fix
childrens_music[childrens_music['track_id'] == '7ARLbcqLgOrBI2JfzfKtHD']

In [None]:
total_songs = len(childrens_music)

# Mean popularity per artist
avg_popularity_by_artist = childrens_music.groupby('artist_name')['popularity'].mean()

# The 20 artists with the most songs, as a two-column frame
top_20_artists = childrens_music['artist_name'].value_counts().head(20)
top_20_artists_df = top_20_artists.reset_index()
top_20_artists_df.columns = ['artist_name', 'Song Count']

# Attach each top artist's mean popularity
top_20_artists_with_avg_popularity = pd.merge(top_20_artists_df, avg_popularity_by_artist, on='artist_name')

# Each artist's share of all Children's Music songs, in percent (2 decimals)
song_share = (top_20_artists_with_avg_popularity['Song Count'] / total_songs) * 100
top_20_artists_with_avg_popularity['% of songs'] = song_share.round(2)

# Mean popularity rounded to 2 decimals
top_20_artists_with_avg_popularity['popularity'] = top_20_artists_with_avg_popularity['popularity'].round(2)

print(top_20_artists_with_avg_popularity)

In [None]:
# Columns to plot: audio features, duration and genre count
columns = ['acousticness', 'danceability', 'duration_ms', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'genre_count']

# 5 x 2 grid of axes, flattened so each column gets one panel
fig, ax = plt.subplots(5, 2, figsize=(20, 20))
ax = ax.flatten()

# One histogram (with KDE curve) per column
for i, col in enumerate(columns):
    panel = ax[i]
    sns.histplot(childrens_music[col], kde=True, ax=panel)
    panel.set_title(f'{col} Distribution')

plt.tight_layout()
plt.show()


In [None]:
# Dive deeper into instrumentalness
instrumentalness = childrens_music['instrumentalness']

# Printed explicitly: only a cell's last expression is displayed, so a bare
# describe() call in the middle of the cell would be discarded.
print(instrumentalness.describe())

# How many songs have an instrumentalness above 0, out of all songs in the subset
songs_above_zero = int((instrumentalness > 0).sum())
print(f"{songs_above_zero} of {childrens_music.shape[0]} songs have instrumentalness above 0")

# The 10 most common instrumentalness values
instrumentalness.value_counts().head(10)

### Test a model to see 
- feature importance
- current performance

In [None]:
# Run a test to see what feature is the most important
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Baseline model: audio features only, predicting the bop/flop label
baseline_features = ['acousticness', 'danceability', 'energy', 'instrumentalness',
                     'liveness', 'loudness', 'speechiness', 'tempo']
X = childrens_music[baseline_features]
y = childrens_music['popularity_label']

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model_0 = RandomForestClassifier(n_estimators=100, random_state=42)
model_0.fit(X_train, y_train)

y_pred = model_0.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
# Importance of each training feature in model_0, most important first
feature_importances = pd.DataFrame(
    {'importance': model_0.feature_importances_},
    index=X_train.columns,
).sort_values('importance', ascending=False)

print("feature importances")
print(feature_importances)

###  Conclusion on Children's Music dataset
- The 2 biggest artists contribute 10% of the dataset, which is not enough to make the dataset biased
- The dataset needs some scaling and normalization before it can be used in a model
- The target variable is now flop or bop, so popularity is not needed anymore
- The most important features can be viewed above.
- The dataset is now free from weird values and duplicates

### Next steps
- Perform a log transformation on the dataset to make it a bit closer to normally distributed
- Split the dataset into a train and test set before scaling to prevent data leakage
- Scale the dataset before performing PCA
- Perform PCA to reduce the dimensionality of the dataset

In [None]:
# Perform a log transformation (log1p) on liveness and speechiness.
# .assign builds a new frame instead of writing into a filtered slice of `data`,
# which avoids SettingWithCopyWarning.
# NOTE: re-running this cell applies the transformation a second time.
childrens_music = childrens_music.assign(
    liveness=np.log1p(childrens_music['liveness']),
    speechiness=np.log1p(childrens_music['speechiness']),
)

# 5 x 2 grid of axes, flattened so each column gets one panel
fig, ax = plt.subplots(5, 2, figsize=(20, 20))
ax = ax.flatten()

# One histogram (with KDE curve) per column listed in `columns`
for i, col in enumerate(columns):
    sns.histplot(childrens_music[col], kde=True, ax=ax[i])
    ax[i].set_title(f'{col} Distribution')

plt.tight_layout()
plt.show()

# Save the prepared subset to CSV
childrens_music.to_csv('childrens_music.csv', index=False)

In [None]:
# Columns to keep, in the order we want them: identifiers first, then model inputs
identifier_columns = ['track_name', 'artist_name', 'track_id']
input_columns = ['genre_count', 'mode', 'time_signature', 'acousticness', 'danceability',
                 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness',
                 'tempo', 'duration_ms', 'valence']
X = childrens_music[identifier_columns + input_columns]
y = childrens_music['popularity_label']

# 80/20 train/test split (fixed seed) used for the final model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train.head()

In [None]:
# Features fed to the model (identifier columns and duration_ms are left out)
predict_features = ['genre_count', 'mode', 'time_signature','acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'valence']

# Random forest on the unscaled features
model_2 = RandomForestClassifier(n_estimators=100, random_state=42)
model_2.fit(X_train[predict_features], y_train)

y_pred = model_2.predict(X_test[predict_features])
print(classification_report(y_test, y_pred))

# Importance of each feature, most important first
feature_importances = pd.DataFrame(
    {'importance': model_2.feature_importances_},
    index=predict_features,
).sort_values('importance', ascending=False)
feature_importances

#### Scaling selection
Standard scaler: This technique transforms the features so they have the properties of a standard normal distribution with a mean of 0 and a standard deviation of 1. It's useful when your data follows a Gaussian distribution and when using algorithms sensitive to variance in the data, such as Support Vector Machines (SVMs) and Principal Component Analysis (PCA). From this we will exclude genre count.

In [None]:
from sklearn.preprocessing import StandardScaler

# Numerical features to standardize
numerical_features = ['acousticness', 'danceability', 'duration_ms', 'energy',
                      'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'valence']

scaler = StandardScaler()

# Work on independent copies of the split frames: assigning columns into the frames
# returned by train_test_split raises SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Fit the scaler on the training data only, then apply the same transformation
# to the test data, so no test-set statistics leak into the scaling.
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])

X_test.head()

In [None]:
# Same feature set as before; X_train / X_test now hold the standardized values
predict_features = ['genre_count', 'mode', 'time_signature','acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'valence']

# Random forest re-trained on the scaled features
model_2 = RandomForestClassifier(n_estimators=100, random_state=42)
model_2.fit(X_train[predict_features], y_train)

y_pred = model_2.predict(X_test[predict_features])
print(classification_report(y_test, y_pred))

In [None]:
# Features used for the correlation analysis (and later for PCA)
features = ['acousticness', 'danceability', 'duration_ms', 'energy',
            'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'valence', 'genre_count', 'mode', 'time_signature']

# Pairwise correlation between the features (columns are the variables)
corr_matrix = np.corrcoef(X_train[features].to_numpy(dtype=float), rowvar=False)
# Label both axes with the feature names so the heatmap is readable
sns.heatmap(corr_matrix, annot=True, xticklabels=features, yticklabels=features)
plt.show()

high_correlation_threshold = 0.41

# Count the feature pairs whose absolute correlation exceeds the threshold.
# Only the strict upper triangle (k=1) is inspected, so each pair is counted once and
# the diagonal (self-correlation) is excluded by position. Excluding it with
# `corr_matrix != 1` is unreliable: a diagonal entry can differ from 1 by floating-point
# rounding, and a genuinely perfect correlation between two features would be dropped.
pair_correlations = corr_matrix[np.triu_indices_from(corr_matrix, k=1)]
high_correlations = int(np.sum(np.abs(pair_correlations) > high_correlation_threshold))

# Decide whether PCA is recommended
# This threshold can be adjusted based on the size of the matrix or domain knowledge
pca_recommended = high_correlations > len(corr_matrix) * 0.5

print(f"PCA is {'recommended' if pca_recommended else 'not recommended'} based on the correlation matrix. There are {int(high_correlations)} pairs of highly correlated variables.")

### We decided to use PCA
We want to see the impact of the features on the target variable and whether we can reduce the dimensionality of the dataset. The analysis shows that the dataset is not too big and that the correlations between the features are not too high, so we might not need PCA. We will still use it to see if we can reduce the dimensionality of the dataset.

In [None]:
from sklearn.decomposition import PCA
# Fit a full PCA to see how much variance each component explains
pca = PCA()
pca.fit(X_train[features])

# Cumulative explained variance ratio and the share we want to retain
cumulative_explained_variance = np.cumsum(pca.explained_variance_ratio_)
limit = 0.85

# Position (1-based) of the first component count that reaches the limit
first_component_at_limit = np.flatnonzero(cumulative_explained_variance >= limit)[0] + 1
component_counts = range(1, len(cumulative_explained_variance) + 1)

# Plot the cumulative explained variance to visualize the optimal number of components
plt.figure(figsize=(10, 6))
plt.plot(component_counts, cumulative_explained_variance, marker='o', linestyle='--')
plt.title('Cumulative Explained Variance by PCA Components')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.axhline(y=limit, color='r', linestyle='--', label='85% Explained Variance')
plt.axvline(x=first_component_at_limit, color='r', linestyle='--')
plt.legend(loc='best')
plt.grid(True)
plt.show()

# Exact number of components explaining 85% of the variance
optimal_n_components = first_component_at_limit
print(f"Optimal number of components to retain 85% of the variance: {optimal_n_components}")

# Fit PCA again with that number of components: fitted on the training data,
# then applied to the test data
pca_optimal = PCA(n_components=optimal_n_components)
X_train_pca_optimal = pca_optimal.fit_transform(X_train[features])
X_test_pca_optimal = pca_optimal.transform(X_test[features])

In [None]:
# DataFrames of the PCA-transformed training and testing data, for easier analysis
pca_columns = [f'PC{i+1}' for i in range(optimal_n_components)]
pca_train_df = pd.DataFrame(X_train_pca_optimal, columns=pca_columns)
pca_test_df = pd.DataFrame(X_test_pca_optimal, columns=pca_columns)

# Display the first few rows of the PCA-transformed training DataFrame
print(pca_train_df.head())

# Variance explained by the chosen components. This reads from pca_optimal (the
# reduced model actually used for the transform); reading from the full `pca`
# would report every component, not only the retained ones.
explained_variance_ratio = pca_optimal.explained_variance_ratio_
cumulative_variance = explained_variance_ratio.cumsum()

print("Explained variance ratio by component:", explained_variance_ratio)
print("Cumulative variance explained:", cumulative_variance)

In [None]:
# Random forest trained on the PCA components instead of the original features
model_pca = RandomForestClassifier(n_estimators=100, random_state=42)
model_pca.fit(X_train_pca_optimal, y_train)

# Evaluate on the PCA-transformed test set
y_pred_pca = model_pca.predict(X_test_pca_optimal)
print(classification_report(y_test, y_pred_pca))

# Classification model selection
- Random Forest classifier
- Support Vector Machine
- XGBoost
- Logistic Regression 
- Decision Tree Classifier
- K-Nearest Neighbors Classifier

In [None]:
# Selection of the best model
from sklearn.svm import SVC, LinearSVC
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import make_scorer, accuracy_score, roc_auc_score 
from sklearn.model_selection import GridSearchCV
# The datasets with PCA components are X_train_pca_optimal and X_test_pca_optimal.
# They are only named here: a bare expression in the middle of a cell has no effect.

# The datasets without PCA components
X_train_normal = X_train[features]
X_test_normal = X_test[features]

In [None]:
# Convert flop / bop to 0 / 1.
# The explicit .astype(int) guarantees integer labels for the classifiers; without it
# the dtype depends on pandas silently downcasting the result of replace, which is
# deprecated behaviour. The cell is safe to re-run: already-encoded labels are unchanged.
label_codes = {'flop': 0, 'bop': 1}
y_train = y_train.replace(label_codes).astype(int)
y_test = y_test.replace(label_codes).astype(int)

## Random Forest Classifier

In [None]:
# Random forest on the original features; seeded so the reported scores are
# reproducible across runs
RFC_Model = RandomForestClassifier(random_state=42)
RFC_Model.fit(X_train_normal, y_train)
RFC_Predict = RFC_Model.predict(X_test_normal)
RFC_Accuracy = accuracy_score(y_test, RFC_Predict)
print("Accuracy: " + str(RFC_Accuracy))
# AUC is a ranking metric: it needs a continuous score (the predicted probability of
# the greater class label), not the hard 0/1 predictions, which would collapse the
# ROC curve to a single operating point.
RFC_AUC = roc_auc_score(y_test, RFC_Model.predict_proba(X_test_normal)[:, 1])
print("AUC: " + str(RFC_AUC))

## Support Vector Machine

In [None]:
# Support vector classifier on the original features
SVC_Model = SVC()
SVC_Model.fit(X_train_normal, y_train)
SVC_Predict = SVC_Model.predict(X_test_normal)
SVC_Accuracy = accuracy_score(y_test, SVC_Predict)
print("Accuracy: " + str(SVC_Accuracy))
# AUC needs a continuous score, not hard 0/1 predictions. SVC() is created without
# probability estimates, so the signed distance to the decision boundary is used.
SVC_AUC = roc_auc_score(y_test, SVC_Model.decision_function(X_test_normal))
print("AUC: " + str(SVC_AUC))

In [None]:
# With PCA
# Stored under separate SVC_PCA_* names so the scores of the non-PCA SVC are not
# overwritten; the final model comparison then uses the original features for every model.
SVC_PCA_Model = SVC()
SVC_PCA_Model.fit(X_train_pca_optimal, y_train)
SVC_PCA_Predict = SVC_PCA_Model.predict(X_test_pca_optimal)
SVC_PCA_Accuracy = accuracy_score(y_test, SVC_PCA_Predict)
print("Accuracy: " + str(SVC_PCA_Accuracy))
# AUC needs a continuous score, not hard 0/1 predictions
SVC_PCA_AUC = roc_auc_score(y_test, SVC_PCA_Model.decision_function(X_test_pca_optimal))
print("AUC: " + str(SVC_PCA_AUC))

## XGBoost


In [None]:
# XGBoost classifier on the original features
XGB_Model = XGBClassifier()
XGB_Model.fit(X_train_normal, y_train)
XGB_Predict = XGB_Model.predict(X_test_normal)
XGB_Accuracy = accuracy_score(y_test, XGB_Predict)
print("Accuracy: " + str(XGB_Accuracy))
# AUC needs a continuous score (predicted probability of the greater class label),
# not the hard 0/1 predictions
XGB_AUC = roc_auc_score(y_test, XGB_Model.predict_proba(X_test_normal)[:, 1])
print("AUC: " + str(XGB_AUC))

## Logistic Regression

In [None]:
# Logistic regression on the original features
LR_Model = LogisticRegression()
LR_Model.fit(X_train_normal, y_train)
LR_Predict = LR_Model.predict(X_test_normal)
LR_Accuracy = accuracy_score(y_test, LR_Predict)
print("Accuracy: " + str(LR_Accuracy))
# AUC needs a continuous score (predicted probability of the greater class label),
# not the hard 0/1 predictions
LR_AUC = roc_auc_score(y_test, LR_Model.predict_proba(X_test_normal)[:, 1])
print("AUC: " + str(LR_AUC))

## Decision Tree Classifier

In [None]:
# Decision tree on the original features; seeded so the reported scores are
# reproducible across runs
DT_Model = DecisionTreeClassifier(random_state=42)
DT_Model.fit(X_train_normal, y_train)
DT_Predict = DT_Model.predict(X_test_normal)
DT_Accuracy = accuracy_score(y_test, DT_Predict)
print("Accuracy: " + str(DT_Accuracy))
# AUC needs a continuous score (predicted probability of the greater class label),
# not the hard 0/1 predictions
DT_AUC = roc_auc_score(y_test, DT_Model.predict_proba(X_test_normal)[:, 1])
print("AUC: " + str(DT_AUC))

In [None]:
# With PCA
# Stored under separate DT_PCA_* names so the scores of the non-PCA decision tree are
# not overwritten; the final model comparison then uses the original features for every model.
DT_PCA_Model = DecisionTreeClassifier(random_state=42)
DT_PCA_Model.fit(X_train_pca_optimal, y_train)
DT_PCA_Predict = DT_PCA_Model.predict(X_test_pca_optimal)
DT_PCA_Accuracy = accuracy_score(y_test, DT_PCA_Predict)
print("Accuracy: " + str(DT_PCA_Accuracy))
# AUC needs a continuous score, not hard 0/1 predictions
DT_PCA_AUC = roc_auc_score(y_test, DT_PCA_Model.predict_proba(X_test_pca_optimal)[:, 1])
print("AUC: " + str(DT_PCA_AUC))

## K-Nearest Neighbors Classifier

In [None]:
# K-nearest neighbors on the original features
KNN_Model = KNeighborsClassifier()
KNN_Model.fit(X_train_normal, y_train)
KNN_Predict = KNN_Model.predict(X_test_normal)
KNN_Accuracy = accuracy_score(y_test, KNN_Predict)
print("Accuracy: " + str(KNN_Accuracy))
# AUC needs a continuous score (predicted probability of the greater class label),
# not the hard 0/1 predictions
KNN_AUC = roc_auc_score(y_test, KNN_Model.predict_proba(X_test_normal)[:, 1])
print("AUC: " + str(KNN_AUC))

In [None]:
# With PCA
# Stored under separate KNN_PCA_* names so the scores of the non-PCA KNN are not
# overwritten; the final model comparison then uses the original features for every model.
KNN_PCA_Model = KNeighborsClassifier()
KNN_PCA_Model.fit(X_train_pca_optimal, y_train)
KNN_PCA_Predict = KNN_PCA_Model.predict(X_test_pca_optimal)
KNN_PCA_Accuracy = accuracy_score(y_test, KNN_PCA_Predict)
print("Accuracy: " + str(KNN_PCA_Accuracy))
# AUC needs a continuous score, not hard 0/1 predictions
KNN_PCA_AUC = roc_auc_score(y_test, KNN_PCA_Model.predict_proba(X_test_pca_optimal)[:, 1])
print("AUC: " + str(KNN_PCA_AUC))

In [None]:
# Compare the models: one chart ordered by accuracy and one ordered by AUC
models = ['RFC', 'SVC', 'XGB', 'LR', 'DT', 'KNN']
accuracy = [RFC_Accuracy, SVC_Accuracy, XGB_Accuracy, LR_Accuracy, DT_Accuracy, KNN_Accuracy]
auc = [RFC_AUC, SVC_AUC, XGB_AUC, LR_AUC, DT_AUC, KNN_AUC]

# One frame per metric, sorted ascending so the best model is the last row
accuracy_df = pd.DataFrame({'Model': models, 'Accuracy': accuracy}).sort_values(by='Accuracy', ascending=True)
auc_df = pd.DataFrame({'Model': models, 'AUC': auc}).sort_values(by='AUC', ascending=True)

top_accuracy_row = accuracy_df.iloc[-1]
top_auc_row = auc_df.iloc[-1]
best_model_accuracy = top_accuracy_row['Model']
best_model_AUC = top_auc_row['Model']

print(f"The best model based on accuracy is {best_model_accuracy} with an accuracy of {top_accuracy_row['Accuracy']:.3f}")
print(f"The best model based on AUC is {best_model_AUC} with an AUC of {top_auc_row['AUC']:.3f}")

# The best model of each chart is drawn in red, the others in sky blue
color_accuracy = ['red' if model == best_model_accuracy else 'skyblue' for model in accuracy_df['Model']]
color_auc = ['red' if model == best_model_AUC else 'skyblue' for model in auc_df['Model']]

# Two side-by-side panels: accuracy on the left, AUC on the right
fig, ax = plt.subplots(1, 2, figsize=(12, 3))
panels = [
    (accuracy_df, 'Accuracy', color_accuracy, 'Model Accuracy Score'),
    (auc_df, 'AUC', color_auc, 'Model AUC Score'),
]
for panel_axis, (scores, metric, bar_colors, panel_title) in zip(ax, panels):
    scores.plot(kind='bar', x='Model', y=metric, ax=panel_axis, legend=False, color=bar_colors)
    panel_axis.set_title(panel_title)
    panel_axis.set_ylabel(metric)

plt.tight_layout()
plt.show()

# Both rankings as tables, best model first
accuracy_df.sort_values(by='Accuracy', ascending=False), auc_df.sort_values(by='AUC', ascending=False)