In [1]:
# Importing all the Required Libraries
# 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import init_notebook_mode
import seaborn as sns
import datetime as dt
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns',None)
init_notebook_mode(connected=True)

In [2]:
df=pd.read_csv('data.csv')
df.head()       
df = df[~df['explicit'].str.startswith("An error occurred")]


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16303 entries, 0 to 16311
Data columns (total 25 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   track_id                  16303 non-null  object 
 1   track_name                16300 non-null  object 
 2   track_artist              16300 non-null  object 
 3   track_popularity          16303 non-null  int64  
 4   track_album_id            16303 non-null  object 
 5   track_album_name          16300 non-null  object 
 6   track_album_release_date  15148 non-null  object 
 7   playlist_name             16303 non-null  object 
 8   playlist_id               16303 non-null  object 
 9   playlist_genre            16303 non-null  object 
 10  playlist_subgenre         16303 non-null  object 
 11  danceability              16303 non-null  float64
 12  energy                    16303 non-null  float64
 13  key                       16303 non-null  int64  
 14  loudness   

In [4]:
# Check for null values
df.isnull().sum()


track_id                       0
track_name                     3
track_artist                   3
track_popularity               0
track_album_id                 0
track_album_name               3
track_album_release_date    1155
playlist_name                  0
playlist_id                    0
playlist_genre                 0
playlist_subgenre              0
danceability                   0
energy                         0
key                            0
loudness                       0
mode                           0
speechiness                    0
acousticness                   0
instrumentalness               0
liveness                       0
valence                        0
tempo                          0
duration_ms                    0
explicit                       0
explicitness_score             0
dtype: int64

In [5]:
#checking for duplicate values
df.duplicated().value_counts()

False    16303
Name: count, dtype: int64

In [6]:
#dropping the duplicate values
df.drop_duplicates(inplace=True)
print(df.iloc[:,23])

0         True
1         True
2         True
3         True
4         True
         ...  
16307    False
16308    False
16309    False
16310    False
16311    False
Name: explicit, Length: 16303, dtype: object


In [7]:
#shape of the dataset
df.shape

(16303, 25)

In [8]:
# Description of the Data
df.describe()

Unnamed: 0,track_popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,explicitness_score
count,16303.0,16303.0,16303.0,16303.0,16303.0,16303.0,16303.0,16303.0,16303.0,16303.0,16303.0,16303.0,16303.0,16303.0
mean,42.935411,0.637117,0.690148,5.343004,-6.975548,0.595289,0.110993,0.182003,0.071944,0.18878,0.515654,121.666302,223426.72134,19.816598
std,23.593105,0.154549,0.18162,3.628919,3.119696,0.490851,0.11012,0.22495,0.208868,0.151297,0.227057,28.249903,58609.290129,58.983886
min,0.0,0.0,0.000175,0.0,-46.448,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4000.0,0.0
25%,27.0,0.535,0.574,2.0,-8.4265,0.0,0.0388,0.01615,0.0,0.0943,0.344,99.28,187015.0,0.0
50%,47.0,0.652,0.712,6.0,-6.379,1.0,0.0598,0.0852,1.1e-05,0.127,0.516,120.045,215867.0,0.0
75%,61.0,0.7505,0.832,9.0,-4.8425,1.0,0.14,0.268,0.002765,0.243,0.691,139.9785,252693.0,10.0
max,100.0,0.979,1.0,11.0,1.275,1.0,0.877,0.994,0.994,0.996,0.991,212.137,517125.0,2136.0


In [9]:
fig = px.pie(df.groupby('explicit', as_index=False).size(),
             names='explicit',
             values='size',
             labels={'size': 'Total songs'},
             hole=0.6,
             color_discrete_sequence=['green', 'crimson'],
             template='plotly_dark',
             title='<b>Songs having explicit content')
fig.update_layout(title_x=0.5)


In [10]:
fig = make_subplots(rows=3, cols=3, subplot_titles=('<i>Popularity', '<i>Danceability', '<i>Energy', '<i>Loudness', '<i>Speechiness', '<i>Acousticness', '<i>Liveness', '<i>Valence', '<i>Tempo'))

fig.add_trace(go.Histogram(x=df['track_popularity'], name='popularity'), row=1, col=1)
fig.add_trace(go.Histogram(x=df['danceability'], name='danceability'), row=1, col=2)
fig.add_trace(go.Histogram(x=df['energy'], name='energy'), row=1, col=3)
fig.add_trace(go.Histogram(x=df['loudness'], name='loudness'), row=2, col=1)
fig.add_trace(go.Histogram(x=df['speechiness'], name='speechiness'), row=2, col=2)
fig.add_trace(go.Histogram(x=df['acousticness'], name='acousticness'), row=2, col=3)
fig.add_trace(go.Histogram(x=df['liveness'], name='liveness'), row=3, col=1)
fig.add_trace(go.Histogram(x=df['valence'], name='valence'), row=3, col=2)
fig.add_trace(go.Histogram(x=df['tempo'], name='tempo'), row=3, col=3)

fig.update_layout(height=900, width=900, title_text='<b>Feature Distribution', template='plotly_dark', title_x=0.5)

fig.show()

In [11]:
# Assuming df is your DataFrame containing both numeric and non-numeric data
id_columns = ['track_id', 'track_name', 'track_artist', 'playlist_subgenre', 'playlist_genre', 'track_album_id', 'track_album_name',
              'track_album_release_date', 'playlist_name', 'playlist_id', 'explicit']  # List of columns to exclude

# Create a copy of the DataFrame excluding the ID columns
df_numeric = df.drop(columns=id_columns)

# Convert non-numeric columns to numeric
df_numeric = df_numeric.apply(pd.to_numeric, errors='coerce')

# Compute correlation matrix
corr_matrix = df_numeric.corr()

# Plot the correlation heatmap
fig = px.imshow(corr_matrix,
                text_auto=True,
                height=800,
                width=800,
                color_continuous_scale=px.colors.sequential.Greens,
                aspect='auto',
                title='<b>Pairwise correlation of columns')
fig.update_layout(title_x=0.5)
fig.show()


In [12]:
fig = px.box(df, 
             x='explicit', 
             y='track_popularity', 
             color='explicit', 
             template='plotly_dark',
             color_discrete_sequence=['cyan', 'magenta'],
             title='<b>Popularity Based on Explicit Content')

fig.show()


In [13]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

import numpy as np

# Split the data into training and test sets
y_data = df.iloc[:, -2]  # "explicit" column 
y_list = np.array(y_data.tolist())
# Convert the boolean values to integers
le = LabelEncoder()
y = le.fit_transform(y_list)

explicitness_score_data = df.iloc[:, -1]  # "explicitness_score" column (assuming it's the last column)
x_list = np.array(explicitness_score_data.tolist())
x = x_list.reshape(-1, 1)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=(3000/16303), random_state=42)
lst = []
# Initialize Logistic Regression classifier
lgr_classifier = LogisticRegression()

# Train the classifier
lgr_classifier.fit(x_train, y_train)

# Make predictions on the test set
y_pred = lgr_classifier.predict(x_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

# Extracting true positives, false positives, true negatives, and false negatives
TP = conf_matrix[1, 1]
FP = conf_matrix[0, 1]
TN = conf_matrix[0, 0]
FN = conf_matrix[1, 0]

# Calculate Precision (Positive Predictive Value)
precision = precision_score(y_test, y_pred)
print(f'Precision: {precision:.2f}')

# Calculate Recall (True Positive Rate or Sensitivity)
recall = recall_score(y_test, y_pred)  # or simply TP / (TP + FN)
print(f'Recall (True Positive Rate): {recall:.2f}')

# Calculate False Positive Rate
FPR = FP / (FP + TN)
print(f'False Positive Rate: {FPR:.2f}')


Accuracy: 0.79
Confusion Matrix:
 [[2152   60]
 [ 576  212]]
Precision: 0.78
Recall (True Positive Rate): 0.27
False Positive Rate: 0.03


In [41]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

import numpy as np


# Split the data into training and test sets
y_data = df.iloc[:, -2]  # "explicit" column 
y_list = np.array(y_data.tolist())
# Convert the boolean values to integers
le = LabelEncoder()
y = le.fit_transform(y_list)

# Select columns 11 to 23 (including 23)
x_data = df.iloc[:, 11:23]

x_train, x_test, y_train, y_test = train_test_split(x_data, y, test_size=(3000/16303), random_state=42)
lst = []
# Initialize Logistic Regression classifier
lgr_classifier = LogisticRegression()

# Train the classifier
lgr_classifier.fit(x_train, y_train)

# Make predictions on the test set
y_pred = lgr_classifier.predict(x_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

print(f'Accuracy: {accuracy:.2f}')

conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

# Extracting true positives, false positives, true negatives, and false negatives
TP = conf_matrix[1, 1]
FP = conf_matrix[0, 1]
TN = conf_matrix[0, 0]
FN = conf_matrix[1, 0]

# Calculate Precision (Positive Predictive Value)
precision = precision_score(y_test, y_pred)
print(f'Precision: {precision:.2f}')

# Calculate Recall (True Positive Rate or Sensitivity)
recall = recall_score(y_test, y_pred)  # or simply TP / (TP + FN)
print(f'Recall (True Positive Rate): {recall:.2f}')

# Calculate False Positive Rate
FPR = FP / (FP + TN)
print(f'False Positive Rate: {FPR:.2f}')

Accuracy: 0.74
Confusion Matrix:
 [[2212    0]
 [ 788    0]]
Precision: 0.00
Recall (True Positive Rate): 0.00
False Positive Rate: 0.00
