## **Cleaning Data** 

## Import libraries

In [8]:
import numpy as np
import seaborn as sns
import pandas as pd 
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer


# Read dataset

In [2]:
music = pd.read_csv("music_genre_dataset.csv")
music

Unnamed: 0,Artist Name,Track Name,Popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_in min/ms,time_signature,Class
0,Bruno Mars,That's What I Like (feat. Gucci Mane),60.0,0.854,0.564,1.0,-4.964,1,0.0485,0.017100,,0.0849,0.8990,134.071,234596.000000,4,5
1,Boston,Hitch a Ride,54.0,0.382,0.814,3.0,-7.230,1,0.0406,0.001100,0.004010,0.1010,0.5690,116.454,251733.000000,4,10
2,The Raincoats,No Side to Fall In,35.0,0.434,0.614,6.0,-8.334,1,0.0525,0.486000,0.000196,0.3940,0.7870,147.681,109667.000000,4,6
3,Deno,Lingo (feat. J.I & Chunkz),66.0,0.853,0.597,10.0,-6.528,0,0.0555,0.021200,,0.1220,0.5690,107.033,173968.000000,4,5
4,Red Hot Chili Peppers,Nobody Weird Like Me - Remastered,53.0,0.167,0.975,2.0,-4.279,1,0.2160,0.000169,0.016100,0.1720,0.0918,199.060,229960.000000,4,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15512,Television,Marquee Moon - Remastered,47.0,0.494,0.657,7.0,-7.543,1,0.0397,0.063200,0.449000,0.0849,0.7370,118.194,645000.000000,4,1
15513,Hulkoff,Martialisk - Svitjod Edition,39.0,0.487,0.984,1.0,-4.790,1,0.1610,0.000439,0.023400,0.3540,0.2910,120.031,225627.000000,4,8
15514,"Red Garland, Paul Chambers, Art Taylor",Almost Like Being In Love,57.0,0.561,0.346,,-12.596,0,0.0409,0.873000,0.399000,0.1040,0.6970,93.689,4.838217,4,7
15515,Squid,Sludge,41.0,0.391,0.835,7.0,-4.944,1,0.0702,0.004850,0.005140,0.1180,0.1090,129.024,300820.000000,4,6


# Type of attributes 

In [3]:
music.dtypes


Artist Name            object
Track Name             object
Popularity            float64
danceability          float64
energy                float64
key                   float64
loudness              float64
mode                    int64
speechiness           float64
acousticness          float64
instrumentalness      float64
liveness              float64
valence               float64
tempo                 float64
duration_in min/ms    float64
time_signature          int64
Class                   int64
dtype: object

# missing percentage  per attribute 

In [4]:
missing_values = music.isna().sum()/len(music)*100
missing_values

Artist Name            0.000000
Track Name             0.000000
Popularity             2.539151
danceability           0.000000
energy                 0.000000
key                   11.232841
loudness               0.000000
mode                   0.000000
speechiness            0.000000
acousticness           0.000000
instrumentalness      23.116582
liveness               0.000000
valence                0.000000
tempo                  0.000000
duration_in min/ms     0.000000
time_signature         0.000000
Class                  0.000000
dtype: float64

In [7]:
music.describe()

Unnamed: 0,Popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_in min/ms,time_signature,Class
count,15123.0,15123.0,15123.0,13433.0,15123.0,15123.0,15123.0,15123.0,11644.0,15123.0,15123.0,15123.0,15123.0,15123.0,15123.0
mean,45.582358,0.541324,0.670336,5.969031,-7.834302,0.651656,0.081049,0.236255,0.17736,0.197262,0.487802,122.877033,201434.0,3.925941,6.586854
std,17.400843,0.167205,0.234423,3.194265,3.984846,0.476461,0.08511,0.308332,0.30255,0.161018,0.239934,29.676659,113980.3,0.360298,3.242448
min,1.0,0.0599,0.00124,1.0,-36.214,0.0,0.0225,0.0,1e-06,0.0119,0.0215,30.557,0.50165,1.0,0.0
25%,34.0,0.429,0.52,3.0,-9.4515,0.0,0.0351,0.00335,9.9e-05,0.0973,0.299,99.732,165229.5,4.0,5.0
50%,45.0,0.543,0.71,6.0,-6.959,1.0,0.0483,0.0663,0.00426,0.129,0.485,120.068,209000.0,4.0,8.0
75%,57.0,0.657,0.866,9.0,-5.142,1.0,0.0849,0.404,0.204,0.26,0.674,142.6065,252946.5,4.0,10.0
max,100.0,0.989,1.0,11.0,1.355,1.0,0.955,0.996,0.996,1.0,0.986,217.416,1477187.0,5.0,10.0


## Handling missing  values in Popularity by drop rows since it's small percentage that missing

In [5]:
music=music.dropna(subset=['Popularity'])


In [6]:
missing_values = music.isna().sum()/len(music)*100
missing_values

Artist Name            0.000000
Track Name             0.000000
Popularity             0.000000
danceability           0.000000
energy                 0.000000
key                   11.175031
loudness               0.000000
mode                   0.000000
speechiness            0.000000
acousticness           0.000000
instrumentalness      23.004695
liveness               0.000000
valence                0.000000
tempo                  0.000000
duration_in min/ms     0.000000
time_signature         0.000000
Class                  0.000000
dtype: float64

# Handling missing values in key and  instrumentalness 

In [9]:
#key 




imp = IterativeImputer(missing_values=np.nan,
                       max_iter=20,tol=0.001)
imp.fit(music[['key']])
music['key'] = data=imp.transform(music[['key']]).ravel()


# instrumentalness 

imp = IterativeImputer(missing_values=np.nan,
                       max_iter=20,tol=0.001)
imp.fit(music[['instrumentalness']])
music['instrumentalness'] = data=imp.transform(music[['instrumentalness']]).ravel()




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  music['key'] = data=imp.transform(music[['key']]).ravel()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  music['instrumentalness'] = data=imp.transform(music[['instrumentalness']]).ravel()


In [10]:
missing_values = music.isna().sum()/len(music)*100
missing_values

Artist Name           0.0
Track Name            0.0
Popularity            0.0
danceability          0.0
energy                0.0
key                   0.0
loudness              0.0
mode                  0.0
speechiness           0.0
acousticness          0.0
instrumentalness      0.0
liveness              0.0
valence               0.0
tempo                 0.0
duration_in min/ms    0.0
time_signature        0.0
Class                 0.0
dtype: float64

## **SparkSQL**

## Query number one 

## Query number two

## Query number three

## **Visualization**

## **SparkML**