# Import data

In [None]:
import pandas as pd
from sklearn.impute import SimpleImputer
import numpy as np

# Load the raw songs dataset; the path is relative to the notebook's working directory.
DATASET_PATH = 'ml-03-data-processing-songs-dataset.csv'
data = pd.read_csv(DATASET_PATH)

# Drop missing values

In [None]:
# Drop columns in which more than 50% of the rows are missing.
# Columns are selected by *name* up front: dropping in place while indexing by
# the pre-drop position shifts every later position, which would drop or
# report the wrong column (or run past the end of `data.columns`).
null_counts = data.isna().sum()  # missing values per column
sparse_columns = null_counts[null_counts > data.shape[0] / 2]
data.drop(columns=sparse_columns.index, inplace=True)
for column_name, null_count in sparse_columns.items():
    print(f'Successfully dropped column: {column_name}, with {null_count} null values')

# Drop rows in which at least 50% of the remaining columns are missing.
# The threshold uses the column count *after* the column drop above.
Index_rows_missing = data[data.isna().sum(axis=1) >= data.shape[1] / 2].index
if Index_rows_missing.size > 0:
    data.drop(Index_rows_missing, axis=0, inplace=True)
    print(f'Successfully dropped {len(Index_rows_missing)} rows.')

# Impute Missing Values

In [None]:
# Split the frame by dtype so each group can be filled with a suitable strategy.
columns_numeric = data.select_dtypes(include=np.number)
columns_categorical = data.select_dtypes(include=['object'])

# Numeric gaps are filled with the column mean. fit_transform returns a bare
# array, so it is wrapped back into a DataFrame carrying the original column
# names (the row index is a fresh 0..n-1 range).
transformed_numeric = pd.DataFrame(
    SimpleImputer(strategy='mean').fit_transform(columns_numeric),
    columns=columns_numeric.columns,
)

# Categorical gaps are filled with the most frequent value of the column.
transformed_categorical = pd.DataFrame(
    SimpleImputer(strategy='most_frequent').fit_transform(columns_categorical),
    columns=columns_categorical.columns,
)

# Reassemble side by side: numeric columns first, then categorical ones.
data_new = pd.concat([transformed_numeric, transformed_categorical], axis='columns')


# Identifying outliers

In [None]:
# Report, per numeric column, the values lying more than 3 standard deviations
# from the column mean.
# The mean/std come from `data` (before imputation); the values that are
# checked come from the imputed `data_new`.
summary = data.describe()  # computed once and reused
means = summary.loc['mean']
bounds = summary.loc['std'] * 3
columns = summary.columns  # also used by the following cell
for column in columns:
    # Look statistics up by column label; integer positions on a
    # label-indexed Series are deprecated in recent pandas.
    upperbound = means[column] + bounds[column]
    lowerbound = means[column] - bounds[column]
    values = data_new[column]
    outliers = sorted(values[(values > upperbound) | (values < lowerbound)].tolist())
    if outliers:  # only report columns that actually have outliers
        print(f'{column} - (Outliers: {len(outliers)}|{round(len(outliers)/len(data_new)*100,2)}%)')
        print(f'\tList: {outliers}')
        print()

In [None]:
# Show the first numeric column ordered from largest to smallest value.
first_numeric_column = columns[0]
data_new[first_numeric_column].sort_values(ascending=False)

# Date modification

In [None]:
from datetime import datetime, date  # `date` is used by the Age cell below

# Convert month abbreviations (e.g. 'Jan') to their month number, kept as a
# string; values that are already numeric strings pass through unchanged.
# NOTE: '%b' parsing depends on the active locale.
try:
    data_new['Month'] = data_new['Month'].apply(
        lambda x: x if x.isnumeric() else str(datetime.strptime(x, '%b').month)
    )
except (ValueError, AttributeError) as err:
    # ValueError: a value is not a recognised month abbreviation.
    # AttributeError: a value is not a string.
    # The column is left unconverted in either case; report why instead of
    # hiding the cause behind a bare `except`.
    print('Error when converting')
    print(f'\t{type(err).__name__}: {err}')

# Handle incorrect values in Year column: anything below 1000 is replaced by 1992.
# A single .loc assignment writes to data_new itself; the chained form
# data_new['Year'].at[...] may only modify a temporary copy.
data_new.loc[data_new['Year'] < 1000, 'Year'] = 1992

data_new['Year'] = data_new['Year'].astype(int)  # convert to int, remove decimals

# Convert Date to Datetime (new column)

In [None]:
# Build a date from Year/Month (day fixed to the 1st) and derive the age in
# whole calendar years relative to today.
data_new['day'] = '01'  # placeholder day so a full date can be assembled
_date = pd.concat([data_new['Year'], data_new['Month'], data_new['day']], axis=1)

# For a DataFrame, to_datetime assembles the date from the year/month/day
# columns, so no `format` string is passed.
column_date = pd.to_datetime(_date)

# Vectorised replacement for looping over Series.iteritems(), which no longer
# exists in pandas 2.x.
age = (date.today().year - column_date.dt.year).tolist()

data_new['Age'] = age  # Assign new column: age


In [None]:
# Remove Year, Month and the helper 'day' column from data_new, then display
# the resulting frame.
data_new.drop(columns=['Year', 'Month', 'day'], inplace=True)
data_new

# Create Dummy variables for categorical variables

In [None]:
# One-hot encode the categorical (object) columns of the cleaned frame.
# `data_new` is used rather than the raw `data`, which still contains the
# missing values and the original Month strings handled in earlier cells.
# drop_first=True drops one level per variable to avoid redundant columns.
data_dummies = pd.get_dummies(data_new.select_dtypes(include='object'), drop_first=True)
data_dummies