# Specialist Certificate in Data Analytics Essentials Assignment

## Chapter 4: Data Preparation and Feature Engineering

## Imports

In [1]:
%cd ..

/home/michael/Documents/python_projects/UCDPA_Michael_Sandilands


In [2]:
import pandas as pd
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from umap import UMAP

## Import Data

In [3]:
invoice_lines_df = pd.read_csv('./00_Data/invoice_lines.csv')

invoice_lines_df.head()

Unnamed: 0,InvoiceLineId,InvoiceId,CustomerId,UnitPrice,Quantity,TrackName,Composer,Milliseconds,Bytes,GenreName,AlbumTitle,ArtistName
0,1,1,2,0.99,1,Balls to the Wall,,342562,5510424,Rock,Balls to the Wall,Accept
1,2,1,2,0.99,1,Restless and Wild,"F. Baltes, R.A. Smith-Diesel, S. Kaufman, U. D...",252051,4331779,Rock,Restless and Wild,Accept
2,3,2,4,0.99,1,Put The Finger On You,"Angus Young, Malcolm Young, Brian Johnson",205662,6713451,Rock,For Those About To Rock We Salute You,AC/DC
3,4,2,4,0.99,1,Inject The Venom,"Angus Young, Malcolm Young, Brian Johnson",210834,6852860,Rock,For Those About To Rock We Salute You,AC/DC
4,5,2,4,0.99,1,Evil Walks,"Angus Young, Malcolm Young, Brian Johnson",263497,8611245,Rock,For Those About To Rock We Salute You,AC/DC


## Gathering Features

I plan to cluster each of our variables, 'GenreName', 'ArtistName' and 'Milliseconds', seperately. Allowing us to make inferences about how customers are segmented across each different variable.

### Categorical Variables - 'GenreName' & 'ArtistName'

- Step 1: For these variables I'm going to count the frequency of each category across each customer. 
- Step 2: I'm then going to transform these categories into a customer - category frequency matrix. 
- Step 3: I'm going to transform these features so they follow an approximate normal distribution
- Step 4: I'm going to normalize (centre and scale) these features. 
- Step 5: Finally, because KMeans does not perform well on high dimensional data (see [k-Means Advantages and Disadvantages](https://developers.google.com/machine-learning/clustering/algorithm/advantages-disadvantages) for reference), I'm going to use [Uniform Manifold Approximation and Projection (UMAP)](https://umap-learn.readthedocs.io/en/latest/) to reduce the number of dimensions to 2.


The function below takes care of steps 1 & 2. A pipeline takes care of the rest of the steps.

In [4]:
def to_customer_item_table(data, column):
    
    customer_item_table = data[['CustomerId', column]] \
    .value_counts(['CustomerId', column]) \
    .reset_index() \
    .pivot(
        index = 'CustomerId',
        columns = column,
        values = 0
    ) \
    .rename_axis(None) \
    .rename_axis(None, axis=1) \
    .fillna(0) \
    .rename(columns= lambda col: column + '_' + col) 
    
    customer_item_table.index.name = 'CustomerId'
    
    return customer_item_table

#### Genre Name

In [5]:
# Step 1 & 2
customer_genre_table = to_customer_item_table(invoice_lines_df, 'GenreName')

# Step 3, 4 & 5
genre_pipe = make_pipeline(PowerTransformer(), StandardScaler(), UMAP(n_components=2, random_state=42))
genre_features = genre_pipe.fit_transform(customer_genre_table)

customer_genre_table.head()

Unnamed: 0_level_0,GenreName_Alternative,GenreName_Alternative & Punk,GenreName_Blues,GenreName_Bossa Nova,GenreName_Classical,GenreName_Comedy,GenreName_Drama,GenreName_Easy Listening,GenreName_Electronica/Dance,GenreName_Heavy Metal,...,GenreName_Pop,GenreName_R&B/Soul,GenreName_Reggae,GenreName_Rock,GenreName_Rock And Roll,GenreName_Sci Fi & Fantasy,GenreName_Science Fiction,GenreName_Soundtrack,GenreName_TV Shows,GenreName_World
CustomerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,3.0,14.0,0.0,2.0,0.0,2.0,0.0,0.0
2,0.0,2.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,17.0,0.0,0.0,0.0,3.0,0.0,0.0
3,0.0,4.0,0.0,4.0,4.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,4.0,2.0,0.0,0.0,0.0,1.0,0.0
4,4.0,2.0,0.0,0.0,5.0,0.0,2.0,0.0,2.0,0.0,...,0.0,0.0,0.0,17.0,0.0,0.0,0.0,0.0,0.0,1.0
5,0.0,4.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,4.0,0.0,0.0,15.0,0.0,0.0,0.0,0.0,2.0,0.0


First 10 rows of the clustering ready 'GenreName' data.

In [6]:
print(genre_features[:10])

[[13.044911   3.6601636]
 [13.011441   2.765037 ]
 [15.623071   0.6658808]
 [14.938987   3.08864  ]
 [15.064433   0.9512974]
 [15.852441   2.0758345]
 [14.083157   1.0859814]
 [14.003555   2.5418613]
 [13.279124   2.4457643]
 [13.550403   2.8698995]]


#### Artist Name

In [7]:
# Step 1 & 2
customer_artist_table = to_customer_item_table(invoice_lines_df, 'ArtistName')

# Step 3, 4 & 5
artist_pipe = make_pipeline(PowerTransformer(), StandardScaler(), UMAP(n_components=3, random_state=42))
artist_features = artist_pipe.fit_transform(customer_artist_table)

customer_artist_table.head()

Unnamed: 0_level_0,ArtistName_AC/DC,ArtistName_Academy of St. Martin in the Fields & Sir Neville Marriner,"ArtistName_Academy of St. Martin in the Fields, John Birch, Sir Neville Marriner & Sylvia McNair","ArtistName_Academy of St. Martin in the Fields, Sir Neville Marriner & Thurston Dart",ArtistName_Accept,ArtistName_Adrian Leaper & Doreen de Feis,ArtistName_Aerosmith,ArtistName_Alanis Morissette,ArtistName_Alice In Chains,ArtistName_Amy Winehouse,...,ArtistName_Toquinho & Vinícius,ArtistName_U2,ArtistName_UB40,ArtistName_Van Halen,ArtistName_Various Artists,ArtistName_Velvet Revolver,ArtistName_Vinícius De Moraes,ArtistName_Yehudi Menuhin,ArtistName_Yo-Yo Ma,ArtistName_Zeca Pagodinho
CustomerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.0,0.0,4.0,0.0,2.0,1.0,0.0,0.0,2.0


First 10 rows of the clustering ready 'ArtistName' data.

In [8]:
print(artist_features[:10])

[[12.814907    2.0431075   1.6870314 ]
 [12.104961    2.0627768   1.7475295 ]
 [11.652204    3.268887    3.799478  ]
 [11.668995    0.76642877  3.5991812 ]
 [12.5094595   0.99819195  2.247469  ]
 [12.21816     1.1085975   3.6892366 ]
 [12.1940975   1.2894894   3.0009885 ]
 [12.489316    2.464878    3.8191035 ]
 [11.2674885   0.4835746   3.6181312 ]
 [11.646283    1.8149014   1.708857  ]]


### Numeric Variable - 'Milliseconds'

- Step 1: For each customer calculate the minimum, the 25% quantile, the 50% quantile (median), the 75% quantile, and the maximum.
- Step 2: Transform these quantiles into a customer - qunatile matrix.
- Step 3: I'm going to transform these features so they follow an approximate normal distribution
- Step 4: I'm going to normalize (centre and scale) these features. 

There is no need for dimension reduction as there are relatively few dimensions.

#### Milliseconds

The code below performs steps 1 & 2. A pipeline takes care of steps 3 & 4. 

In [9]:
customer_milliseconds_quantile = invoice_lines_df[['CustomerId','Milliseconds']] \
    .groupby('CustomerId') \
    .quantile(q=[0.1, 0.25, 0.5, 0.75, 0.9]) \
    .reset_index() \
    .pivot(
        index = 'CustomerId',
        columns = 'level_1',
        values = 'Milliseconds'
    ) \
    .reset_index() \
    .set_axis(
        ['CustomerId', '0.0', '0.25', '0.5', '0.75', '1'],
        axis=1
    ) \
    .set_index('CustomerId') \
    .rename(columns= lambda col: 'MillisecondsQuantile_' + col)
    
customer_milliseconds_quantile.head()

Unnamed: 0_level_0,MillisecondsQuantile_0.0,MillisecondsQuantile_0.25,MillisecondsQuantile_0.5,MillisecondsQuantile_0.75,MillisecondsQuantile_1
CustomerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,171908.1,196342.5,231960.0,282038.0,384544.3
2,146650.7,208404.25,233534.0,277648.25,368072.9
3,148492.6,201377.25,256221.5,380734.25,498658.0
4,208368.2,233018.25,272554.0,360554.25,529030.2
5,184267.5,205720.5,270833.5,329567.0,618323.2


First 10 rows of the clustering ready 'Milliseconds' data.

In [10]:
milliseconds_pipe = make_pipeline(PowerTransformer(method='box-cox'), StandardScaler())
milliseconds_features = milliseconds_pipe.fit_transform(customer_milliseconds_quantile)

print(milliseconds_features[:10])

[[ 0.11111969 -0.83633527 -1.0979374  -1.02137992 -0.46334155]
 [-1.10109171 -0.15428685 -1.0307586  -1.02137992 -0.64331936]
 [-1.00687014 -0.55529934 -0.06328289  1.21052435  0.3706499 ]
 [ 1.61575366  1.3297512   0.63227949  0.7641435   0.51466785]
 [ 0.6494586  -0.30864279  0.55904178  0.31776264  0.83248907]
 [-0.92946309  1.44075602  2.05219438  2.54966691  1.79904734]
 [ 1.04425042  0.66102843  0.8988359   1.21052435  1.79373918]
 [ 0.64150063  0.25298413  0.21603122 -0.12861821 -0.91099878]
 [ 0.10692681 -0.45153531 -0.60456043 -1.02137992 -0.40927931]
 [ 0.75805622  0.63453757 -0.39617841 -0.57499907 -0.16204017]]


## Saving Data

### Writing to CSV

I'll save the customer - item tables as csv files.

In [11]:
customer_genre_table.to_csv('./00_Data/genre_clustering_data.csv')

customer_artist_table.to_csv('./00_Data/artist_clustering_data.csv')

customer_milliseconds_quantile.to_csv('./00_Data/milliseconds_clustering_data.csv')

### Writing to Pickle

I'll save the cluster ready numpy arrays as pickle files.

In [12]:
pd.to_pickle(milliseconds_features, './00_Data/processed_milliseconds.py')

pd.to_pickle(genre_features, './00_Data/processed_genre.py')

pd.to_pickle(artist_features, './00_Data/processed_artist.py')