# Homework - Principal Components Analysis


## Libraries

In [722]:
import pandas as pd
import numpy as np
import scipy
import sklearn
import matplotlib
import yaml

## 0. Preparation (Setting the Random State):

In [723]:
# Two candidate seed values (presumably the authors' ID numbers -- TODO confirm);
# the smaller of the two is used as the notebook-wide random state.
pellegrino, mungaicoppolino = 331438, 246120
rs = min(pellegrino, mungaicoppolino)

# Seed NumPy's legacy global generator and keep a snapshot of the freshly
# seeded state in `initial_random_state`.
np.random.seed(rs)
initial_random_state = np.random.get_state()

# Re-applying the snapshot immediately after taking it leaves the generator
# in the same state it was already in.
np.random.set_state(initial_random_state)

## 1. Loading and Preparing the Data


### 1.1 Storing the csv file in a DataFrame Variable

We store in the variable df_tot the dataframe obtained from the csv file.

In [724]:
# Read the tab-separated customers file into the full DataFrame `df_tot`.
df_tot = pd.read_csv(
    "cla4lsp_customers.csv",
    sep="\t",
)

# Preview the first rows as the cell output.
df_tot.head()


Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response
0,5524,1957,Graduation,Single,58138.0,0,0,04-09-2012,58,635,...,7,0,0,0,0,0,0,3,11,1
1,2174,1954,Graduation,Single,46344.0,1,1,08-03-2014,38,11,...,5,0,0,0,0,0,0,3,11,0
2,4141,1965,Graduation,Together,71613.0,0,0,21-08-2013,26,426,...,4,0,0,0,0,0,0,3,11,0
3,6182,1984,Graduation,Together,26646.0,1,0,10-02-2014,26,11,...,6,0,0,0,0,0,0,3,11,0
4,5324,1981,PhD,Married,58293.0,1,0,19-01-2014,94,173,...,5,0,0,0,0,0,0,3,11,0


### 1.2 Sub-DFs Creation

Creation of the sub-DataFrame *workdf*: 2/3 of the rows are randomly sampled from the original DataFrame df_tot

In [725]:
# Randomly keep two thirds of the rows of df_tot; the draw is made reproducible
# by passing the notebook seed `rs` as random_state.
workdf = df_tot.sample(
    frac=2 / 3,
    random_state=rs,
)

# Preview the sampled rows (the original row index is preserved).
workdf.head()

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response
762,6406,1988,Master,Married,78285.0,0,0,28-10-2013,13,647,...,3,0,0,0,0,0,0,3,11,0
2206,1092,1980,Graduation,Married,61014.0,0,1,11-08-2012,17,269,...,7,0,0,0,0,0,0,3,11,0
419,8581,1971,Master,Married,49505.0,1,1,05-03-2013,4,604,...,8,0,0,0,0,0,0,3,11,0
1370,5948,1975,Graduation,Single,57338.0,0,1,29-04-2014,96,143,...,5,0,0,0,0,0,0,3,11,0
1775,8910,1955,Graduation,Together,42586.0,1,1,29-10-2012,7,194,...,8,0,0,0,0,0,0,3,11,1


### 1.3 Labels and Features

Discarding the ID, Z_CostContact and Z_Revenue columns

In [726]:
# Columns treated as labels (9 names).
labels = [
    "NumDealsPurchases",
    "AcceptedCmp1",
    "AcceptedCmp2",
    "AcceptedCmp3",
    "AcceptedCmp4",
    "AcceptedCmp5",
    "Response",
    "Complain",
    "Recency",
]

# Columns treated as features (17 names).
features = [
    # customer attributes
    "Year_Birth",
    "Education",
    "Marital_Status",
    "Income",
    "Kidhome",
    "Teenhome",
    "Dt_Customer",
    # amounts spent per product category
    "MntWines",
    "MntFruits",
    "MntMeatProducts",
    "MntFishProducts",
    "MntSweetProducts",
    "MntGoldProds",
    # purchase / visit counters
    "NumWebPurchases",
    "NumCatalogPurchases",
    "NumStorePurchases",
    "NumWebVisitsMonth",
]
# Keep only the label and feature columns, labels first; every column not
# listed in either list is discarded from workdf.
workdf = workdf.loc[:, [*labels, *features]]

# Preview the reduced frame.
workdf.head()


Unnamed: 0,NumDealsPurchases,AcceptedCmp1,AcceptedCmp2,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,Response,Complain,Recency,Year_Birth,...,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth
762,1,0,0,0,0,0,0,0,13,1988,...,647,107,391,175,67,40,6,4,10,3
2206,4,0,0,0,0,0,0,0,17,1980,...,269,129,495,182,43,29,9,3,4,7
419,9,0,0,0,0,0,0,0,4,1971,...,604,0,100,19,0,28,10,2,8,8
1370,2,0,0,0,0,0,0,0,96,1975,...,143,6,52,11,8,17,4,1,5,5
1775,5,0,0,0,0,0,1,0,7,1955,...,194,2,56,0,0,0,4,1,6,8


### 1.4 Randomly removing one feature from workdf

In [727]:
# Candidate columns for the random removal: the spending amounts and the
# purchase counters.
purchase_features = [
    "MntWines",
    "MntFruits",
    "MntMeatProducts",
    "MntFishProducts",
    "MntSweetProducts",
    "MntGoldProds",
    "NumWebPurchases",
    "NumCatalogPurchases",
    "NumStorePurchases",
]

# Pick one candidate column using NumPy's global random generator.
# NOTE(review): this cell mutates `workdf` and `features` and advances the
# global generator, so re-running it on its own removes a further column --
# run the notebook top to bottom from a fresh kernel.
removed_feature = np.random.choice(purchase_features)

# Remove the chosen column from the data, then from the feature-name list.
workdf = workdf.drop(
    columns=removed_feature,
)
features.remove(removed_feature)

print(f"Feature removed: {removed_feature}\nFirst elements of the workdf Dataframes:")
display(workdf.head())



Feature removed: NumStorePurchases
First elements of the workdf Dataframes:


Unnamed: 0,NumDealsPurchases,AcceptedCmp1,AcceptedCmp2,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,Response,Complain,Recency,Year_Birth,...,Dt_Customer,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumWebPurchases,NumCatalogPurchases,NumWebVisitsMonth
762,1,0,0,0,0,0,0,0,13,1988,...,28-10-2013,647,107,391,175,67,40,6,4,3
2206,4,0,0,0,0,0,0,0,17,1980,...,11-08-2012,269,129,495,182,43,29,9,3,7
419,9,0,0,0,0,0,0,0,4,1971,...,05-03-2013,604,0,100,19,0,28,10,2,8
1370,2,0,0,0,0,0,0,0,96,1975,...,29-04-2014,143,6,52,11,8,17,4,1,5
1775,5,0,0,0,0,0,1,0,7,1955,...,29-10-2012,194,2,56,0,0,0,4,1,8


### 1.5 Clean the dataset workdf

NaN values were found in the *'Income'* column. For the sake of simplicity, rows with a NaN value in that column are removed.

In [728]:
# Collect and show the rows that have a missing value in at least one
# feature column.
tmp_df = workdf.loc[:, features]
df_nan = tmp_df[tmp_df.isna().any(axis=1)]
print("Dataframes where there are NaN values (df_nan):\n")
display(df_nan)

# Drop exactly the rows reported in df_nan. Restricting dropna to the feature
# columns keeps the removal consistent with the report above: a bare dropna()
# would also discard rows whose only missing value is in a label column,
# and those rows never appear in df_nan.
workdf = workdf.dropna(subset=features)
print("workdf without df_nan:\n")
display(workdf)

Dataframes where there are NaN values (df_nan):



Unnamed: 0,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumWebPurchases,NumCatalogPurchases,NumWebVisitsMonth
58,1982,Graduation,Single,,1,0,17-06-2013,11,3,22,2,2,6,2,0,6
1382,1958,Graduation,Together,,1,1,03-09-2012,19,4,12,2,2,6,1,0,5
71,1973,2n Cycle,Married,,1,0,14-09-2012,25,3,43,17,4,17,3,0,8
90,1957,PhD,Married,,2,1,19-11-2012,230,42,192,49,37,53,7,2,9
133,1963,Graduation,Married,,0,1,11-08-2013,231,65,196,38,71,124,6,5,4
1386,1972,PhD,Together,,1,0,02-03-2014,25,1,13,0,0,3,1,0,7
312,1989,Graduation,Married,,0,0,03-06-2013,861,138,461,60,30,61,6,5,3
2061,1981,PhD,Single,,1,0,31-05-2013,23,0,15,0,2,7,3,0,6
10,1983,Graduation,Married,,1,0,15-11-2013,5,5,6,0,2,1,1,0,7
48,1951,Graduation,Single,,2,1,01-01-2014,48,5,48,6,10,7,2,1,6


workdf without df_nan:



Unnamed: 0,NumDealsPurchases,AcceptedCmp1,AcceptedCmp2,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,Response,Complain,Recency,Year_Birth,...,Dt_Customer,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumWebPurchases,NumCatalogPurchases,NumWebVisitsMonth
762,1,0,0,0,0,0,0,0,13,1988,...,28-10-2013,647,107,391,175,67,40,6,4,3
2206,4,0,0,0,0,0,0,0,17,1980,...,11-08-2012,269,129,495,182,43,29,9,3,7
419,9,0,0,0,0,0,0,0,4,1971,...,05-03-2013,604,0,100,19,0,28,10,2,8
1370,2,0,0,0,0,0,0,0,96,1975,...,29-04-2014,143,6,52,11,8,17,4,1,5
1775,5,0,0,0,0,0,1,0,7,1955,...,29-10-2012,194,2,56,0,0,0,4,1,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1732,1,0,0,1,0,0,0,0,90,1955,...,27-04-2013,18,2,10,0,0,25,1,1,7
358,1,0,0,0,0,0,0,1,65,1943,...,20-08-2013,629,17,177,69,0,8,5,3,2
1118,1,0,0,0,0,0,0,0,47,1966,...,16-01-2013,2,23,11,8,6,46,2,1,8
1520,1,0,0,0,0,0,1,0,73,1983,...,03-06-2013,910,111,724,49,74,55,10,2,3


## 2. Exercise 2 (Encoding of Categorical Data)

In [729]:
# Show every column of the second row (position 1) to inspect the value types.
workdf.iloc[1]

NumDealsPurchases               4
AcceptedCmp1                    0
AcceptedCmp2                    0
AcceptedCmp3                    0
AcceptedCmp4                    0
AcceptedCmp5                    0
Response                        0
Complain                        0
Recency                        17
Year_Birth                   1980
Education              Graduation
Marital_Status            Married
Income                    61014.0
Kidhome                         0
Teenhome                        1
Dt_Customer            11-08-2012
MntWines                      269
MntFruits                     129
MntMeatProducts               495
MntFishProducts               182
MntSweetProducts               43
MntGoldProds                   29
NumWebPurchases                 9
NumCatalogPurchases             3
NumWebVisitsMonth               7
Name: 2206, dtype: object

In [730]:
# Distinct values found in the 'Education' column.
workdf.loc[:, "Education"].unique()

array(['Master', 'Graduation', '2n Cycle', 'PhD', 'Basic'], dtype=object)

In [731]:
# Distinct values found in the 'Marital_Status' column.
workdf.loc[:, "Marital_Status"].unique()

array(['Married', 'Single', 'Together', 'Divorced', 'Alone', 'Widow',
       'Absurd', 'YOLO'], dtype=object)

In [732]:
# Distinct values found in the 'Dt_Customer' column (dates stored as strings).
workdf.loc[:, "Dt_Customer"].unique()

array(['28-10-2013', '11-08-2012', '05-03-2013', '29-04-2014',
       '29-10-2012', '19-06-2014', '05-08-2012', '23-01-2014',
       '11-08-2013', '04-02-2014', '01-08-2013', '15-11-2013',
       '23-03-2014', '10-01-2013', '07-11-2012', '16-02-2014',
       '12-01-2013', '07-05-2014', '02-02-2014', '07-09-2012',
       '08-10-2013', '03-11-2012', '15-05-2013', '16-07-2013',
       '17-01-2013', '08-06-2013', '20-06-2014', '19-09-2013',
       '15-09-2013', '29-03-2013', '12-12-2012', '31-03-2013',
       '28-05-2013', '01-05-2013', '08-04-2014', '30-06-2013',
       '16-04-2014', '28-11-2012', '23-11-2012', '18-09-2012',
       '29-03-2014', '10-03-2013', '24-06-2013', '29-05-2014',
       '06-07-2013', '23-11-2013', '05-10-2012', '28-09-2012',
       '04-10-2013', '02-01-2013', '14-09-2013', '22-11-2013',
       '25-10-2013', '07-04-2014', '13-04-2014', '31-08-2013',
       '01-04-2014', '25-04-2014', '17-04-2013', '16-08-2012',
       '15-01-2014', '01-10-2012', '20-11-2012', '07-11