0. Preparation (Setting the Random State)

In [7]:
import numpy as np
import scipy as sc
import pandas as pd 
import matplotlib.pyplot as plt
import sklearn


# The notebook-wide random state is the smaller of these two numbers.
seed_candidates = (328964, 328830)
rs = min(seed_candidates)

# NumPy's global generator keeps advancing as it is used, so every cell that
# draws random numbers re-seeds with `rs` to stay reproducible on its own.
np.random.seed(rs)


1. Exercise 1 (Loading and Preparing the Data)

In [8]:
# Re-seed so this cell gives the same random draw no matter what ran before it.
np.random.seed(rs)

# Load the full table: tab-separated, with Dt_Customer parsed as day-month-year.
df_tot = pd.read_csv("cla4lsp_customers.csv", sep='\t', parse_dates=['Dt_Customer'], date_format='%d-%m-%Y')
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() would only add a stray "None" line to the output.
df_tot.info()

# Work on a reproducible random two-thirds of the rows.
workdf = df_tot.sample(frac=2/3, random_state=rs)
print(df_tot.shape)
print(workdf.shape)

# Drop the ID, Z_CostContact and Z_Revenue columns (reassignment instead of
# inplace=True, so the statement reads as "new frame from old frame").
workdf = workdf.drop(columns=['ID', 'Z_CostContact', 'Z_Revenue'])
print(workdf.shape)

# Columns treated as labels; every remaining column is a feature.
labels = ['NumDealsPurchases', 'AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5',
          'Response', 'Complain', 'Recency']
features = list(workdf.drop(columns=labels).columns)
print(labels)
print(features)

# Positional slice of the feature list.
# NOTE(review): this depends on the column order of the CSV - confirm that
# positions 7..15 are exactly the intended "habits" columns.
habits = features[7:16]
print(habits)

# Randomly pick one habit column to discard. `removed` is kept as the
# one-element array returned by np.random.choice, so the draw and the printed
# output are unchanged.
removed = np.random.choice(habits, 1)
print(removed)
removed_name = str(removed[0])
workdf = workdf.drop(columns=[removed_name])
# list.remove() needs the plain column name: passing the array itself only
# worked through the truth value of an element-wise comparison array.
features.remove(removed_name)
print(workdf.shape)
print(features)


FileNotFoundError: [Errno 2] No such file or directory: 'cla4lsp_customers.csv'

Handling missing values in the working dataset (only `Income` has any; they are filled with 0)

In [None]:
# Count the missing values in each feature column.
print(workdf[features].isna().sum())

# Assign the filled column back to the frame. The previous form,
# workdf['Income'].fillna(0, inplace=True), is chained in-place assignment:
# it warns on pandas 2.x and, with Copy-on-Write enabled, only modifies a
# temporary Series, leaving the NaNs in workdf.
# NOTE(review): 0 is kept as the fill value to preserve the existing results;
# confirm it is the intended replacement for a missing income.
workdf['Income'] = workdf['Income'].fillna(0)

Year_Birth              0
Education               0
Marital_Status          0
Income                 15
Kidhome                 0
Teenhome                0
Dt_Customer             0
MntWines                0
MntFruits               0
MntMeatProducts         0
MntFishProducts         0
MntGoldProds            0
NumWebPurchases         0
NumCatalogPurchases     0
NumStorePurchases       0
NumWebVisitsMonth       0
dtype: int64


2. Exercise 2 (Encoding of Categorical Data) \
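Only two features are truly categorical: Education and Marital_Status. Since neither has many distinct values, we encode them as dummy variables with the pd.get_dummies() function; drop_first=True is used, so the first level of each feature acts as the implicit baseline and gets no column of its own. Year_Birth and Dt_Customer are additionally mapped to integer codes with LabelEncoder.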

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder


# enc = OneHotEncoder()
# transformed = enc.fit_transform(workdf[['Education', 'Marital_Status']])
# print(transformed)
# ohe_df = pd.DataFrame(transformed, columns=[['Education', 'Marital_Status']])
# Xworkdf = pd.concat([workdf, ohe_df], axis=1).drop(['Education', 'Marital_Status'], axis=1)

# print(workdf[features].iloc[:5])
# Dummy-encode the two categorical features; drop_first=True omits the first
# level of each one.
Xworkdf = pd.get_dummies(
    workdf[features],
    columns=['Education', 'Marital_Status'],
    drop_first=True,
)

# Names of the columns created by the encoding, i.e. those absent from workdf.
new_cols = [col for col in Xworkdf.columns if col not in workdf.columns]
print(new_cols)

# Keep the original year and date columns before they are overwritten.
years, dates = Xworkdf['Year_Birth'], Xworkdf['Dt_Customer']

# One encoder is refitted for each column, so after this cell `le` holds the
# classes of Dt_Customer (the last fit).
le = LabelEncoder()
enc_years = le.fit_transform(years)
enc_dates = le.fit_transform(dates)

# Replace both columns with their integer codes; column order is unchanged.
Xworkdf = Xworkdf.assign(Year_Birth=enc_years, Dt_Customer=enc_dates)
# print(Xworkdf.shape)
# print(Xworkdf.iloc[:5])



['Education_Basic', 'Education_Graduation', 'Education_Master', 'Education_PhD', 'Marital_Status_Alone', 'Marital_Status_Divorced', 'Marital_Status_Married', 'Marital_Status_Single', 'Marital_Status_Together', 'Marital_Status_Widow', 'Marital_Status_YOLO']


3. Exercise 3 (Preprocessing and full-PCA)

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
# Two alternative rescalings of the encoded feature matrix are prepared here:
# one with StandardScaler and one with MinMaxScaler.
std, mm = StandardScaler(), MinMaxScaler()

# Column grouping kept for reference (currently commented out, so every column
# of Xworkdf is scaled in the same way below):
# print(features)
# numerical_features = ['Income', 'Kidhome', 'Teenhome', 'MntWines', 'MntFruits','MntMeatProducts', 'MntFishProducts',
#                        'MntGoldProds', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth']
# categorical_features = ['Year_Birth', 'Dt_Customer']
# categorical_features.extend(new_cols)
# print(categorical_features)

# Standard-scaled copy, with the row index and column names of Xworkdf restored.
std_features = std.fit_transform(Xworkdf.to_numpy())
Xworkdf_std = pd.DataFrame(data=std_features, columns=Xworkdf.columns, index=Xworkdf.index)
# print(Xworkdf_std.iloc[:5])

# Min-max-scaled copy, built the same way.
mm_features = mm.fit_transform(Xworkdf.to_numpy())
Xworkdf_mm = pd.DataFrame(data=mm_features, columns=Xworkdf.columns, index=Xworkdf.index)

# Show the first five rows of the min-max-scaled frame.
print(Xworkdf_mm.head())




In [None]:
# NOTE(review): this cell only prints the literal 5 and nothing else in view
# uses it - it looks like a leftover scratch cell; confirm and delete.
print(5)

5


4. Exercise 4 (Dimensionality Reduction and Interpretation of the PCs)