

# Predicting Customer Satisfaction on Rent the Runway

##  III. Feature Importance Selection

### Katrin Ayrapetov


<font style="font-size: 2rem; color: blue">


 
</font>

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import AdaBoostClassifier
import warnings
warnings.filterwarnings("ignore")

In [2]:
import itertools
from sklearn import model_selection
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler

In [3]:
df =  pd.read_csv('../Data/df_clean.csv')

In [4]:
df.drop(columns=["Retail_price"],inplace=True)

In [5]:
# For Train set
lencoders = {}
for col in df.select_dtypes(include=['object']).columns:
    lencoders[col] = LabelEncoder()
    df[col] = lencoders[col].fit_transform(df[col])

### Select top ten features using the Chi Square 

In [6]:
r_scaler = preprocessing.MinMaxScaler()
r_scaler.fit(df)
modified_data = pd.DataFrame(r_scaler.transform(df), columns=df.columns)


In [7]:
X = modified_data.loc[:,modified_data.columns!='Rating']
y = modified_data[['Rating']]


In [8]:
y=y.astype('int')

In [9]:
selector = SelectKBest(chi2, k=10)

In [10]:
selector.fit(X, y)

SelectKBest(score_func=<function chi2 at 0x0000023F55B39940>)

In [11]:
X_new = selector.transform(X)
print(X.columns[selector.get_support(indices=True)])

Index(['Type_of_Customer', 'Size', 'Overall_fit', 'Age', 'Weight', 'Date',
       'Brand', 'Number_of_reviews', 'BMI', 'Neckline'],
      dtype='object')


In [12]:
imp_feat_chi_squared = list(X.columns[selector.get_support(indices=True)])

In [17]:
print("Important features using Chi Square")
for i in range(10):
    print(f"Feature {i + 1} is {imp_feat_chi_squared[i]}")

Important features using Chi Square
Feature 1 is Type_of_Customer
Feature 2 is Size
Feature 3 is Overall_fit
Feature 4 is Age
Feature 5 is Weight
Feature 6 is Date
Feature 7 is Brand
Feature 8 is Number_of_reviews
Feature 9 is BMI
Feature 10 is Neckline


### Select top ten features using ADA Boost 

In [25]:
df.columns

Index(['Type_of_Customer', 'Size', 'Overall_fit', 'Rented_for',
       'Size_usually_worn', 'Height', 'Age', 'Bust_size', 'Body_type',
       'Weight', 'Rating', 'Date', 'Brand', 'Rent_price', 'Number_of_reviews',
       'BMI', 'Sleeves', 'Neckline', 'Dress_Style'],
      dtype='object')

In [18]:
features = ['Type_of_Customer', 'Size', 'Overall_fit', 'Rented_for', 'Size_usually_worn',
            'Height', 'Age',"Weight", 'Bust_size', 'Body_type',  'Date','Rent_price', 'Number_of_reviews', 'Sleeves', 'Neckline', 'Dress_Style','BMI','Brand']
target = ['Rating']


In [20]:
imp_feat_ADA_boost = []

In [21]:
for i in range(10):
    X = df[features]
    y=df[target]
    from sklearn.ensemble import AdaBoostClassifier
    model = AdaBoostClassifier(n_estimators=1)
    model.fit(X,y)
    important_feature = X.columns[model.feature_importances_.argmax()]
    print(f"Feature {i + 1} is {important_feature}")
    features.remove(important_feature)
    imp_feat_ADA_boost.append(important_feature)

Feature 1 is Size
Feature 2 is Overall_fit
Feature 3 is Number_of_reviews
Feature 4 is Type_of_Customer
Feature 5 is Rented_for
Feature 6 is Age
Feature 7 is Dress_Style
Feature 8 is Neckline
Feature 9 is Weight
Feature 10 is BMI


In [23]:
#Join Two lists together and take out repeats. 
all_important_features = imp_feat_chi_squared + imp_feat_ADA_boost

In [24]:
all_important_features = list(set(all_important_features))

In [27]:
print("Important features:")
for i in range(len(all_important_features)):
    print(f"Feature {i + 1} is {all_important_features[i]}")

Important features:
Feature 1 is Dress_Style
Feature 2 is Size
Feature 3 is Age
Feature 4 is Overall_fit
Feature 5 is Type_of_Customer
Feature 6 is Brand
Feature 7 is BMI
Feature 8 is Rented_for
Feature 9 is Weight
Feature 10 is Neckline
Feature 11 is Date
Feature 12 is Number_of_reviews
