In [1]:
#Import items

from matplotlib import pyplot as plt
from sklearn.datasets import make_classification
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Raw data file
file_to_load = "user_profiles.csv"

# Read purchasing file and store into pandas data frame
df= pd.read_csv(file_to_load)
len(df)

59946

In [3]:
# Keep columns that will assist on determining body type.

working_df = df.drop(columns=['sign', 'speaks', 'status', 'last_online', 'income', 'location', 'job', 'education', 'orientation', 'religion', 'ethnicity'])
working_df = working_df.drop(columns=['offspring', 'pets', 'Unnamed: 0'])
working_df

Unnamed: 0,age,body_type,diet,drinks,drugs,height,sex,smokes
0,22,a little extra,strictly anything,socially,never,75.0,m,sometimes
1,35,average,mostly other,often,sometimes,70.0,m,no
2,38,thin,anything,socially,,68.0,m,no
3,23,thin,vegetarian,socially,,71.0,m,no
4,29,athletic,,socially,never,66.0,m,no
...,...,...,...,...,...,...,...,...
59941,59,,,socially,never,62.0,f,no
59942,24,fit,mostly anything,often,sometimes,72.0,m,no
59943,42,average,mostly anything,not at all,never,71.0,m,no
59944,27,athletic,mostly anything,socially,often,73.0,m,trying to quit


In [4]:
# Drop NaN values
working_df = working_df.dropna()
working_df

Unnamed: 0,age,body_type,diet,drinks,drugs,height,sex,smokes
0,22,a little extra,strictly anything,socially,never,75.0,m,sometimes
1,35,average,mostly other,often,sometimes,70.0,m,no
7,31,average,mostly anything,socially,never,65.0,f,no
9,37,athletic,mostly anything,not at all,never,65.0,m,no
11,28,average,mostly anything,socially,never,72.0,m,no
...,...,...,...,...,...,...,...,...
59935,33,curvy,anything,socially,never,67.0,f,when drinking
59936,25,average,mostly anything,socially,never,61.0,f,no
59942,24,fit,mostly anything,often,sometimes,72.0,m,no
59943,42,average,mostly anything,not at all,never,71.0,m,no


In [6]:
#See what columns we're working with    

list(working_df.columns)

['age', 'body_type', 'diet', 'drinks', 'drugs', 'height', 'sex', 'smokes']

In [7]:
# Remove inaccurate ages

working_df = working_df[working_df.age != 109]
working_df = working_df[working_df.age != 111]

# Combine data that makes sense
working_df = working_df.replace({'diet':{'strictly anything': 'anything',
        'mostly anything': 'anything',
        'strictly halal': 'halal',
        'mostly halal':'halal',
        'strictly kosher': 'kosher',
        'mostly kosher':'kosher',
        'strictly vegan':'vegan',
        'mostly vegan': 'vegan',
        'strictly vegetarian':'vegetarian',
        'mostly vegetarian':'vegetarian',
        'strictly other': 'other',
        'mostly other': 'other'}})


# Remove values that have declined to answer, since they will be unhelpful        

values = ['declined to answer']        

working_df = working_df[working_df.age.isin(values) == False]
working_df = working_df[working_df.diet.isin(values) == False]
working_df = working_df[working_df.body_type.isin(values) == False]
working_df = working_df[working_df.drinks.isin(values) == False]
working_df = working_df[working_df.drugs.isin(values) == False]
working_df = working_df[working_df.height.isin(values) == False]
working_df = working_df[working_df.sex.isin(values) == False]
working_df = working_df[working_df.smokes.isin(values) == False]
working_df = working_df[working_df.body_type != 'rather not say']

working_df


Unnamed: 0,age,body_type,diet,drinks,drugs,height,sex,smokes
0,22,a little extra,anything,socially,never,75.0,m,sometimes
1,35,average,other,often,sometimes,70.0,m,no
7,31,average,anything,socially,never,65.0,f,no
9,37,athletic,anything,not at all,never,65.0,m,no
11,28,average,anything,socially,never,72.0,m,no
...,...,...,...,...,...,...,...,...
59935,33,curvy,anything,socially,never,67.0,f,when drinking
59936,25,average,anything,socially,never,61.0,f,no
59942,24,fit,anything,often,sometimes,72.0,m,no
59943,42,average,anything,not at all,never,71.0,m,no


In [8]:
# Get unique body type values
working_df['body_type'].value_counts()

average           6802
fit               5742
athletic          5366
thin              2191
curvy             1843
a little extra    1312
skinny             804
full figured       464
overweight         227
jacked             191
used up            182
Name: body_type, dtype: int64

In [9]:
# Clean up body type values
cleaned_BT_df = working_df.replace({'body_type': {'athletic':'fit', 'full figured':'curvy', 'a little extra':'curvy', 'jacked':'fit', 'skinny':'thin'}})
cleaned_BT_df['body_type'].value_counts()

fit           11299
average        6802
curvy          3619
thin           2995
overweight      227
used up         182
Name: body_type, dtype: int64

In [10]:
# Get unique values of diet
cleaned_BT_df['smokes'].value_counts()

no                20349
sometimes          1679
when drinking      1425
yes                1023
trying to quit      648
Name: smokes, dtype: int64

In [11]:
cleaned_BT_df.shape

(25124, 8)

# Testing with Target Data = Body Types

In [12]:
# Separate the dataset into data and target
X = cleaned_BT_df.drop(['body_type'], axis=1)
y = cleaned_BT_df['body_type']

In [13]:
# Do get dummies for data and label encoder for target
X_dummies = pd.get_dummies(X)

from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
y_label = LabelEncoder().fit_transform(cleaned_BT_df['body_type'])
y_label

array([1, 0, 0, ..., 2, 0, 2])

In [14]:
# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_dummies, y_label,test_size=.5)

In [15]:
# Test with Random Forest to see how well it works

RF_clf = RandomForestClassifier(n_estimators=100).fit(X_train, y_train)
print(f'Training Score: {RF_clf.score(X_train, y_train)}')
print(f'Testing Score: {RF_clf.score(X_test, y_test)}')

Training Score: 0.7474924375099506
Testing Score: 0.3879955421111288


With Random Forest, the Training went pretty well, but the resulting testing score was horrible. Unsure if it's because of the data or if it's because of the model.
We'll test it with other models and see how those results pan out.

In [16]:
# Test with Logistic Regression

from sklearn.linear_model import LogisticRegression
LR_clf = LogisticRegression()

LR_clf.fit(X_train, y_train)

print(f"Training Data Score: {LR_clf.score(X_train, y_train)}")
print(f"Testing Data Score: {LR_clf.score(X_test, y_test)}")

Training Data Score: 0.4556599267632543
Testing Data Score: 0.45757045056519663


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


With Logistic Regression, even though there's more than one target data, just to see how it fares, it results in both training and testing being giving horrible results. This is most likely due to the model being better suited to work with target datas with two values (i.e. "Yes" & "No")

For now, we'll try using Neural Network.

In [17]:
# Create scaler instance
import sklearn as skl
import tensorflow as tf

X_scaler = skl.preprocessing.StandardScaler()

# Fit the scaler
X_scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [18]:
# Create the Keras Sequential model
nn_model = tf.keras.models.Sequential()

In [19]:
# Add our first Dense layer, including the input layer
nn_model.add(tf.keras.layers.Dense(units=5, activation="relu", input_dim=X_train_scaled.shape[1]))

In [20]:
# Add the output layer that uses a probability activation function
nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

In [21]:
# Check the structure of the Sequential model
nn_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 5)                 125       
                                                                 
 dense_1 (Dense)             (None, 1)                 6         
                                                                 
Total params: 131
Trainable params: 131
Non-trainable params: 0
_________________________________________________________________


In [22]:
# Compile the Sequential model together and customize metrics
nn_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Fit the model to the training data
fit_model = nn_model.fit(X_train_scaled, y_train, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


So even with the Neural Networking, the score plateau'd and stayed consistently at low accuracy. This is looking more like an issue with the data rather than the models since it's bad with each selected model so far.

To switch things up, instead of body types, I'll switch the target data to be the Sex instead.

In [23]:
# Separate the dataset into data and target
X = cleaned_BT_df.drop(['sex'], axis=1)
y = cleaned_BT_df['sex']

In [24]:
# Do get dummies for data and label encoder for target
X_dummies = pd.get_dummies(X)

from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
y_label = LabelEncoder().fit_transform(cleaned_BT_df['sex'])
y_label

array([1, 1, 0, ..., 1, 1, 1])

In [25]:
# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_dummies, y_label,test_size=.5)

In [26]:
# Test with Random Forest to see how well it works

RF_clf = RandomForestClassifier(n_estimators=100).fit(X_train, y_train)
print(f'Training Score: {RF_clf.score(X_train, y_train)}')
print(f'Testing Score: {RF_clf.score(X_test, y_test)}')

Training Score: 0.9402165260308868
Testing Score: 0.8205699729342462


Surprisingly, the accuracy is great when the models are subjected to pick either Male or Female. Perhaps it's because of it's requiring to pick between two values instead of six? For consistency sake, I'll rerun the other models as well to see if RandomForest's good scores were a fluke.

In [27]:
# Test with Logistic Regression

from sklearn.linear_model import LogisticRegression
LR_clf = LogisticRegression()

LR_clf.fit(X_train, y_train)

print(f"Training Data Score: {LR_clf.score(X_train, y_train)}")
print(f"Testing Data Score: {LR_clf.score(X_test, y_test)}")

Training Data Score: 0.8421429708645121
Testing Data Score: 0.8374462665180704


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [28]:
# Create scaler instance
import sklearn as skl
import tensorflow as tf

X_scaler = skl.preprocessing.StandardScaler()

# Fit the scaler
X_scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [29]:
# Create the Keras Sequential model
nn_model = tf.keras.models.Sequential()

In [30]:
# Add our first Dense layer, including the input layer
nn_model.add(tf.keras.layers.Dense(units=5, activation="relu", input_dim=X_train_scaled.shape[1]))

In [31]:
# Add the output layer that uses a probability activation function
nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

In [32]:
# Check the structure of the Sequential model
nn_model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_2 (Dense)             (None, 5)                 145       
                                                                 
 dense_3 (Dense)             (None, 1)                 6         
                                                                 
Total params: 151
Trainable params: 151
Non-trainable params: 0
_________________________________________________________________


In [33]:
# Compile the Sequential model together and customize metrics
nn_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Fit the model to the training data
fit_model = nn_model.fit(X_train_scaled, y_train, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


With our target data being Sex, we managed to have a higher accuracy score for all three models. It's interesting since this having less target values to predict might be better, in correlation with the data we have. With having more target values, in this case the body types, the models we selected did poorly regarding predicting the correct body types. In the future, we might test with more models to see if the there are any that are more compatible with our data, or if we can clean this to be more concise so we can introduce more variables that may help with future predictions.