In [1]:
#Import items

from matplotlib import pyplot as plt
from sklearn.datasets import make_classification
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [3]:
# Raw data file
file_to_load = "user_profiles.csv"

# Read purchasing file and store into pandas data frame
df= pd.read_csv(file_to_load)
len(df)

59946

In [7]:
# Keep columns that will assist on determining body type.

working_df = df.drop(columns=['sign', 'speaks', 'status', 'last_online', 'income', 'location', 'job', 'education', 'orientation', 'religion', 'ethnicity'])
working_df = working_df.drop(columns=['offspring', 'pets', 'Unnamed: 0'])
working_df

Unnamed: 0,age,body_type,diet,drinks,drugs,height,sex,smokes
0,22,a little extra,strictly anything,socially,never,75.0,m,sometimes
1,35,average,mostly other,often,sometimes,70.0,m,no
2,38,thin,anything,socially,,68.0,m,no
3,23,thin,vegetarian,socially,,71.0,m,no
4,29,athletic,,socially,never,66.0,m,no
...,...,...,...,...,...,...,...,...
59941,59,,,socially,never,62.0,f,no
59942,24,fit,mostly anything,often,sometimes,72.0,m,no
59943,42,average,mostly anything,not at all,never,71.0,m,no
59944,27,athletic,mostly anything,socially,often,73.0,m,trying to quit


In [8]:
# Drop NaN values
working_df = working_df.dropna()
working_df

Unnamed: 0,age,body_type,diet,drinks,drugs,height,sex,smokes
0,22,a little extra,strictly anything,socially,never,75.0,m,sometimes
1,35,average,mostly other,often,sometimes,70.0,m,no
7,31,average,mostly anything,socially,never,65.0,f,no
9,37,athletic,mostly anything,not at all,never,65.0,m,no
11,28,average,mostly anything,socially,never,72.0,m,no
...,...,...,...,...,...,...,...,...
59935,33,curvy,anything,socially,never,67.0,f,when drinking
59936,25,average,mostly anything,socially,never,61.0,f,no
59942,24,fit,mostly anything,often,sometimes,72.0,m,no
59943,42,average,mostly anything,not at all,never,71.0,m,no


In [29]:
#working_df = working_df.astype(object).replace(np.nan, 'None')

#Clean up verbage

# working_df = working_df.replace({'doesn&rsquo;t have kids, but might want them': 'does not have kids, but might want them',
#     'doesn&rsquo;t want kids': 'does not want kids',
#     'doesn&rsquo;t have kids, but wants them': 'does not have kids, but wants them',
#     'doesn&rsquo;t have kids': 'does not have kids',
#     'doesn&rsquo;t have kids, and doesn&rsquo;t want any': 'does not have kids, and does not want any',
#     'has kids, but doesn&rsquo;t want more': 'has kids, but does not want more',
#     'has a kid, but doesn&rsquo;t want more': 'has a kid, but does not want more',
#     'None': 'declined to answer'})

In [9]:
#See what columns we're working with    

list(working_df.columns)

['age', 'body_type', 'diet', 'drinks', 'drugs', 'height', 'sex', 'smokes']

In [10]:
# Remove inaccurate ages

working_df = working_df[working_df.age != 109]
working_df = working_df[working_df.age != 111]

# Combine data that makes sense
working_df = working_df.replace({'diet':{'strictly anything': 'anything',
        'mostly anything': 'anything',
        'strictly halal': 'halal',
        'mostly halal':'halal',
        'strictly kosher': 'kosher',
        'mostly kosher':'kosher',
        'strictly vegan':'vegan',
        'mostly vegan': 'vegan',
        'strictly vegetarian':'vegetarian',
        'mostly vegetarian':'vegetarian',
        'strictly other': 'other',
        'mostly other': 'other'}})


# Remove values that have declined to answer, since they will be unhelpful        

values = ['declined to answer']        

working_df = working_df[working_df.age.isin(values) == False]
working_df = working_df[working_df.diet.isin(values) == False]
working_df = working_df[working_df.body_type.isin(values) == False]
working_df = working_df[working_df.drinks.isin(values) == False]
working_df = working_df[working_df.drugs.isin(values) == False]
# working_df = working_df[working_df.ethnicity.isin(values) == False]
working_df = working_df[working_df.height.isin(values) == False]
# working_df = working_df[working_df.offspring.isin(values) == False]
# working_df = working_df[working_df.pets.isin(values) == False]
working_df = working_df[working_df.sex.isin(values) == False]
working_df = working_df[working_df.smokes.isin(values) == False]
working_df = working_df[working_df.body_type != 'rather not say']

working_df


Unnamed: 0,age,body_type,diet,drinks,drugs,height,sex,smokes
0,22,a little extra,anything,socially,never,75.0,m,sometimes
1,35,average,other,often,sometimes,70.0,m,no
7,31,average,anything,socially,never,65.0,f,no
9,37,athletic,anything,not at all,never,65.0,m,no
11,28,average,anything,socially,never,72.0,m,no
...,...,...,...,...,...,...,...,...
59935,33,curvy,anything,socially,never,67.0,f,when drinking
59936,25,average,anything,socially,never,61.0,f,no
59942,24,fit,anything,often,sometimes,72.0,m,no
59943,42,average,anything,not at all,never,71.0,m,no


In [11]:
# Get unique body type values
working_df['body_type'].value_counts()

average           6802
fit               5742
athletic          5366
thin              2191
curvy             1843
a little extra    1312
skinny             804
full figured       464
overweight         227
jacked             191
used up            182
Name: body_type, dtype: int64

In [12]:
# Clean up body type values
cleaned_BT_df = working_df.replace({'body_type': {'athletic':'fit', 'full figured':'curvy', 'a little extra':'curvy', 'jacked':'fit', 'skinny':'thin'}})
cleaned_BT_df['body_type'].value_counts()

fit           11299
average        6802
curvy          3619
thin           2995
overweight      227
used up         182
Name: body_type, dtype: int64

In [13]:
# Get unique values of diet
cleaned_BT_df['smokes'].value_counts()

no                20349
sometimes          1679
when drinking      1425
yes                1023
trying to quit      648
Name: smokes, dtype: int64

In [35]:
# Assign Int Value to Sex
# Male = 1, Female = 0
# def changeStatus(status):
#     if status == "m":
#         return 1
#     else:
#         return 0

# Along with replace() and map(), this is another way to encode the gender column into numbers.
# cleaned_BT_df["sex"] = cleaned_BT_df["sex"].apply(changeStatus)
# cleaned_BT_df.head(10)

In [14]:
cleaned_BT_df.shape

(25124, 8)

In [15]:
# Separate the dataset into data and target
X = cleaned_BT_df.drop(['sex'], axis=1)
y = cleaned_BT_df['sex']

In [16]:
# Do get dummies for data and label encoder for target
X_dummies = pd.get_dummies(X)

from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
y_label = LabelEncoder().fit_transform(cleaned_BT_df['sex'])
y_label

array([1, 1, 0, ..., 1, 1, 1])

In [17]:
# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_dummies, y_label,test_size=.5)

In [18]:
# Test with Random Forest to see how well it works

RF_clf = RandomForestClassifier(n_estimators=100).fit(X_train, y_train)
print(f'Training Score: {RF_clf.score(X_train, y_train)}')
print(f'Testing Score: {RF_clf.score(X_test, y_test)}')

Training Score: 0.9386244228626015
Testing Score: 0.8216844451520459


In [41]:
# x = RF_clf.feature_importances_>0.1
# RF_clf.feature_importances_[x]

# cols = X_dummies.columns[x]

In [42]:
# Split into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X_dummies[cols], y_label)

In [19]:
# Test with Logistic Regression

from sklearn.linear_model import LogisticRegression
LR_clf = LogisticRegression()

LR_clf.fit(X_train, y_train)

print(f"Training Data Score: {LR_clf.score(X_train, y_train)}")
print(f"Testing Data Score: {LR_clf.score(X_test, y_test)}")

Training Data Score: 0.8364113994586849
Testing Data Score: 0.8270179907658016


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [20]:
# Create scaler instance
import sklearn as skl
import tensorflow as tf

X_scaler = skl.preprocessing.StandardScaler()

# Fit the scaler
X_scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [21]:
# Create the Keras Sequential model
nn_model = tf.keras.models.Sequential()

In [22]:
# Add our first Dense layer, including the input layer
nn_model.add(tf.keras.layers.Dense(units=5, activation="relu", input_dim=X_train_scaled.shape[1]))

In [23]:
# Add the output layer that uses a probability activation function
nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

In [24]:
# Check the structure of the Sequential model
nn_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 5)                 145       
                                                                 
 dense_1 (Dense)             (None, 1)                 6         
                                                                 
Total params: 151
Trainable params: 151
Non-trainable params: 0
_________________________________________________________________


In [25]:
# Compile the Sequential model together and customize metrics
nn_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Fit the model to the training data
fit_model = nn_model.fit(X_train_scaled, y_train, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Testing Body_types with Supervised Learning resulted in high training scores for RandomForest but severely low testing scores, possible overfitting. However, trying it with NN
resulted in also consistently low scores as well. Switching it with a target data that consists of two values, in this case it is the sex of the profile, it had an easier time working 
with the code and produced better results. Unsure if this is a result of data vs target or if we just have bad data that is ill-suited for Machine Learning.