In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/predict-online-dating-matches-dataset/Online_Dating_Behavior_Dataset.csv


In [2]:
import pandas as pd 
import numpy as np 
from sklearn.metrics import accuracy_score, mean_squared_error, confusion_matrix, classification_report 


In [3]:
# Read File
date_aset = pd.read_csv('/kaggle/input/predict-online-dating-matches-dataset/Online_Dating_Behavior_Dataset.csv')

In [4]:
date_aset.describe()

Unnamed: 0,Gender,PurchasedVIP,Income,Children,Age,Attractiveness,Matches
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,0.51,0.474,50988.447,0.978,34.616,5.624,76.05
std,0.50015,0.499573,9889.336141,0.997251,9.147799,2.824288,52.71315
min,0.0,0.0,20786.0,0.0,18.0,1.0,0.0
25%,0.0,0.0,44358.5,0.0,27.0,3.0,0.0
50%,1.0,0.0,50841.5,1.0,35.0,6.0,70.0
75%,1.0,1.0,57396.0,2.0,43.0,8.0,120.0
max,1.0,1.0,81931.0,3.0,49.0,10.0,160.0


*From the description, it seems that all of the column values all have numerical values, therefore no encoding is needed. The standard deviation for the income seems a bit high so we need to use some standard scaler feature scaling technique and a min-max scaling for Matches which can help the model to learn better.*

In [5]:
date_aset["Matches"].unique()

array([ 70, 130,   0, 110, 140, 120, 150, 160, 100,  90,  80])

*We'll have to use a regression model for this one*

In [6]:
# Check for any columns with null type that needs to be imputed
date_aset.isnull().any()

Gender            False
PurchasedVIP      False
Income            False
Children          False
Age               False
Attractiveness    False
Matches           False
dtype: bool

In [7]:
# Check for any duplicates 
duplicates = date_aset.duplicated()
duplicated_rows = date_aset[duplicates]
duplicates.sum()

0

# Create dataset

In [8]:
# Define features and target variables 
features = ["Gender", "PurchasedVIP", "Income", "Children", "Age", "Attractiveness"]
target = ["Matches"]

In [9]:
# Dataset
X = date_aset[features]
y = date_aset[target]
labels = date_aset.columns.to_numpy() # Save labels for later 

# Splitting Dataset

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
RANDOM_STATE = 1

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = RANDOM_STATE)


In [13]:
print(f"{X_train.shape} {y_train.shape}")
print(f"{X_test.shape} {y_test.shape}")

(600, 6) (600, 1)
(400, 6) (400, 1)


# Feature Scaling

In [14]:
from sklearn.preprocessing import StandardScaler

In [15]:
scaler = StandardScaler() 
scaler.fit(X_train)
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

In [16]:
X_train_scaled = pd.DataFrame(X_train_scaled, columns = labels[:-1])
X_train_scaled.describe()

Unnamed: 0,Gender,PurchasedVIP,Income,Children,Age,Attractiveness
count,600.0,600.0,600.0,600.0,600.0,600.0
mean,1.051011e-16,-9.769963000000001e-17,3.212245e-16,-2.664535e-17,-6.513308000000001e-17,-1.480297e-18
std,1.000834,1.000834,1.000834,1.000834,1.000834,1.000834
min,-1.027032,-0.913708,-2.966589,-1.013525,-1.858921,-1.598354
25%,-1.027032,-0.913708,-0.6461503,-1.013525,-0.8609915,-0.9029137
50%,0.9736796,-0.913708,-0.01205916,-0.003367194,0.02605705,0.1402472
75%,0.9736796,1.094442,0.6046105,1.006791,0.9131056,0.8356878
max,0.9736796,1.094442,2.533881,2.016949,1.578392,1.531128


In [17]:
X_test_scaled = pd.DataFrame(X_test_scaled, columns = labels[:-1])
X_test_scaled.describe()

Unnamed: 0,Gender,PurchasedVIP,Income,Children,Age,Attractiveness
count,400.0,400.0,400.0,400.0,400.0,400.0
mean,1.1102230000000002e-17,1.7763570000000002e-17,-1.332268e-16,3.5527140000000005e-17,1.953993e-16,-4.440892e-18
std,1.001252,1.001252,1.001252,1.001252,1.001252,1.001252
min,-1.010051,-1.005013,-3.162272,-0.9346874,-1.758386,-1.701968
25%,-1.010051,-1.005013,-0.7241434,-0.9346874,-0.7929766,-0.9722927
50%,0.9900495,0.9950124,0.005557212,0.0596609,-0.04210258,0.1222207
75%,0.9900495,0.9950124,0.6837093,1.054009,0.8160391,0.8518962
max,0.9900495,0.9950124,3.125875,2.048357,1.566913,1.581572


*We now see our standard deviation for our datasets are 1* 

# Creating and Training the Model

In [18]:
from sklearn.ensemble import RandomForestRegressor

In [19]:
param_grid = {
    "n_estimators": [50, 100, 200], 
    "max_depth": [10, 25, 50, 75],
    "min_samples_split": [2, 10, 50], 
    "min_samples_leaf": [1, 2, 4, 8]
}

In [20]:
# Hyperparamater Tuning 
from sklearn.model_selection import GridSearchCV 

grid_search = GridSearchCV(
    estimator=RandomForestRegressor(),  
    param_grid=param_grid,              
    cv=3,                               # Number of cross-validation folds
    scoring='neg_mean_squared_error',   # Scoring method
    n_jobs=-1,                          # Use all available cores
    verbose=3                           # Verbosity level
)

In [21]:
grid_search.fit(X_train_scaled, y_train.values.ravel())

Fitting 3 folds for each of 144 candidates, totalling 432 fits
[CV 1/3] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=-0.000 total time=   0.3s
[CV 2/3] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=-0.000 total time=   0.8s
[CV 2/3] END max_depth=10, min_samples_leaf=1, min_samples_split=10, n_estimators=100;, score=-0.000 total time=   0.4s
[CV 1/3] END max_depth=10, min_samples_leaf=1, min_samples_split=10, n_estimators=200;, score=-0.000 total time=   0.8s
[CV 3/3] END max_depth=10, min_samples_leaf=1, min_samples_split=50, n_estimators=50;, score=-18.583 total time=   0.3s
[CV 3/3] END max_depth=10, min_samples_leaf=1, min_samples_split=50, n_estimators=100;, score=-18.782 total time=   0.4s
[CV 3/3] END max_depth=10, min_samples_leaf=1, min_samples_split=50, n_estimators=200;, score=-17.231 total time=   0.7s
[CV 2/3] END max_depth=10, min_samples_leaf=2, min_samples_split=2, n_estimators=100;, score=-0.0

In [22]:
print(f"{-grid_search.best_score_:.6f}")

-0.000000


In [23]:
grid_search.best_params_

{'max_depth': 10,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 50}

# Evaluating the Model

In [24]:
best_params = grid_search.best_params_

In [25]:
best_model = RandomForestRegressor(**best_params)

In [26]:
best_model.fit(X_train_scaled, y_train.values.ravel())

In [27]:
from sklearn.metrics import mean_squared_error, r2_score

# Evaluate on the cross validation set 
y_test_pred = best_model.predict(X_test_scaled.to_numpy())
mse = mean_squared_error(y_test, y_test_pred)
r2 = r2_score(y_test, y_test_pred)

print(f"Mean Squared Error (MSE) on Test Set: {mse:.6f}")
print(f"R-squared Score (R2) on Test Set: {r2:.6f}")

Mean Squared Error (MSE) on Test Set: 0.000000
R-squared Score (R2) on Test Set: 1.000000




In [28]:
# Check for Bias and Variance (Overfitting)
y_train_pred = best_model.predict(X_train_scaled.to_numpy())
mse_train = mean_squared_error(y_train, y_train_pred)
print(f"Training Set MSE: {mse_train:.6f} vs. Test Set MSE: {mse:.6f}")

Training Set MSE: 0.000000 vs. Test Set MSE: 0.000000




*Seems like we have a remarkable seemingly 100% accuracy here. Let's do a cross validation score test across the subsets of our data* 

# Cross Validation

In [29]:
from sklearn.model_selection import cross_val_score

In [30]:
# Perform cross-validation
cv_scores = cross_val_score(best_model, X, y.values.ravel(), cv=5, scoring='neg_mean_squared_error')

# Convert negative MSE to positive
cv_scores = -cv_scores

print(f"Cross-validated MSE scores: {cv_scores}")
print(f"Mean cross-validated MSE: {cv_scores.mean():.6f}")
print(f"Standard deviation of cross-validated MSE: {cv_scores.std():.6f}")

Cross-validated MSE scores: [0. 0. 0. 0. 0.]
Mean cross-validated MSE: 0.000000
Standard deviation of cross-validated MSE: 0.000000
