In [1]:

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.decomposition import PCA
from scipy.stats import entropy

In [2]:
# Read the Valentine dataset from a CSV file addressed by a relative path,
# then show the resulting frame as the cell output.
DATASET_PATH = "valentine_dataset.csv"
df = pd.read_csv(DATASET_PATH)
df

Unnamed: 0,Name,Age,Gender,Income,Appearance_Score,Interests_Score,Confidence_Score,Educational_Status,Job_Type,Valentine_Date
0,Leon Cruz,26,Male,77648,63.94,88.18,30.09,Bsc,Employed,1
1,Samuel Brown,23,Male,71617,2.50,49.98,76.45,Bsc,Employed,1
2,Lori Baker,34,Male,38616,27.50,33.90,33.76,PhD,Employed,1
3,Tina Fitzgerald,37,Male,79473,22.32,25.47,43.32,Master,Self Employed,1
4,Jason Simmons,38,Male,43510,73.65,43.63,32.44,Bsc,Employed,0
...,...,...,...,...,...,...,...,...,...,...
19995,Michael Vega,26,Female,36355,65.62,45.80,87.23,High School,Employed,0
19996,Debra Horn,21,Female,55355,32.17,19.26,7.88,High School,Employed,1
19997,Aaron Burgess,31,Male,31910,45.02,13.93,39.64,PhD,Self Employed,1
19998,Rebecca Morton,33,Female,22483,73.67,64.29,29.53,Master,Self Employed,1


In [3]:
# Count the missing (NaN/None) entries in every column; `isnull` is an alias of `isna`.
df.isnull().sum()

Name                  0
Age                   0
Gender                0
Income                0
Appearance_Score      0
Interests_Score       0
Confidence_Score      0
Educational_Status    0
Job_Type              0
Valentine_Date        0
dtype: int64

In [4]:
# Count rows that exactly repeat an earlier row; with keep="first" (the default)
# the first occurrence of each row is not flagged as a duplicate.
df.duplicated(keep="first").sum()

0

In [5]:
# Show the dtype pandas inferred for each column of the loaded frame.
df.dtypes

Name                   object
Age                     int64
Gender                 object
Income                  int64
Appearance_Score      float64
Interests_Score       float64
Confidence_Score      float64
Educational_Status     object
Job_Type               object
Valentine_Date          int64
dtype: object

In [6]:
# Tally how many rows carry each distinct value of the "Name" column,
# most frequent first.
df.loc[:, "Name"].value_counts()

Name
Michael Johnson      13
Jennifer Williams     9
Michael Davis         8
Ashley Jones          8
Michael Williams      8
                     ..
Brianna Campbell      1
Michael Cox           1
Michael Cole          1
Jesse Meyer           1
Erik Curry            1
Name: count, Length: 17795, dtype: int64

In [7]:
# Tally how many rows fall into each "Educational_Status" category,
# most frequent first.
df.loc[:, "Educational_Status"].value_counts()

Educational_Status
High School    5107
Bsc            4985
PhD            4955
Master         4953
Name: count, dtype: int64

In [8]:
# Tally how many rows fall into each "Job_Type" category, most frequent first.
df.loc[:, "Job_Type"].value_counts()

Job_Type
Employed         10014
Self Employed     9986
Name: count, dtype: int64

In [9]:
# Tally how many rows fall into each "Gender" category, most frequent first.
df.loc[:, "Gender"].value_counts()

Gender
Male      10047
Female     9953
Name: count, dtype: int64

In [10]:
# Integer codes for each categorical column. Every label the column may contain
# is listed explicitly, so an unexpected value fails loudly instead of being
# silently folded into the 0 class.
# NOTE(review): "Master" (3) is coded above "PhD" (2). If Educational_Status is
# meant to be ordinal, these two codes should probably be swapped; they are kept
# as-is here so the encoded values stay unchanged.
educational_values = {"High School": 0, "Bsc": 1, "PhD": 2, "Master": 3}
gender_values = {"Male": 1, "Female": 0}
job_type_values = {"Employed": 1, "Self Employed": 0}


def encode_categorical(column, codes):
    """Return ``column`` with every label replaced by its integer code.

    Parameters
    ----------
    column : pd.Series
        Column holding the raw labels (or codes, if it was encoded before).
    codes : dict
        Mapping from each allowed label to its integer code.

    Returns
    -------
    pd.Series
        The encoded column. If every value of ``column`` is already one of the
        codes, the column is returned unchanged, which makes re-running this
        cell a no-op instead of corrupting the data.

    Raises
    ------
    ValueError
        If ``column`` contains a value (including NaN) that is not a key of
        ``codes``.
    """
    if column.isin(list(codes.values())).all():
        # Already encoded by an earlier run of this cell: nothing to do.
        return column

    unexpected = column[~column.isin(list(codes))].unique()
    if len(unexpected) > 0:
        raise ValueError(
            f"Unexpected values in column {column.name!r}: "
            f"{sorted(map(str, unexpected))}"
        )
    return column.map(codes)


for column_name, codes in (
    ("Gender", gender_values),
    ("Job_Type", job_type_values),
    ("Educational_Status", educational_values),
):
    df[column_name] = encode_categorical(df[column_name], codes)

In [11]:
# Display the frame again after the Gender, Job_Type and Educational_Status
# columns were overwritten with integer codes in the previous cell.
df

Unnamed: 0,Name,Age,Gender,Income,Appearance_Score,Interests_Score,Confidence_Score,Educational_Status,Job_Type,Valentine_Date
0,Leon Cruz,26,1,77648,63.94,88.18,30.09,1,1,1
1,Samuel Brown,23,1,71617,2.50,49.98,76.45,1,1,1
2,Lori Baker,34,1,38616,27.50,33.90,33.76,2,1,1
3,Tina Fitzgerald,37,1,79473,22.32,25.47,43.32,3,0,1
4,Jason Simmons,38,1,43510,73.65,43.63,32.44,1,1,0
...,...,...,...,...,...,...,...,...,...,...
19995,Michael Vega,26,0,36355,65.62,45.80,87.23,0,1,0
19996,Debra Horn,21,0,55355,32.17,19.26,7.88,0,1,1
19997,Aaron Burgess,31,1,31910,45.02,13.93,39.64,2,0,1
19998,Rebecca Morton,33,0,22483,73.67,64.29,29.53,3,0,1
