# 1) Import libraries & files

In [None]:
#Library
import pandas as pd
import numpy as np
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Lift the column cap so wide DataFrames are rendered with every column
pd.set_option("display.max_columns", None)

In [None]:
# Read the speed-dating survey from the remote CSV file
data = pd.read_csv(
    "https://full-stack-assets.s3.eu-west-3.amazonaws.com/M03-EDA/Speed+Dating+Data.csv",
    encoding='unicode_escape',
)

# Preview the first rows
data.head()

# 2) Functions

In [None]:
def checking_missing_value(df):
    """Print a short report on the missing values of a DataFrame.

    When at least one cell is null, prints "Missing data !" followed by one
    line per incomplete column giving the percentage of non-null rows
    (rounded to one decimal) and the column's position in ``df``.
    Otherwise prints "No missing data !".

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    None
        The report is written to standard output only.
    """
    # Guard clause: nothing to report when no cell is null
    if not df.isnull().values.any():
        print("No missing data !")
        return

    print("Missing data !")
    n_rows = len(df)
    for position, column_name in enumerate(df.columns):
        # Positional access so duplicated column labels are handled one by one
        n_missing = df.iloc[:, position].isnull().sum()
        if n_missing == 0:
            continue
        pct_available = round(((n_rows - n_missing) / n_rows) * 100, 1)
        print(f"{column_name} : {pct_available} % data available at index {position}")


# 3) Basic statistics

In [None]:
# Basic stats: dimensions of the dataset
n_rows, n_columns = data.shape
print(f"Number of rows : {n_rows}")
print(f"Number of columns : {n_columns}")

# Summary statistics for every column, numeric or not
print("Basics statistics: ")
display(data.describe(include='all'))

# Share of missing values per column. isna().mean() is the null fraction;
# it is scaled to 0-100 so the numbers really are the percentages announced
# by the label (the unscaled fraction was printed before).
print("Percentage of missing values: ")
print(data.isna().mean() * 100)

In [None]:
# Number of distinct values found in the "iid" column
unique_identifiers = data["iid"].nunique()
print(f"Number of unique identifiers: {unique_identifiers}")

# Gender counts computed on a single row per iid, so that an identifier
# appearing on several rows is only counted once
one_row_per_iid = data.drop_duplicates(subset=['iid'])
gender_ratio = one_row_per_iid['gender'].value_counts()
print("Gender ratio:")
print(gender_ratio)


# 4) Questions

In [None]:
# One row per iid: only the first occurrence of each identifier is kept
df_unique_idd = data.drop_duplicates(subset=['iid'], keep='first')

# Rows whose "match" column equals 1
df_match = data.loc[data['match'].eq(1)]

# Matched rows, reduced to the first row of each iid
df_match_unique_idd = df_match.drop_duplicates(subset=['iid'], keep='first')

# Attribute prefixes; a series suffix (e.g. "1_1") is appended to them
# later on to build the actual column names
list_attribute = ['attr', 'sinc', 'intel', 'fun', 'amb', 'shar']

## 4.1) What are the least desirable attributes in a male partner? Does this differ for female partners?

In [None]:
# Series suffix under study
serie = "1_1"

# Columns to extract: every attribute of the series, plus the gender
mask_attribute = [f"{attribute}{serie}" for attribute in list_attribute]
mask_attribute.append('gender')
df = df_unique_idd[mask_attribute]

# Report how complete the selected columns are
checking_missing_value(df)

# Mean of each attribute per gender, transposed so attributes are rows and
# there is one column per gender group (groups come out in sorted order)
df_by_gender = df.groupby('gender').mean().T
df_by_gender.columns = ['female_answer', 'male_answer']
df_by_gender

Using series 1_1 (the survey taken before the first date):
- for males, the least desirable attribute in a female partner is shared interests & hobbies
- for female partners, it is ambition

In [None]:
# Visualization: one panel per gender, attributes sorted in ascending order
# so the lowest-rated attribute is the first bar of each panel
fig = make_subplots(rows=1, cols=2, subplot_titles=['Female Answer', 'Male Answer'])

sorted_female = df_by_gender.sort_values(by='female_answer')
sorted_male = df_by_gender.sort_values(by='male_answer')

female_bars = go.Bar(
    x=sorted_female.index,
    y=sorted_female['female_answer'],
    name='Female Answer',
)
male_bars = go.Bar(
    x=sorted_male.index,
    y=sorted_male['male_answer'],
    name='Male Answer',
)

# Left panel: female answers / right panel: male answers
fig.add_trace(female_bars, row=1, col=1)
fig.add_trace(male_bars, row=1, col=2)

fig.update_layout(
    title_text='What are the least desirable attributes in a male partner? Does this differ for female partners?',
    height=600,
    width=1200,
)

fig.show()

## 4.2) How important do people think attractiveness is in potential mate selection vs. its real impact?

In [None]:
# Result table: one row per attribute, one column per point of view
df = pd.DataFrame(index=list_attribute)

# Series 4_1, taken from the matched rows reduced to one row per iid
serie = '4_1'
mask_attribute = [attribute + serie for attribute in list_attribute]
answers_4_1 = df_match_unique_idd[mask_attribute]
checking_missing_value(answers_4_1)
# Rows with any missing attribute are discarded before averaging
df["people_think"] = answers_4_1.dropna().mean().to_numpy()

# Scorecard columns (no suffix), taken from every matched row
serie = ''
mask_attribute = [attribute + serie for attribute in list_attribute]
scorecard_ratings = df_match[mask_attribute]
checking_missing_value(scorecard_ratings)
df["scorecard"] = scorecard_ratings.dropna().mean().to_numpy()

df

==> People who got a match think that attractiveness is the most important attribute in mate selection; this holds for both males and females.

In [None]:
# Visualization: both columns of df side by side, each sorted from the
# highest to the lowest value
fig = make_subplots(rows=1, cols=2, subplot_titles=["people_think", 'scorecard'])

# Left panel: people_think
df_sorted = df.sort_values(by='people_think', ascending=False)
people_think_bars = go.Bar(
    x=df_sorted.index,
    y=df_sorted["people_think"],
    name="people_think",
    marker_color='blue',
)
fig.add_trace(people_think_bars, row=1, col=1)

# Right panel: scorecard (df_sorted is rebound to the new ordering)
df_sorted = df.sort_values(by='scorecard', ascending=False)
scorecard_bars = go.Bar(
    x=df_sorted.index,
    y=df_sorted['scorecard'],
    name='scorecard',
    marker_color='orange',
)
fig.add_trace(scorecard_bars, row=1, col=2)

fig.update_layout(
    title_text='How important do people think attractiveness is in potential mate selection vs. its real impact?',
    height=600,
    width=1200,
)

fig.show()

---> Based on the scorecards of people who got a match, intelligence is the most important attribute. All attributes are more balanced.

## 4.3) Are shared interests more important than a shared racial background?

In [None]:
# Column means computed over every matched row
mask_attribute = ['imprace', 'shar', 'samerace']
df = df_match.loc[:, mask_attribute]
df.mean()

In [None]:
# Same column means, this time with a single matched row per iid
mask_attribute = ['imprace', 'shar', 'samerace']
df = df_match_unique_idd.loc[:, mask_attribute]
df.mean()

---> Based on the answers of matched people in the scorecard and the sign-up survey, the … (**TODO:** this conclusion is unfinished — complete it from the two tables above.)

## 4.4) Can people accurately predict their own perceived value in the dating market?

In [None]:
# Data preparation
# NOTE: this rebinds the notebook-wide list_attribute to five prefixes
# ('shar' is left out), which affects any cell run after this one.
list_attribute = ['attr', 'sinc', 'intel', 'fun', 'amb']
df = pd.DataFrame(index=list_attribute)

# '5_1' is plotted below as the self-predicted score, '_o' as the received score
series = ['5_1', '_o']

for serie in series:
    mask_attribute = [x + serie for x in list_attribute] + ['iid']
    checking_missing_value(data[mask_attribute])
    # Complete rows only -> mean per iid -> mean across all iids
    df[str(serie)] = (data[mask_attribute].dropna().groupby('iid').mean()).mean().values
df['ratio_pred/recieved'] = df['5_1'] / df['_o']

display(df)

# Calculate Mean Absolute Percentage Error (MAPE).
# y_true / y_pred were never defined, which raised a NameError on a fresh
# kernel: the received score is the reference, the self-predicted score is
# the estimate being evaluated.
y_true = df['_o']
y_pred = df['5_1']
mape = round(np.mean(np.abs((y_true - y_pred) / y_true)) * 100, 2)

print(f"MAPE: {mape}%")

In [None]:
# Grouped bar chart: self-predicted score next to received score, per attribute
predicted_bars = go.Bar(
    x=df.index,
    y=df['5_1'],
    name='Self Predicted score',
    marker_color='blue',
)
received_bars = go.Bar(
    x=df.index,
    y=df['_o'],
    name='Received score',
    marker_color='orange',
)

# Traces are added in the same order as the legend: predicted, then received
fig = go.Figure()
fig.add_trace(predicted_bars)
fig.add_trace(received_bars)

# barmode='group' places the two bars of an attribute side by side
fig.update_layout(
    barmode='group',
    title='Can people accurately predict their own perceived value in the dating market?',
    xaxis_title='Attributes',
    yaxis_title='Score',
)

fig.show()

It's not so bad: people overestimate their value by 12 %, but this is consistent across all attributes.

## 4.5) In terms of getting a second date, is it better to be someone's first speed date of the night or their last?

In [None]:
mask_attribute = ['order', 'dec_o', 'dec']
# order : position of the speed date within the night
# dec_o : second date received
# dec   : second date given

# assign() returns a new frame carrying the helper column, so nothing is
# written onto a slice of `data` (the former `df['total'] = 1` triggered a
# SettingWithCopyWarning).
df = data[mask_attribute].assign(total=1)

# Per position in the night: number of "yes" decisions and number of dates
df_by_dec = df.groupby('order').sum()
df_by_dec['ratio_recieved'] = df_by_dec['dec_o'] / df_by_dec['total']
df_by_dec['ratio_given'] = df_by_dec['dec'] / df_by_dec['total']
df_by_dec

In [None]:
# Both ratios plotted against the position of the date in the night;
# the y axis is fixed to [0, 0.6]
fig = px.line(
    df_by_dec,
    x=df_by_dec.index,
    y=["ratio_recieved", "ratio_given"],
    range_y=[0, 0.6],
    title="In terms of getting a second date, is it better to be someone's first speed date of the night or their last?",
)
fig.show()

--> The first speed date gives the best result, but results are well balanced throughout the night.