In [None]:
# Importing required libraries and frameworks
import pandas as pd
import numpy as np
import glob
import warnings
import missingno as msno
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import os
warnings.filterwarnings('ignore')
from fancyimpute import IterativeImputer
import re
from sklearn.impute import KNNImputer
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GroupKFold,train_test_split
from sklearn.model_selection import GroupShuffleSplit
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Importing the dataset

In [None]:
#Importing the Dataset
# The wildcard pattern below is expanded by glob into the individual
# per-participant / per-trial CSV files. Without recursive=True, '**' behaves
# like a single '*'.
# NOTE(review): absolute, machine-specific path - adjust to the local data copy.
path = r"C:\Users\SHURI\Desktop\Final Emapthy\EyeT\EyeT_group_dataset_III_image_name_letter_card_participant_**_trial_*.csv"

# glob returns matches in arbitrary, OS-dependent order; sorting makes the row
# order of the concatenated frame reproducible across runs and machines.
filename = sorted(glob.glob(path))
if not filename:
    # Without this guard pd.concat([]) fails with the much less helpful
    # "No objects to concatenate".
    raise FileNotFoundError(f"No CSV files matched the pattern: {path}")

# Read every matched file and stack them into one frame with a fresh 0..n-1 index.
df_pre = [pd.read_csv(file) for file in filename]
df = pd.concat(df_pre, ignore_index=True)

In [None]:
# Preview the first rows of the concatenated eye-tracking frame.
df.head()

In [None]:
# Summary of the raw frame: column names, non-null counts and dtypes.
df.info()

In [None]:
# Read the questionnaire table; the file is decoded as ISO-8859-1 (Latin-1).
# NOTE(review): absolute, machine-specific path - adjust to the local data copy.
df_questionnaire = pd.read_csv(
    r"C:\Users\SHURI\Desktop\Final Emapthy\Questionnaire_datasetIA.csv",
    encoding='ISO-8859-1',
)

In [None]:
# Preview the first rows of the questionnaire table.
df_questionnaire.head()

# Preprocessing the dataset

In [None]:
# Keep only the rows in which BOTH pupil diameters were recorded; a row missing
# either one is discarded.
# Author's rationale: pupil size changes more slowly than eye movements, so a
# 40 Hz sampling frequency is considered sufficient to follow its evolution.
df = df.dropna(
    subset=['Pupil diameter left', 'Pupil diameter right'],
    how='any',
)

In [None]:
# Bar chart (missingno) showing, per column, how much of the data is present.
msno.bar(df)

The missing-value plot shows that some columns have the majority of their values missing. We therefore require every column to have at least 75% non-missing values and remove the columns that fall below this threshold (i.e. columns with more than 25% of their values missing).

In [None]:
# Drop sparsely populated columns.
# In DataFrame.dropna, `thresh` is the minimum number of NON-missing values a
# column must have in order to be kept. With thres = 75% of the row count this
# keeps the columns that are at least ~75% populated and drops every other
# column, i.e. the columns with more than ~25% of their values missing.
thres = int(0.75 * len(df))
df = df.dropna(thresh=thres, axis=1)

# Display missing value visualization for the remaining columns
msno.bar(df)

The visualization confirms that the sparsely populated columns (those below the 75% non-missing threshold) have been removed. However, it's evident that certain columns still contain gaps that require imputation.
Furthermore, it's worth noting that certain columns within the DataFrame exhibit a consistent value across all rows. These columns, characterized by having zero variance, are not likely to contribute meaningfully to the model. As a result, we are eliminating such columns from consideration.

In [None]:
# Discard every column that carries at most one distinct value.
# Series.nunique() ignores NaN by default, so a column that is entirely missing,
# or that holds a single value plus gaps, is discarded as well.
unique_values = [name for name in df.columns if df[name].nunique() <= 1]
df = df.drop(columns=unique_values)

In [None]:
# Re-inspect dtypes and non-null counts after the column pruning above.
df.info()

It's also noticeable that certain columns intended to be in integer or float data formats are currently presented as objects. A more detailed examination of these columns reveals that many of them use ',' instead of '.' as the decimal separator, hindering a straightforward transformation to floats. Thus, the ',' should be replaced with '.' for consistency.

Furthermore, some columns are in an incorrect format that necessitates a type conversion. Adding to this, there's an absence of the target column, which corresponds to the empathy score, within this dataset.

In [None]:
# Replacing comma with dot and convert object datapoints to numeric
# regex=True makes the replacement act on substrings, so every ',' inside a
# string cell becomes '.'; non-string cells are left untouched.
df = df.replace(',', '.', regex=True)
for col in df.columns:
    if not pd.api.types.is_numeric_dtype(df[col]):
        # pd.to_numeric(errors='ignore') is deprecated in recent pandas; the
        # documented replacement is to call it without `errors` and catch the
        # failure explicitly. Behaviour is the same: a column that cannot be
        # parsed as numbers is left unchanged.
        try:
            df[col] = pd.to_numeric(df[col])
        except (ValueError, TypeError):
            continue

In [None]:
# Creating empathy scores dictionary: participant number -> 'Total Score extended'
# Built column-wise instead of with iterrows(), which is slow and can upcast
# the values of mixed-type rows.
empathy_scores = dict(zip(
    df_questionnaire['Participant nr'].astype(int),
    df_questionnaire['Total Score extended'],
))

# Extracting participant numbers and converting to int
# - The trailing run of digits is used rather than the last two characters, so
#   participant numbers of 100 and above are not truncated; for names ending in
#   a zero-padded number below 100 the result is the same as before.
# - The dtype guard makes the cell safe to re-run: once the column is numeric
#   the .str accessor would raise.
# A name without trailing digits makes astype(int) raise, as the old code did.
if not pd.api.types.is_numeric_dtype(df['Participant name']):
    df['Participant name'] = (
        df['Participant name'].str.extract(r'(\d+)$', expand=False).astype(int)
    )

# Adding Empathy Score column to the DataFrame
# NOTE(review): participants absent from the questionnaire silently receive a
# score of 0, which is indistinguishable from a real target value - confirm that
# every participant in the eye-tracking data has a questionnaire row.
df['Empathy Score'] = df['Participant name'].apply(lambda x: empathy_scores.get(x, 0))

# Sorting the DataFrame by participant name
# A single stable sort replaces the two default (unstable) sorts: rows are
# grouped by participant while keeping their original relative order inside
# each participant.
df = df.sort_values('Participant name', kind='stable')

In [None]:
# Count how many rows share each 'Eyetracker timestamp' value (NaN is counted
# as its own value) and plot the distribution of those counts.
eyetrackertimestamp = df['Eyetracker timestamp'].value_counts(dropna=False)
eyetrackertimestamp.hist()

The graph indicates anomalies in data recording, where a single timestamp should ideally correspond to only one observation. However, our dataset exhibits multiple entries for certain timestamps. This discrepancy highlights the existence of duplicates that require elimination.

In [None]:
# De-duplicate on the eye-tracker timestamp, keeping the first row seen for each
# timestamp value.
# NOTE(review): the comparison uses the timestamp alone, so rows from different
# participants that happen to share a timestamp value are also treated as
# duplicates - confirm this is intended.
df = df.drop_duplicates(subset=['Eyetracker timestamp'], keep='first')

# Re-draw the histogram of "rows per timestamp" on the de-duplicated frame.
eyetracker_timestamp = df['Eyetracker timestamp'].value_counts(dropna=False)
eyetracker_timestamp.hist()
# From here on 'df' holds at most one row per 'Eyetracker timestamp' value.

The graph above shows that all rows with a duplicated eye-tracker timestamp have been removed: every remaining timestamp now occurs exactly once.

In [None]:
# Re-inspect dtypes and non-null counts after de-duplication.
df.info()

In [None]:
# Columns whose gaps are filled in below (normalised gaze-point coordinates).
missing_val_columns = ['Gaze point X (MCSnorm)', 'Gaze point Y (MCSnorm)',
                      'Gaze point left X (MCSnorm)', 'Gaze point left Y (MCSnorm)',
                      'Gaze point right X (MCSnorm)', 'Gaze point right Y (MCSnorm)']

# Impute missing values using IterativeImputer (imported from fancyimpute) with
# its default settings. The imputer is fitted on these six columns only and the
# filled values are written back into the same columns.
# NOTE(review): the imputed coordinates are not clipped to any range afterwards.
imputer = IterativeImputer()
df[missing_val_columns] = imputer.fit_transform(df[missing_val_columns])

# Display missing value visualization
msno.bar(df)
# After this cell the six columns listed above contain no missing values;
# columns outside that list are not touched by the imputer.



This plot shows that all the missing values in dataset III have been imputed.

# Exploratory Data Analysis and Visualisation

In [None]:
def plot_pupil_diameter_series(frame, side):
    """Scatter-plot one eye's pupil diameter against the eye-tracker timestamp.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain 'Eyetracker timestamp' and 'Pupil diameter <side>'.
    side : str
        'left' or 'right'; selects the column and is used in the labels.
    """
    plt.figure(figsize=(20, 10))
    # Column names plus data= replace the redundant "Series and data=" form.
    sns.scatterplot(x='Eyetracker timestamp', y=f'Pupil diameter {side}', data=frame)
    plt.xlabel('Time Stamp')
    plt.ylabel(f'Pupil diameter {side}')
    plt.title(f'Time Series Plot - Pupil diameter ({side}) vs Time Stamp')
    plt.show()


# Plotting the first (left eye) and second (right eye) time series with the
# same helper instead of two copy-pasted blocks.
for side in ('left', 'right'):
    plot_pupil_diameter_series(df, side)

In [None]:
# One 50-bin histogram per column that DataFrame.hist can plot.
df.hist(bins=50, figsize=(20,15))

The presented plot illustrates the value distribution across each column. Notably, certain columns exhibit significant skewness in their distributions, warranting their removal.


Both the plot and the referenced paper lead to the inference that Gaze event duration, Pupil diameters, Gaze points, Gaze directions, and Eye Positions are the pivotal predictor variables for the model.

In [None]:
# Total 'Gaze event duration' per participant.
# A grouped sum replaces the row-by-row iterrows() accumulation, which is very
# slow on a large frame. Unlike the manual '+=', the grouped sum skips missing
# durations instead of turning a participant's whole total into NaN.
# sort=False keeps participants in order of first appearance in df.
gaze_event_duration = (
    df.groupby('Participant name', sort=False)['Gaze event duration']
    .sum()
    .to_dict()
)

# One row per participant: total duration plus the questionnaire score.
new_df = pd.DataFrame(
    list(gaze_event_duration.items()),
    columns=["Participant name", "Gaze event duration"],
)
# NOTE(review): participants missing from empathy_scores get a score of 0.
new_df['Empathy Score'] = new_df['Participant name'].apply(lambda x: empathy_scores.get(x, 0))

# Scatter plot
plt.figure(figsize=(10, 6))
sns.scatterplot(x='Gaze event duration', y='Empathy Score', data=new_df)
plt.xlabel('Total Gaze event duration')
plt.ylabel('Empathy Score')
plt.title('Total Gaze event duration vs Empathy Score')
plt.show()


In [None]:
# Per-participant total used for the "Total Record Duration" plot.
# NOTE(review): the values are summed from 'Gaze event duration', so this plot
# shows the same quantity as a total-gaze-duration plot under a different
# label. If a dedicated recording-duration column was intended, change
# `duration_column` accordingly - the correct column cannot be confirmed from
# this notebook.
duration_column = 'Gaze event duration'

# A grouped sum replaces the row-by-row iterrows() accumulation, which is very
# slow on a large frame. Unlike the manual '+=', the grouped sum skips missing
# values instead of turning a participant's whole total into NaN.
# sort=False keeps participants in order of first appearance in df.
total_record_duration = (
    df.groupby('Participant name', sort=False)[duration_column]
    .sum()
    .to_dict()
)

# One row per participant: total duration plus the questionnaire score.
new_df = pd.DataFrame(
    list(total_record_duration.items()),
    columns=["Participant name", "Total Record Duration"],
)
# NOTE(review): participants missing from empathy_scores get a score of 0.
new_df['Empathy Score'] = new_df['Participant name'].apply(lambda x: empathy_scores.get(x, 0))

# Scatter plot
plt.figure(figsize=(10, 6))
sns.scatterplot(x='Total Record Duration', y='Empathy Score', data=new_df)
plt.xlabel('Total Record Duration')
plt.ylabel('Empathy Score')
plt.title('Total Record Duration vs Empathy Score')
plt.show()

In [None]:
# Only numeric columns take part in the correlation analysis.
numeric_df = df.select_dtypes(include=['number'])

# Pairwise correlations between the numeric columns, target excluded.
cor_matrix = numeric_df.drop(columns='Empathy Score').corr()

# Annotated heatmap of the matrix; the colour scale is capped at 0.13.
fig, ax = plt.subplots(figsize=(42, 42))
ax.set_title('Pearson Correlation Matrix')
sns.heatmap(cor_matrix, vmax=0.13, annot=True, ax=ax)

# A column is flagged when its absolute correlation with any column that
# precedes it in the matrix exceeds 0.7, so the earlier column of each
# correlated pair is the one that survives. NaN correlations never flag.
# NOTE(review): every numeric column except 'Empathy Score' is a candidate,
# including identifier-like columns such as 'Participant name'.
cor_col = {
    cor_matrix.columns[pos]
    for pos in range(len(cor_matrix.columns))
    if (cor_matrix.iloc[pos, :pos].abs() > 0.7).any()
}

print('Columns with Correlation are -', cor_col)

# Remove the flagged columns from the working frame.
df = df.drop(columns=list(cor_col))