### Process the Dataset


In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the raw survey export (tab-separated despite the .csv extension).
df = pd.read_csv("data-raw.csv", sep='\t')

# Keep a random half of the rows. The fixed seed makes the sample -- and every
# output derived from it below -- reproducible under Restart Kernel -> Run All.
df = df.sample(frac=0.5, random_state=42)

# Columns to delete: the "<PREFIX><1..10>_E" companion column of every item,
# followed by the session / device / location metadata columns.
columns_to_delete = [
    f"{prefix}{item}_E"
    for prefix in ('EXT', 'EST', 'AGR', 'CSN', 'OPN')
    for item in range(1, 11)
]
columns_to_delete += [
    'dateload', 'screenw', 'screenh', 'introelapse', 'testelapse', 'endelapse',
    'IPC', 'country', 'lat_appx_lots_of_err', 'long_appx_lots_of_err',
]
df = df.drop(columns=columns_to_delete)

In [None]:
def calculate_personality_scores(row):
    """Add up the ten item responses of each trait for a single respondent.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must be indexable by the item labels ``EXT1``..``EXT10``,
        ``AGR1``..``AGR10``, ``CSN1``..``CSN10``, ``EST1``..``EST10`` and
        ``OPN1``..``OPN10``.

    Returns
    -------
    pandas.Series
        Five totals labelled ``extroversion``, ``agreeableness``,
        ``conscientiousness``, ``neuroticism`` and ``openness_to_experience``
        (in that order). A missing (NaN) item makes its trait total NaN.

    Raises
    ------
    KeyError
        If any of the expected item labels is absent from ``row``.
    """
    # NOTE(review): items are summed exactly as recorded, with no reverse-keying
    # of any item -- confirm that is intended for this questionnaire.
    trait_prefixes = (
        ("extroversion", "EXT"),
        ("agreeableness", "AGR"),
        ("conscientiousness", "CSN"),
        ("neuroticism", "EST"),
        ("openness_to_experience", "OPN"),
    )

    totals = {}
    for trait, prefix in trait_prefixes:
        # Accumulate left to right, starting from item 1.
        running = row[f"{prefix}1"]
        for item in range(2, 11):
            running = running + row[f"{prefix}{item}"]
        totals[trait] = running
    return pd.Series(totals)

# Build the five personality score columns.
# This yields the same totals as applying calculate_personality_scores to every
# row, but sums whole columns at once instead of running Python code per row,
# which is far faster on a frame of this size.
# skipna=False keeps the behaviour of plain `+`: a respondent with any missing
# item gets NaN for that trait rather than a partial sum.
df_scores = pd.DataFrame(
    {
        trait: df[[f"{prefix}{item}" for item in range(1, 11)]].sum(axis=1, skipna=False)
        for trait, prefix in {
            "extroversion": "EXT",
            "agreeableness": "AGR",
            "conscientiousness": "CSN",
            "neuroticism": "EST",
            "openness_to_experience": "OPN",
        }.items()
    }
)

# Merge the score columns with the original DataFrame.
# Any score columns left over from a previous run of this cell are dropped
# first, so re-running the cell does not produce duplicate column names.
df = pd.concat(
    [df.drop(columns=df_scores.columns, errors="ignore"), df_scores],
    axis=1,
)

In [None]:
# Sanity check: show the right-most column (the score appended last) as a NumPy array.
df.iloc[:, -1].to_numpy()

array([38., 31., 35., ..., 36., 34., 30.])

In [None]:
# Preview the first five rows, including the newly added score columns.
df.head(5)

Unnamed: 0,EXT1,EXT2,EXT3,EXT4,EXT5,EXT6,EXT7,EXT8,EXT9,EXT10,...,OPN6,OPN7,OPN8,OPN9,OPN10,extroversion,agreeableness,conscientiousness,neuroticism,openness_to_experience
269303,1.0,4.0,4.0,4.0,3.0,2.0,2.0,2.0,4.0,4.0,...,2.0,5.0,5.0,5.0,5.0,30.0,34.0,30.0,34.0,38.0
776543,2.0,3.0,3.0,3.0,4.0,2.0,5.0,3.0,2.0,4.0,...,1.0,4.0,3.0,5.0,4.0,31.0,28.0,35.0,28.0,31.0
496098,3.0,1.0,4.0,2.0,4.0,2.0,3.0,2.0,4.0,3.0,...,1.0,5.0,1.0,5.0,5.0,28.0,33.0,32.0,33.0,35.0
785726,4.0,4.0,3.0,2.0,1.0,1.0,4.0,2.0,4.0,4.0,...,3.0,5.0,3.0,5.0,3.0,29.0,33.0,35.0,29.0,31.0
682530,5.0,1.0,5.0,1.0,5.0,1.0,5.0,1.0,5.0,2.0,...,1.0,5.0,4.0,4.0,5.0,31.0,33.0,28.0,23.0,37.0


In [None]:
# Report the DataFrame's dimensions (rows = respondents kept, columns = items + scores).
num_rows = len(df)
num_columns = len(df.columns)

print(
    f"Number of rows: {num_rows}",
    f"Number of columns: {num_columns}",
    sep="\n",
)

Number of rows: 507670
Number of columns: 55


In [None]:
# Discard every row that has at least one missing value in any column
# (item responses or the derived score columns).
df = df.dropna(axis=0, how="any")

In [None]:

# Verify the cleaned frame contains no missing or infinite values.
nan_count = df.isna().sum().sum()
# Count both +inf and -inf; comparing against np.inf alone would miss negative infinity.
inf_count = df.isin([np.inf, -np.inf]).sum().sum()

print(f"Number of NaN values: {nan_count}")
print(f"Number of infinite values: {inf_count}")

Number of NaN values: 0
Number of infinite values: 0


In [None]:
# Persist the processed dataset. index=False leaves the row index labels out of
# the file, so the original row labels are not written.
df.to_csv(path_or_buf='data.csv', index=False)