# Calculate Big Five IPIP-NEO-120 Scores 🌊

Official repository version:

- [https://osf.io/tbmh5/](https://osf.io/tbmh5/)

About the dataset:

- [https://osf.io/2kfhe](https://osf.io/2kfhe)

In [5]:
import warnings
from pathlib import Path

import pandas as pd
import pyreadstat
from ipipneo import IpipNeo

import utility as U

# Silence every warning category for the rest of the session so cell outputs stay compact.
warnings.simplefilter("ignore")

# Cap DataFrame previews at 20 columns and 20 rows.
pd.set_option("display.max_columns", 20)
pd.set_option("display.max_rows", 20)

In [16]:
# Raw input: the IPIP-NEO-120 responses in SPSS portable (.por) format, read with pyreadstat below.
ORIGINAL_IPIP_NEO_DATASET = "./datasets/raw/IPIP120.por"
# Output: gzip-compressed CSV with the cleaned responses merged with the computed scores.
NQ_IPIP_NEO_DATASET = "./datasets/refined/IPIP120-SCORES.csv.gz"

## 1. Original dataset in SPSS portable file format 📂

- Files in '.por' format are problematic ⚠️. Let's convert them 🔄.

In [7]:
# Read the SPSS portable file; only the DataFrame (first element of the
# returned pair) is needed, the second element is discarded.
df0 = pyreadstat.read_por(ORIGINAL_IPIP_NEO_DATASET)[0]

# Keep the raw row count for the report printed in the cleaning step, and
# fail fast if the input file does not have the expected number of rows.
dflen = df0.shape[0]
assert dflen == 619150

df0.head(5)

Unnamed: 0,CASE,SEX,AGE,SEC,MIN,HOUR,DAY,MONTH,YEAR,COUNTRY,...,I111,I112,I113,I114,I115,I116,I117,I118,I119,I120
0,1.0,2.0,19.0,8.0,41.0,16.0,30.0,6.0,101.0,South Afr,...,5.0,2.0,4.0,4.0,4.0,2.0,4.0,1.0,5.0,4.0
1,2.0,2.0,22.0,24.0,45.0,16.0,30.0,6.0,101.0,USA,...,1.0,4.0,3.0,3.0,4.0,4.0,2.0,3.0,4.0,3.0
2,4.0,2.0,22.0,3.0,57.0,16.0,30.0,6.0,101.0,USA,...,2.0,3.0,2.0,4.0,4.0,2.0,4.0,2.0,5.0,4.0
3,5.0,2.0,22.0,44.0,4.0,17.0,30.0,6.0,101.0,USA,...,1.0,5.0,5.0,5.0,4.0,1.0,5.0,3.0,5.0,3.0
4,6.0,1.0,13.0,14.0,6.0,17.0,30.0,6.0,101.0,USA,...,1.0,2.0,4.0,3.0,5.0,2.0,4.0,4.0,3.0,5.0


## 2. Rows with zero-valued items are removed 📜🚀

- The data is saved in *JSON* format in the *results/batch/* directory.

In [8]:
# Report and drop missing values *before* the integer cast: casting a float
# column that still contains NaN to int raises, which would make these
# diagnostics (and the dropna) unreachable exactly when they are needed.
print(f"Dataset original total: {dflen}")
print(f"Is there any missing value? {df0.isnull().values.any()}")
print(f"How many missing values: {df0.isnull().values.sum()}")

df0 = df0.dropna()

# pyreadstat delivers the numeric columns as float64; store them as integers.
df0_float_cols = df0.select_dtypes(include=["float64"]).columns
df0[df0_float_cols] = df0[df0_float_cols].astype(int)

# Normalise column names (e.g. "I1" -> "i1") so later cells can use lower-case keys.
df0.columns = df0.columns.str.strip().str.lower()

if "IPIP120" in ORIGINAL_IPIP_NEO_DATASET:
    print("Removing invalid data from dataset: IPIP120")
    # Keep only the rows in which none of the items i1..i120 is 0.
    df0 = df0[(df0.loc[:, "i1":"i120"] != 0).all(axis=1)]

print(f"New dataset total: {len(df0)}")
print(f"Number of countries: {len(df0.country.unique())}")

Dataset original total: 619150
Is there any missing value? False
How many missing values: 0
Removing invalid data from dataset: IPIP120
New dataset total: 410376
Number of countries: 241


## 3. Calculate scores

In [10]:
# Accumulates one score dictionary per respondent; filled by compute_big5_score_120.
USER_SCORE = []


def compute_big5_score_120(r) -> bool:
    """Score one respondent with IPIP-NEO-120 and append the result to ``USER_SCORE``.

    Written to be used as the row callback of ``DataFrame.apply(..., axis=1)``.

    Args:
        r: Mapping-like row (e.g. a pandas Series) providing the keys
            ``i1`` .. ``i120`` (selected answer per item), ``sex``, ``age``
            and ``case``.

    Returns:
        Always ``True``. The computed score is stored as a side effect in the
        module-level ``USER_SCORE`` list, with the row's ``case`` attached
        under the ``"case"`` key.

    Raises:
        RuntimeError: If scoring the row fails. The original exception is
            chained as ``__cause__``. ``KeyboardInterrupt`` and ``SystemExit``
            are deliberately not intercepted so a long run can be aborted.
    """
    # NOTE(review): "reverse_scored" is always sent as 0 - presumably the
    # library handles reverse-keyed items itself; confirm against ipipneo docs.
    item = [
        {
            "id_question": x,
            "id_select": r[f"i{x}"],
            "reverse_scored": 0,
        }
        for x in range(1, 121)
    ]

    try:
        calc_score = IpipNeo(question=120, test=True).compute(
            # Sex code 1 is mapped to "M"; every other value is mapped to "F".
            sex="M" if int(r["sex"]) == 1 else "F",
            age=int(r["age"]),
            answers={"answers": item},
        )
        # Carry the respondent id along so scores can be joined back later.
        calc_score["case"] = r["case"]

        USER_SCORE.append(calc_score)
    except Exception as e:
        # Catch Exception (not BaseException) and chain the cause so the
        # original error type and traceback stay available for debugging.
        raise RuntimeError(f"Failed (case={r.get('case')}): {e}") from e
    return True

In [11]:
# Guard against re-running this cell: the callback appends to USER_SCORE, so a
# non-empty list here would mean every respondent gets scored twice.
assert not USER_SCORE, "should be zero"
df0.apply(compute_big5_score_120, axis="columns")

0         True
1         True
4         True
5         True
6         True
          ... 
619142    True
619143    True
619144    True
619147    True
619149    True
Length: 410376, dtype: bool

In [12]:
# One flattened record per respondent, built from the nested score dictionaries.
USER_SCORE_NORMALIZED = []

for position, score in enumerate(USER_SCORE):
    try:
        USER_SCORE_NORMALIZED.append(
            U.extract_personalities(
                idx=score["case"],
                result=score["person"]["result"]["personalities"],
            )
        )
    except Exception as e:
        # Catch Exception (not BaseException) so Ctrl-C still aborts the loop,
        # and chain the cause so the original error type is not lost.
        raise RuntimeError(f"Failed at USER_SCORE[{position}]: {e}") from e


In [13]:
df1 = pd.DataFrame(USER_SCORE_NORMALIZED)
# Not referenced by any cell visible in this notebook; kept in case other code relies on it.
df1_cols_numerics = df1.select_dtypes(include=["int", "float"]).columns

assert len(df0) == len(df1), "should be the same"

# validate="one_to_one" makes pandas raise MergeError when `case` is duplicated
# on either side, instead of silently multiplying rows and only tripping the
# row-count assertion below.
df2 = pd.merge(df0, df1, on="case", how="inner", validate="one_to_one")
assert len(df2) == len(df0), "should be the same"

In [14]:
# Preview the first 10 merged rows: the original response columns followed by the computed scores.
df2.head(10)

Unnamed: 0,case,sex,age,sec,min,hour,day,month,year,country,...,facet_cooperation,facet_modesty,facet_sympathy,neuroticism,facet_anxiety,facet_anger,facet_depression,facet_self_consciousness,facet_immoderation,facet_vulnerability
0,1,2,19,8,41,16,30,6,101,South Afr,...,76.776519,74.954334,77.384533,35.573367,27.993157,11.549711,58.018551,39.725121,96.864432,11.048211
1,2,2,22,24,45,16,30,6,101,USA,...,4.270395,69.646885,67.981348,66.99855,58.854938,89.843818,89.88057,57.486716,1.0,80.327511
2,6,1,13,14,6,17,30,6,101,USA,...,3.278791,24.01817,12.400848,22.577376,33.979027,86.802916,29.000871,7.509259,6.890607,15.131043
3,7,2,18,25,11,17,30,6,101,USA,...,52.320264,38.542547,41.268613,12.4668,12.195029,57.719898,18.108659,8.870845,44.831762,5.285188
4,8,2,24,19,25,17,30,6,101,USA,...,82.330021,50.199377,55.939489,26.353107,21.800273,7.899414,26.475441,38.817649,62.697315,53.029251
5,9,2,20,26,30,17,30,6,101,USA,...,52.320264,8.000591,18.490918,64.820778,66.339344,73.862903,32.943919,58.257428,84.677109,46.924899
6,10,2,18,18,36,17,30,6,101,USA,...,69.067606,74.954334,66.031128,51.344982,90.223084,49.233759,11.912872,58.257428,55.597358,46.924899
7,12,1,50,27,48,17,30,6,101,USA,...,90.392067,50.42621,98.512902,1.0,3.760732,4.344188,6.334921,3.908683,12.104661,6.911371
8,13,1,28,48,50,17,30,6,101,Finland,...,93.626251,45.718112,41.705378,8.96722,1.0,8.881105,22.370701,58.278921,12.269957,28.83768
9,16,2,19,19,20,18,30,6,101,USA,...,60.839243,98.147071,94.491314,27.069092,66.339344,2.841271,99.651096,30.798323,1.0,11.048211


## Save

In [19]:
# Sanity check: the merged frame must have the same number of rows as the cleaned dataset.
assert len(df2) == 410376, "the count should be 410376!"

# Create the output directory if it is missing, so a long scoring run is not
# lost to a FileNotFoundError at the very last step.
Path(NQ_IPIP_NEO_DATASET).parent.mkdir(parents=True, exist_ok=True)
df2.to_csv(NQ_IPIP_NEO_DATASET, compression="gzip", index=False)

print(f"Dataset saved in: {NQ_IPIP_NEO_DATASET}")

Dataset saved in: ./datasets/refined/IPIP120-SCORES.csv.gz
