# IPIP-NEO-300 Scores 🌊

Based on user responses, we calculate Big Five personality scores with the 300-item [IPIP-NEO](https://ipip.ori.org/) inventory.

Official repository version:

- [https://osf.io/tbmh5](https://osf.io/tbmh5/)

About the dataset:

- [https://osf.io/2kfhe](https://osf.io/2kfhe)

In [2]:
import pyreadstat

import pandas as pd
import traits as T

from ipipneo import IpipNeo

import warnings
warnings.filterwarnings("ignore")

pd.options.display.max_columns = 10
pd.options.display.max_rows = 10

In [13]:
# Input: the raw IPIP-NEO-300 responses in SPSS portable (.por) format.
ORIGINAL_IPIP_NEO_DATASET = "./datasets/raw/IPIP300.por"
# Output: gzip-compressed CSV holding the responses merged with their computed scores.
NQ_IPIP_NEO_DATASET = "./datasets/refined/IPIP300-SCORES.csv.gz"

## 1. Original dataset in SPSS portable file format 🗂️

In [5]:
# Read the SPSS portable file; the second return value is not used here.
df0, _ = pyreadstat.read_por(ORIGINAL_IPIP_NEO_DATASET)

# Row count of the untouched file, checked against the expected dataset size.
dflen = df0.shape[0]
assert dflen == 307313

df0.head()

Unnamed: 0,CASE,SEX,AGE,SEC,MIN,...,I296,I297,I298,I299,I300
0,1.0,1.0,24.0,43.0,57.0,...,2.0,1.0,1.0,2.0,4.0
1,3.0,2.0,24.0,59.0,5.0,...,4.0,4.0,5.0,4.0,2.0
2,4.0,2.0,36.0,44.0,11.0,...,1.0,5.0,1.0,5.0,2.0
3,5.0,1.0,19.0,4.0,15.0,...,2.0,3.0,2.0,3.0,2.0
4,6.0,1.0,17.0,28.0,23.0,...,2.0,5.0,5.0,4.0,2.0


## 2. Remove responses with zero-valued items ❌

In [6]:
# Normalize column labels first so every later step can rely on lowercase names.
df0.columns = df0.columns.str.strip().str.lower()

print(f"Dataset original total: {dflen}")
# Missing values must be measured BEFORE the integer cast: astype(int) raises on
# NaN, so a check placed after the cast could never report anything.
print(f"Is there any missing value? {df0.isnull().values.any()}")
print(f"How many missing values: {df0.isnull().values.sum()}")

print("Removing invalid data from dataset: IPIP300")
# Drop incomplete rows first so the float -> int cast below cannot fail on NaN.
df0 = df0.dropna()

# Every float64 column is stored as an integer from here on.
df0_float_cols = df0.select_dtypes(include=["float64"]).columns
df0 = df0.astype({col: int for col in df0_float_cols})

# Keep only respondents whose 300 item columns (i1..i300) are all non-zero.
df0 = df0[(df0.loc[:, "i1":"i300"] != 0).all(axis=1)]

print(f"New dataset total: {len(df0)}")
print(f"Number of countries: {len(df0.country.unique())}")

Dataset original total: 307313
Is there any missing value? False
How many missing values: 0
Removing invalid data from dataset: IPIP300
New dataset total: 145388
Number of countries: 224


## 3. Calculate scores 🧮

In [7]:
# Accumulates one score payload per respondent; it is filled as a side effect of
# compute_big5_score_300, so it must be empty before the function is applied.
USER_SCORE = []


def compute_big5_score_300(r: pd.Series) -> bool:
    """Score one respondent's 300 answers with IpipNeo and record the result.

    Args:
        r: One dataset row. It must expose ``sex``, ``age``, ``case`` and the
            item columns ``i1`` .. ``i300``.

    Returns:
        ``True`` once the computed score has been appended to ``USER_SCORE``.

    Raises:
        KeyError: If one of the ``i1`` .. ``i300`` columns is missing from ``r``.
        RuntimeError: If scoring fails; the original error is chained as
            ``__cause__`` and the message keeps the ``"Failed: ..."`` format.
    """
    # Every item is sent with reverse_scored=0.
    # NOTE(review): presumably the library applies reverse keying itself -- confirm.
    answers = [
        {
            "id_question": x,
            "id_select": r[f"i{x}"],
            "reverse_scored": 0,
        }
        for x in range(1, 301)
    ]

    try:
        calc_score = IpipNeo(question=300, test=True).compute(
            # Sex code 1 is sent as "M"; every other code is sent as "F".
            sex="M" if int(r["sex"]) == 1 else "F",
            age=int(r["age"]),
            answers={"answers": answers},
        )
        # Keep the respondent id so the scores can be joined back to the dataset.
        calc_score["case"] = r["case"]
    except Exception as e:
        # Catch Exception rather than BaseException so KeyboardInterrupt and
        # SystemExit still propagate, and chain the cause to keep its traceback.
        raise RuntimeError(f"Failed: {e}") from e

    USER_SCORE.append(calc_score)
    return True
    

In [8]:
# Reset the accumulator instead of asserting it is empty, so re-running this
# cell neither fails nor appends every respondent a second time.
USER_SCORE.clear()
df0.apply(compute_big5_score_300, axis=1)

2         True
4         True
7         True
9         True
13        True
          ... 
307305    True
307306    True
307307    True
307311    True
307312    True
Length: 145388, dtype: bool

In [9]:
# One flattened record per respondent, rebuilt from scratch on every run.
USER_SCORE_NORMALIZED = []

for score in USER_SCORE:
    try:
        USER_SCORE_NORMALIZED.append(
            T.extract_personalities(
                idx=score["case"], result=score["person"]["result"]["personalities"]
            )
        )
    except Exception as e:
        # Catch Exception rather than BaseException so KeyboardInterrupt and
        # SystemExit still propagate, and chain the cause to keep its traceback.
        raise RuntimeError(f"Failed: {e}") from e


In [10]:
# Tabulate the normalized score records.
df1 = pd.DataFrame(USER_SCORE_NORMALIZED)
df1_cols_numerics = df1.select_dtypes(include=["int", "float"]).columns

# Every cleaned respondent must have produced exactly one score record.
assert df0.shape[0] == df1.shape[0], "should be the same"

# Attach the scores to the responses through the shared "case" column; the
# join must neither drop nor duplicate respondents.
df2 = df0.merge(df1, how="inner", on="case")
assert df2.shape[0] == df0.shape[0], "should be the same"

In [11]:
# Preview the first ten merged rows (responses followed by the computed score columns).
df2.head(10)

Unnamed: 0,case,sex,age,sec,min,...,facet_anger,facet_depression,facet_self_consciousness,facet_immoderation,facet_vulnerability
0,4,2,36,44,11,...,1.0,2.166562,1.0,14.207987,1.0
1,6,1,17,28,23,...,42.309529,37.735136,38.549978,39.505356,55.076238
2,9,2,28,0,30,...,31.027056,11.415774,49.525488,59.215147,21.709331
3,11,1,17,36,36,...,22.331027,81.67317,99.671293,23.966803,92.646537
4,15,2,48,34,49,...,38.763436,85.981729,72.496716,68.600585,79.863411
5,17,1,22,51,55,...,7.643848,78.800543,55.608646,30.491174,66.089166
6,21,2,38,24,1,...,11.325256,48.107381,1.0,49.512488,25.805531
7,23,1,43,59,4,...,99.26008,81.748078,81.414757,17.875995,89.660551
8,24,2,30,1,5,...,46.766679,93.095213,80.634463,30.491174,87.058952
9,28,2,17,58,15,...,50.782145,35.723458,17.475468,39.505356,51.872705


## 4. Save refined datasets ✅

In [14]:
# Refuse to export if the number of respondents differs from the expected total.
assert df2.shape[0] == 145388, "the count should be 145388!"

# Write the merged frame as gzip-compressed CSV, leaving out the row index.
df2.to_csv(NQ_IPIP_NEO_DATASET, index=False, compression="gzip")

print(f"Dataset saved in: {NQ_IPIP_NEO_DATASET}")

Dataset saved in: ./datasets/refined/IPIP300-SCORES.csv.gz
