# Cleaning the dataset

In [None]:
# Installing required packages
!pip3 install pandas

In [None]:
#Importing libraries

import pandas as pd

### 1. Load datasets:

In [None]:
# Paths to the data

file_path_mh = "/Users/justina/Desktop/Data Science/data/sharew8_rel9-0-0_mh.dta" # Our main dataset
file_path_ep = "/Users/justina/Desktop/Data Science/data/sharew8_rel9-0-0_ep.dta" # Extra dataset from which the data is merged to the main one.
file_path_dn = "/Users/justina/Desktop/Data Science/data/sharew8_rel9-0-0_dn.dta" # Extra dataset from which the data is merged to the main one.

# Load as a pandas DataFrame

df_mh = pd.read_stata(file_path_mh) # mental health data set
df_ep = pd.read_stata(file_path_ep) # this data set consists of information on who is retired and who is not: df_ep["ep005_"]
df_dn = pd.read_stata(file_path_dn) # this data set consists of yeart of birth


In [128]:
# Select only the needed columns before merging
df_dn_sub = df_dn[["mergeid", "dn003_"]]
df_ep_sub = df_ep[["mergeid", "ep329_"]]
df_dn_gender = df_dn[["mergeid", "dn042_"]]
df_ret = df_ep[["mergeid", "ep005_"]]

# Start with df_mh and merge the two others
df_merged = (
    df_mh
    .merge(df_dn_sub, on="mergeid", how="left") # Year of birth
    .merge(df_ep_sub, on="mergeid", how="left") # Year of retirement
    .merge(df_dn_gender, on="mergeid", how="left") # Gender
    .merge(df_ret, on="mergeid", how="left") # status of retirement or not
)


print(df_merged.shape)

(53695, 31)


In [129]:
#Categorical to numeric
# Convert both to numeric (in case they're categorical)
df_merged['dn003_'] = pd.to_numeric(df_merged['dn003_'], errors='coerce')
df_merged['ep329_'] = pd.to_numeric(df_merged['ep329_'], errors='coerce')

# Map 'Male' to 0 and 'Female' to 1
df_merged['dn042_'] = df_merged['dn042_'].map({'Male': 0, 'Female': 1})

In [130]:
# Filtering data: only observations from retired participants

retirement = df_merged[df_merged["ep005_"] == "Retired"]

In [131]:
# Filtering data: only observations from respondents, and not proxies

retirement = retirement[retirement["mh032_"] == "Respondent"]

In [132]:
print(retirement.shape)

(33758, 31)


In [133]:
# Calculating the age of retirement

retirement['age_ret'] = retirement['ep329_'] - retirement['dn003_']
print(retirement['age_ret'].describe())


count    22603.000000
mean        59.186435
std          6.667533
min          8.000000
25%         56.000000
50%         60.000000
75%         63.000000
max         91.000000
Name: age_ret, dtype: float64


In [134]:
# Remove rows where 'age_ret' is NaN and overwrite df_merged_retired
retirement = retirement[retirement['age_ret'].notna()]

# Check shape to confirm
print(retirement.shape)

(22603, 32)


In [135]:
# Count number of NaN entries in 'age_ret'
num_nans = retirement['age_ret'].isna().sum()
print(f"Number of NaN values in age_ret: {num_nans}")

Number of NaN values in age_ret: 0


In [136]:
retirement.columns

Index(['mergeid', 'hhid8', 'mergeidp8', 'coupleid8', 'country', 'language',
       'mh002_', 'mh003_', 'mh004_', 'mh005_', 'mh006_', 'mh007_', 'mh008_',
       'mh009_', 'mh010_', 'mh011_', 'mh012_', 'mh013_', 'mh014_', 'mh015_',
       'mh016_', 'mh017_', 'mh032_', 'mh034_', 'mh035_', 'mh036_', 'mh037_',
       'dn003_', 'ep329_', 'dn042_', 'ep005_', 'age_ret'],
      dtype='object')

In [None]:
#retirement["mh002_"].isnull().sum() #0 empty rows
#retirement["mh003_"].isnull().sum() #0
#retirement["mh004_"].isnull().sum() #0
#retirement["mh005_"].isnull().sum() #0
#retirement["mh006_"].isnull().sum() #19360
#retirement["mh007_"].isnull().sum() #0
#retirement["mh008_"].isnull().sum() #0
#retirement["mh009_"].isnull().sum() #21822
#retirement["mh010_"].isnull().sum() #0
#retirement["mh011_"].isnull().sum() #0
#retirement["mh012_"].isnull().sum() #22191
#retirement["mh013_"].isnull().sum() #0
#retirement["mh014_"].isnull().sum() #0
#retirement["mh015_"].isnull().sum() #0
#retirement["mh016_"].isnull().sum() #0
#retirement["mh017_"].isnull().sum() #0
#retirement["mh032_"].isnull().sum() #0
#retirement["mh034_"].isnull().sum() #0
#retirement["mh035_"].isnull().sum() #0
#retirement["mh036_"].isnull().sum() #0
#retirement["mh037_"].isnull().sum() #0
#retirement["dn003_"].isnull().sum() #0
#retirement["ep329_"].isnull().sum() #0
#retirement["dn042_"].isnull().sum() #0
#retirement["ep005_"].isnull().sum() #0

np.int64(0)

In [137]:
retirement = retirement.drop(columns=["mh006_", "mh009_", "mh012_", "ep005_", "dn003_", "ep329_"])


In [140]:
retirement = retirement.drop(columns=['hhid8', 'mergeidp8', 'coupleid8','language'])

In [147]:
retirement = retirement.drop(columns=['mh032_'])

In [148]:
retirement.shape

(22603, 21)

In [149]:
# Save df_final to a CSV file
retirement.to_csv("/Users/justina/Desktop/Data Science/data/retirement_data.csv", index=False)

# Confirm
print("retirement has been saved successfully!")


retirement has been saved successfully!


## XGBoost

In [150]:
# [1] Separating features and target

X = retirement.drop("age_ret", axis=1).values  # replace 'target_column' with your actual target name
y = retirement["age_ret"].values

In [151]:
retirement

Unnamed: 0,mergeid,country,mh002_,mh003_,mh004_,mh005_,mh007_,mh008_,mh010_,mh011_,...,mh014_,mh015_,mh016_,mh017_,mh034_,mh035_,mh036_,mh037_,dn042_,age_ret
0,AT-001215-01,Austria,No,Any hopes mentioned,No such feelings,No such feelings,Trouble with sleep or recent change in pattern,No mention of loss of interest,No,No diminution in desire for food,...,No such difficulty mentioned,No such difficulty mentioned,Mentions any enjoyment from activity,No,Hardly ever or never,Hardly ever or never,Hardly ever or never,Hardly ever or never,1,45.0
4,AT-001881-02,Austria,No,No hopes mentioned,No such feelings,No such feelings,No trouble sleeping,No mention of loss of interest,No,No diminution in desire for food,...,No such difficulty mentioned,No such difficulty mentioned,Mentions any enjoyment from activity,No,Hardly ever or never,Hardly ever or never,Hardly ever or never,Hardly ever or never,0,60.0
8,AT-002525-01,Austria,No,Any hopes mentioned,No such feelings,No such feelings,No trouble sleeping,Non-specific or uncodeable response,No,No diminution in desire for food,...,Difficulty in concentrating,Difficulty in concentrating,Mentions any enjoyment from activity,No,Hardly ever or never,Hardly ever or never,Hardly ever or never,Hardly ever or never,1,59.0
9,AT-002525-02,Austria,No,Any hopes mentioned,No such feelings,No such feelings,No trouble sleeping,No mention of loss of interest,No,No diminution in desire for food,...,No such difficulty mentioned,No such difficulty mentioned,Mentions any enjoyment from activity,No,Hardly ever or never,Hardly ever or never,Hardly ever or never,Hardly ever or never,0,60.0
10,AT-003194-01,Austria,No,Any hopes mentioned,No such feelings,No such feelings,No trouble sleeping,No mention of loss of interest,No,No diminution in desire for food,...,No such difficulty mentioned,No such difficulty mentioned,Mentions any enjoyment from activity,No,Hardly ever or never,Hardly ever or never,Hardly ever or never,Hardly ever or never,0,58.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53686,SK-989881-02,Slovakia,No,Any hopes mentioned,No such feelings,No such feelings,No trouble sleeping,No mention of loss of interest,No,No diminution in desire for food,...,No such difficulty mentioned,No such difficulty mentioned,Mentions any enjoyment from activity,No,Some of the time,Often,Some of the time,Hardly ever or never,1,63.0
53689,SK-992332-01,Slovakia,No,No hopes mentioned,No such feelings,No such feelings,Trouble with sleep or recent change in pattern,No mention of loss of interest,No,No diminution in desire for food,...,No such difficulty mentioned,No such difficulty mentioned,Mentions any enjoyment from activity,No,Hardly ever or never,Hardly ever or never,Hardly ever or never,Hardly ever or never,0,60.0
53690,SK-992332-02,Slovakia,No,Any hopes mentioned,No such feelings,No such feelings,No trouble sleeping,No mention of loss of interest,No,No diminution in desire for food,...,No such difficulty mentioned,No such difficulty mentioned,Mentions any enjoyment from activity,No,Hardly ever or never,Hardly ever or never,Hardly ever or never,Hardly ever or never,1,55.0
53691,SK-993822-01,Slovakia,Yes,No hopes mentioned,No such feelings,No such feelings,No trouble sleeping,No mention of loss of interest,No,No diminution in desire for food,...,No such difficulty mentioned,No such difficulty mentioned,Fails to mention any enjoyable activity,Yes,Some of the time,Some of the time,Some of the time,Some of the time,1,57.0


In [None]:
# Generate a random dataset with 100 features
X, y = make_classification(n_samples=1000, n_features=100, n_informative=10, n_redundant=90, random_state=42)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create an XGBClassifier
model = XGBClassifier(n_estimators=100, learning_rate=0.1, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Get the feature importance scores
importance_scores = model.feature_importances_

# Select the top 10 most important features
selected_features = importance_scores.argsort()[-10:]

# Create a new XGBClassifier with the selected features
selected_model = XGBClassifier(n_estimators=100, learning_rate=0.1, random_state=42)

# Train the new model using only the selected features
selected_model.fit(X_train[:, selected_features], y_train)

# Evaluate the original model
y_pred = model.predict(X_test)
original_accuracy = accuracy_score(y_test, y_pred)

# Evaluate the model with selected features
y_pred_selected = selected_model.predict(X_test[:, selected_features])
selected_accuracy = accuracy_score(y_test, y_pred_selected)

print(f"Original Model Accuracy: {original_accuracy:.4f}")
print(f"Selected Features Model Accuracy: {selected_accuracy:.4f}")

# Using scikit-learn's SelectFromModel for feature selection
selector = SelectFromModel(model, prefit=True)
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)

# Train a new model using the selected features
selected_model_pipeline = XGBClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
selected_model_pipeline.fit(X_train_selected, y_train)

# Evaluate the model with selected features using the pipeline
y_pred_pipeline = selected_model_pipeline.predict(X_test_selected)
selected_accuracy_pipeline = accuracy_score(y_test, y_pred_pipeline)

print(f"Selected Features Model Accuracy (Pipeline): {selected_accuracy_pipeline:.4f}")