# Part 1: Get Reduced Feature List with LASSO

In [1]:
import pandas as pd
# Lift pandas' display truncation so that wide / long frames are rendered in full.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

# Regularisation strength handed to the Lasso model further down; it is also
# embedded in the file name of the exported coefficient CSV.
ALPHA = 0.5

In [3]:
# Read the cleaned student questionnaire table (tab-separated).
# Loading takes about 1 minute. The first column becomes the index and
# LANGTEST_PAQ is forced to object dtype.
df_student = pd.read_csv(
    '../cleaned_data/clean_STU_QQQ.tsv',
    sep='\t',
    header=0,
    index_col=0,
    dtype={"LANGTEST_PAQ": "object"},
)

# Sanity check: the total number of missing cells must equal the expected figure.
new_nan_values_count = df_student.isna().sum().sum()
assert new_nan_values_count == 34114739, f"Failed check having {new_nan_values_count} NaNs"

In [4]:
# Work on column *labels* only: `DataFrame.filter(...)` would materialise a full
# data copy of every matching column just to read its name.
all_columns = df_student.columns

# student weights (any label containing "W_") plus weighting / date-of-test / admin columns
weight_columns = all_columns[all_columns.str.contains("W_", na=False)]
admin_columns = ["SENWT", "VER_DAT", "CYC", "NatCen"]
# student test scores (any label containing "PV")
pv_columns = all_columns[all_columns.str.contains("PV", na=False)]

# Single drop -> a single copy of the frame. A missing admin column still raises KeyError.
df = df_student.drop(columns=[*weight_columns, *admin_columns, *pv_columns])

# add the PV1_ columns back (first plausible value of each domain, e.g. PV1MATH);
# they are appended after the remaining columns, joined on the row index
df_student = df.merge(
    df_student.filter(regex='^PV1[A-Z]+$'),
    how="inner",
    left_index=True,
    right_index=True,
)

df_student.shape

(613744, 1094)

In [5]:
# Drop schools with sample of less than 20 students
MIN_STUDENTS_PER_SCHOOL = 20
col = "CNTSCHID"
print("# schools:", len(df_student[col].unique()))

# Select CNTSTUID *before* aggregating: counting every column of the frame per
# school and then keeping a single one does the same work ~1000 times over.
temp = df_student.groupby(col)["CNTSTUID"].count() >= MIN_STUDENTS_PER_SCHOOL

# school ids whose number of (non-null) student ids reaches the threshold
big_school_samples = temp[temp].index.to_list()
print("# schools with sample of at least 20 students:", len(big_school_samples))

# keep only the students belonging to those schools
df_student = df_student[df_student[col].isin(big_school_samples)]
df_student.shape

# schools: 21629
# schools with sample of at least 20 students: 16123


(557097, 1094)

In [6]:
# Encode each country name (CNT) as an integer category code and store it in CNTRYID.
country_codes = df_student["CNT"].astype("category").cat.codes
df_student["CNTRYID"] = country_codes

# Persist the name -> code lookup table, ordered by code.
country_lookup = df_student.groupby("CNT")[["CNTRYID"]].min().sort_values("CNTRYID")
country_lookup.to_csv("../map/country_codes.csv")

## Split into Train & Val

In [7]:
from sklearn.model_selection import train_test_split

# PV1MATH is the regression target; every other column is a candidate feature.
y = df_student['PV1MATH'].to_numpy()
X = df_student.drop(columns=["PV1MATH"])

# Random 60/40 train/validation split with a fixed seed for reproducibility.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=0,
)

print(
    "Shape of Train Features: {}".format(X_train.shape),
    "Shape of Val Features: {}".format(X_val.shape),
    "Shape of Train Target: {}".format(y_train.shape),
    "Shape of Val Target: {}".format(y_val.shape),
    sep="\n",
)

Shape of Train Features: (334258, 1093)
Shape of Val Features: (222839, 1093)
Shape of Train Target: (334258,)
Shape of Val Target: (222839,)


In [8]:
# Peek at the string-typed (object) columns; the next cell drops them.
X_train.select_dtypes(include="object").head()

Unnamed: 0,CNT,STRATUM,SUBNATIO,REGION,LANGTEST_QQQ,LANGTEST_COG,LANGTEST_PAQ,BOOKID,ST250D06JA,ST250D07JA,ST251D08JA,ST251D09JA,ST330D10WA,OCOD1,OCOD2,OCOD3,PROGN,COBN_S,COBN_M,COBN_F,LANGN
276918,Kosovo,KSV - stratum 08: Large schools,Kosovo,Kosovo,Albanian,Albanian,,Form 9,Not Applicable,Not Applicable,Not Applicable,Not Applicable,Not Applicable,No response,No response,Invalid,Kosovo : Upper Secondary - Vocational,Missing,I don’t know.,Missing,Albanian
435054,Qatar,QAT - stratum 01: Government,Qatar,Qatar,Arabic,Arabic,,Form 65,Qatar :Office-Yes,Qatar :Cinema-No,Qatar :Digital video camera-One,Qatar :Video games console-One,Qatar : I researched for sponsors to complete ...,Education methods specialists,Sales and marketing managers,Fitness and recreation instructors and program...,Qatar : Upper Secondary Government,Another country (QAT),Another country (QAT),Another country (QAT),Arabic
348306,Malta,MLT - stratum 01: State school,Malta,Malta,English,Maltese,,Form 3,Not Applicable,Not Applicable,Malta :Smart TV with Internet access-None,Not Applicable,Not Applicable,Home-based personal care workers,Plasterers,Journalists,Malta : Secondary education (upper),Malta,Malta,Malta,Maltese
1747,Albania,ALB - stratum 05: Urban / South / Public,Albania,Albania,Albanian,Albanian,,Form 24,Not Applicable,Not Applicable,Not Applicable,Not Applicable,Not Applicable,"Doing housework, bringing up children",Bricklayers and related workers,Dentists,Albania : Upper secondary education,Albania,Albania,Albania,Albanian
208035,Germany,Undisclosed STRATUM - Germany,Germany,Germany,German,German,,Form 19,Germany :A desk to study at-Yes,Germany :A quiet place to study-Yes,Not Applicable,Not Applicable,Germany : I visited a vocational Information C...,Kitchen helpers,Buyers,Buyers,"Germany : Lower secondary, access to upper sec...",Germany,Another country (DEU),Germany,German


In [9]:
def _numeric_non_pv(frame):
    """Return ``frame`` without object-dtype columns and without columns matching ``PV.*``."""
    # drop object features
    numeric = frame.select_dtypes(exclude="object")
    # drop PV features (the remaining plausible-value score columns)
    return numeric.drop(columns=numeric.filter(regex="PV.*").columns)


X_train = _numeric_non_pv(X_train)
X_val = _numeric_non_pv(X_val)

print(
    "Shape of Train Features: {}".format(X_train.shape),
    "Shape of Val Features: {}".format(X_val.shape),
    "Shape of Train Target: {}".format(y_train.shape),
    "Shape of Val Target: {}".format(y_val.shape),
    sep="\n",
)

Shape of Train Features: (334258, 1062)
Shape of Val Features: (222839, 1062)
Shape of Train Target: (334258,)
Shape of Val Target: (222839,)


In [10]:
# Feature frames: written with header and index.
X_train.to_csv("../cleaned_data/X_train.csv")
X_val.to_csv("../cleaned_data/X_val.csv")

# Target vectors: plain text, one value per line.
np.savetxt("../cleaned_data/y_train.csv", y_train, delimiter=",")
np.savetxt("../cleaned_data/y_val.csv", y_val, delimiter=",")

In [11]:
# Number of training columns with zero standard deviation (constant features).
X_train.std().eq(0).sum()

np.int64(0)

In [12]:
# Per-column dtype and non-null count for every training feature.
X_train.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Index: 334258 entries, 276918 to 343045
Data columns (total 1062 columns):
 #     Column       Non-Null Count   Dtype  
---    ------       --------------   -----  
 0     CNTRYID      334258 non-null  int8   
 1     CNTSCHID     334258 non-null  float64
 2     CNTSTUID     334258 non-null  float64
 3     OECD         334258 non-null  int64  
 4     ADMINMODE    334258 non-null  int64  
 5     Option_CT    334258 non-null  int64  
 6     Option_FL    334258 non-null  int64  
 7     Option_ICTQ  334258 non-null  int64  
 8     Option_WBQ   334258 non-null  int64  
 9     Option_PQ    334258 non-null  int64  
 10    Option_TQ    334258 non-null  int64  
 11    Option_UH    334258 non-null  int64  
 12    ST001D01T    334258 non-null  int64  
 13    ST003D02T    334258 non-null  int64  
 14    ST003D03T    327198 non-null  float64
 15    ST004D01T    334258 non-null  int64  
 16    ST250Q01JA   334258 non-null  int64  
 17    ST250Q02JA   334258 non-n

## Data Processing

In [13]:
#  handle NaNs
from sklearn.impute import SimpleImputer

# Every NaN is replaced by the constant -1. The author was unsure whether this is
# legitimate; the aim is for LASSO to put less weight on columns with missing values.
# NOTE(review): -1 may coincide with a genuine value in some columns -- confirm.
imputer = SimpleImputer(
    missing_values=np.nan,
    strategy="constant",
    fill_value=-1,
    add_indicator=False,
)

# Fit on the training split only, then apply the same fill to both splits.
X_train = imputer.fit_transform(X_train)
X_val = imputer.transform(X_val)

print(
    "Shape of Train Features: {}".format(X_train.shape),
    "Shape of Val Features: {}".format(X_val.shape),
    "Shape of Train Target: {}".format(y_train.shape),
    "Shape of Val Target: {}".format(y_val.shape),
    sep="\n",
)



Shape of Train Features: (334258, 1062)
Shape of Val Features: (222839, 1062)
Shape of Train Target: (334258,)
Shape of Val Target: (222839,)


In [15]:
# scale data to 0 mean 1 var
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Statistics are learned from the training split only and reused for validation.
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

print(
    "Shape of Train Features: {}".format(X_train.shape),
    "Shape of Val Features: {}".format(X_val.shape),
    "Shape of Train Target: {}".format(y_train.shape),
    "Shape of Val Target: {}".format(y_val.shape),
    sep="\n",
)

Shape of Train Features: (334258, 1062)
Shape of Val Features: (222839, 1062)
Shape of Train Target: (334258,)
Shape of Val Target: (222839,)


In [16]:
# Eyeball the scaled training matrix (a NumPy array at this point, no longer a DataFrame).
X_train

array([[-0.06625226, -0.23656993, -0.23530156, ..., -0.28616594,
        -0.98654175, -0.32061971],
       [ 0.80265192,  0.75187211,  0.75635198, ..., -0.28616594,
        -0.98654175,  0.24560801],
       [ 0.14062969,  0.10603396,  0.10838913, ..., -0.28616594,
        -0.98654175, -0.36417569],
       ...,
       [-1.0592856 , -0.99267009, -0.99397977, ...,  1.14325211,
        -0.98654175,  0.81183574],
       [ 1.09228665,  1.03147458,  1.03704614, ..., -0.28616594,
        -0.98654175, -1.67085507],
       [ 0.0992533 ,  0.05878292,  0.06109283, ..., -0.28616594,
         0.88387153,  0.81183574]])

In [17]:
# Confirm the row / column counts of the training matrix before fitting.
X_train.shape

(334258, 1062)

In [18]:
from sklearn.linear_model import Lasso

# L1-regularised linear regression; ALPHA is the regularization strength (adjust as needed)
alpha = ALPHA

# Build the estimator and fit it to the training data in one step
lasso_reg = Lasso(alpha=alpha).fit(X_train, y_train)

# Report the fitted coefficients and the intercept
print(
    f"Coefficient: {lasso_reg.coef_}",
    f"Intercept: {lasso_reg.intercept_}",
    sep="\n",
)

Coefficient: [ 1.57005143  0.          0.         ... -0.         -0.
 -0.        ]
Intercept: 443.4480492703243


In [19]:
# One row per input feature (names as seen by the imputer), one column holding the
# fitted Lasso coefficient.
feature_names = list(imputer.get_feature_names_out())
coefs = pd.DataFrame(
    {"Coefficients importance": lasso_reg.coef_},
    index=feature_names,
)

# The output file name records the regularisation strength that was used.
coefs.to_csv(f"LASSO_coef_alpha_{ALPHA}_NO_PV.csv")

In [20]:
# Boolean column mask: True where LASSO kept the feature (non-zero coefficient).
kept_feature_mask = coefs["Coefficients importance"].ne(0).to_numpy()

# Apply the same column selection to both splits.
X_train = X_train[:, kept_feature_mask]
X_val = X_val[:, kept_feature_mask]

print(
    "Shape of Reduced Train Features: {}".format(X_train.shape),
    "Shape of Reduced Val Features: {}".format(X_val.shape),
    "Shape of Reduced Train Target: {}".format(y_train.shape),
    "Shape of Reduced Val Target: {}".format(y_val.shape),
    sep="\n",
)

Shape of Reduced Train Features: (334258, 384)
Shape of Reduced Val Features: (222839, 384)
Shape of Reduced Train Target: (334258,)
Shape of Reduced Val Target: (222839,)
