In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors

### Read the raw data and clean it

In [11]:
# Cleaning thresholds, kept in one place so they are easy to find and tune.
MAX_AGE = 18           # keep students aged MAX_AGE or younger
MIN_CLASS_SIZE = 10    # drop classes with fewer students than this
MAX_CLASS_SIZE = 100   # drop classes with more students than this

# dropna() removes every row that has a missing value in ANY column, so no
# additional notna() filters on SchoolID / grade / class are required.
df = pd.read_csv('./raw_data.csv').dropna()

# Classroom key of the form "<SchoolID>-<grade>-<class>".
df = df.assign(
    classID=df['SchoolID'].astype(int).astype(str)
    + '-' + df['grade'].astype(int).astype(str)
    + '-' + df['class'].astype(str)
)

df = df[df['age'] <= MAX_AGE]

# Class sizes are measured after the age filter. Dropping a whole class never
# changes the size of another class, so a single vectorized mask is equivalent
# to filtering on the upper bound and then on the lower bound.
class_sizes = df.groupby('classID')['classID'].transform('size')
df = df[class_sizes.between(MIN_CLASS_SIZE, MAX_CLASS_SIZE)]

# Columns used as features for the nearest-neighbour search.
numeric_vars = ['PMH', 'SWLS', 'BLS', 'siblings', 'bo', 'BMI', 'drink', 'smoke', 'shs', 'PEI', 'NEI', 'sfsc', 'sssc', 'OBSES', 'chat', 'ecf', 'sleep', 'rise', 'ST', 'sport', 'screen', 'game', 'video', 'book', 'friends', 'myopia']
num_classes = df['classID'].nunique()
num_students = len(df)

num_classes, num_students

(8556, 370237)

### Find each student's nearest neighbors within their class

In [12]:
# One object slot per class. Each slot is filled later with a 2-D array for
# that class: row labels of the neighbours (neighbor_idx) and their DEP
# values (neighbor_dep).
neighbor_idx, neighbor_dep = (np.empty(num_classes, dtype=object) for _ in range(2))

def standardize(df):
    """Min-max scale each column of ``df``.

    Every column has its minimum subtracted and is then divided by its range
    (max - min) plus a small constant, 0.00001. The constant keeps the
    division defined for constant columns, which therefore map to all zeros;
    it also means a column's maximum maps to slightly less than 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric columns to rescale.

    Returns
    -------
    pandas.DataFrame
        A frame with the same index and columns as ``df``.
    """
    lower = df.min()
    span = df.max() - lower + 0.00001
    return (df - lower) / span

# For every class, find each student's nearest classmates in the min-max
# scaled feature space and record, per student, one row laid out as
#   [student, neighbour_1, ..., neighbour_k]
# holding row labels (neighbor_idx) and DEP values (neighbor_dep).
df_by_class = df.groupby('classID')
for class_idx, (_, class_df) in enumerate(df_by_class):
    class_df_std = standardize(class_df[numeric_vars])
    # One neighbour per ten students in the class.
    num_neighbors = class_df.shape[0] // 10

    knn = NearestNeighbors()
    knn.fit(class_df_std)
    # Calling kneighbors() WITHOUT X returns the neighbours of each fitted
    # point excluding the point itself. Querying with the training data and
    # n_neighbors + 1 instead relies on column 0 being the query point, which
    # is not guaranteed when two students have identical feature rows.
    others = knn.kneighbors(n_neighbors=num_neighbors, return_distance=False)

    # Prepend each student's own position so column 0 is always the student.
    own = np.arange(class_df.shape[0]).reshape(-1, 1)
    positions = np.hstack([own, others])

    # Translate within-class positions to row labels / DEP values with plain
    # array indexing (no per-element Python calls).
    neighbor_idx[class_idx] = class_df.index.to_numpy()[positions]
    neighbor_dep[class_idx] = class_df['DEP'].to_numpy()[positions]

### generate local Moran's I

In [13]:
# Per-student local Moran statistic: the student's DEP deviation from the
# overall mean multiplied by the average deviation of their neighbours.
# NOTE(review): this is the unscaled product; the usual local Moran's I also
# divides by the variance of DEP — confirm which form is intended.
student_data = df.copy()
# Initialise as float so the float results below do not hit an int64 column.
student_data['local_moran'] = 0.0

indi_dep_mean = df['DEP'].mean()
for class_labels, class_deps in zip(neighbor_idx, neighbor_dep):
    # Column 0 is the student, the remaining columns are the neighbours.
    centered = class_deps - indi_dep_mean
    student_data.loc[class_labels[:, 0], 'local_moran'] = (
        centered[:, 0] * centered[:, 1:].mean(axis=1)
    )

In [16]:
# Quick sanity check of the local Moran column: (min, max, mean, std).
local_moran = student_data['local_moran']
local_moran.min(), local_moran.max(), local_moran.mean(), local_moran.std()

(-6.8118121039119215,
 20.063702768993817,
 0.5496989947679892,
 1.3740310043895567)

### generate classroom Moran's I

In [17]:
# Class-level table: per-class mean of every numeric column plus class size.
df_by_class = student_data.groupby('classID')
# numeric_only=True skips non-numeric columns; without it pandas >= 2.0 raises
# if any such column is present.
class_data = df_by_class.mean(numeric_only=True)
# size() is indexed by classID, so the assignment aligns on the index rather
# than relying on positional order.
class_data['class_size'] = df_by_class.size()

In [20]:
# Global Moran's I of DEP within each class, using the k-nearest-neighbour
# relation as binary weights:
#   I = (N / W) * sum_i sum_j w_ij (x_i - m)(x_j - m) / sum_i (x_i - m)^2
# with N students, W = N * k total weight and m the class mean of DEP.
# Classes are visited in the same (sorted classID) order used when
# neighbor_dep was filled, so class_idx addresses the matching slot.
class_moran = np.zeros(num_classes)
for class_idx, (_, class_df) in enumerate(df_by_class):
    dep = class_df['DEP']
    n_students = class_df.shape[0]
    class_mean = dep.mean()
    # Sum of squared deviations; the small constant avoids dividing by zero
    # when every student in the class has the same DEP.
    sq_dev = dep.var() * (n_students - 1) + 0.00001

    deps = neighbor_dep[class_idx]        # column 0: student, 1..k: neighbours
    n_neighbors = deps.shape[1] - 1
    weight_total = n_students * n_neighbors

    # Centre every value on the class mean, then sum over neighbours. The mean
    # must be subtracted once per NEIGHBOUR (k times), not once per column of
    # the class DataFrame.
    centered = deps - class_mean
    cross = (centered[:, 0] * centered[:, 1:].sum(axis=1)).sum()

    class_moran[class_idx] = (n_students / weight_total) * (cross / sq_dev)

class_moran[np.isnan(class_moran)] = 0
class_data['moran'] = class_moran

In [21]:
# Copy the classID index into a regular column.
class_data = class_data.assign(classID=class_data.index)

### save data with Moran's I

In [19]:
# Write the student-level and class-level tables to CSV without the index.
for frame, path in (
    (student_data, './student_data2.csv'),
    (class_data, './class_data2.csv'),
):
    frame.to_csv(path, index=False)