**Data Loading**

In [52]:
import pandas as pd
import os

# Define data path.
# The original machine-specific folder stays as the default so existing runs are
# unchanged; set the GROUPWORK_DATA_PATH environment variable to load the same
# CSV files from another location without editing this cell.
data_path = os.environ.get(
    'GROUPWORK_DATA_PATH',
    r'c:\Users\Jack\Desktop\GroupWork\Code&Data\Data\raw',
)

# Tables to load, in load order. Each one is read from '<name>.csv' in data_path.
dataset_names = [
    'assessments',
    'courses',
    'studentAssessment',
    'studentInfo',
    'studentRegistration',
    'studentVle',
    'vle',
]

# Read every table once and report its shape as soon as it has been loaded.
datasets = {}
for name in dataset_names:
    frame = pd.read_csv(os.path.join(data_path, f'{name}.csv'))
    datasets[name] = frame
    print(f"{name}: {frame.shape[0]:,} rows x {frame.shape[1]} columns")

# Keep the per-table variable names that the following cells rely on.
assessments = datasets['assessments']
courses = datasets['courses']
studentAssessment = datasets['studentAssessment']
studentInfo = datasets['studentInfo']
studentRegistration = datasets['studentRegistration']
studentVle = datasets['studentVle']
vle = datasets['vle']

# Summary: names are left-aligned to 19 characters, the length of the longest
# table name ('studentRegistration'), so the colons line up.
print("\nDataset Summary:")
for name, frame in datasets.items():
    print(f"  {name:<19}: {frame.shape[0]:>10,} rows x {frame.shape[1]} columns")

assessments: 206 rows x 6 columns
courses: 22 rows x 3 columns
studentAssessment: 173,912 rows x 5 columns
studentInfo: 32,593 rows x 12 columns
studentRegistration: 32,593 rows x 5 columns
studentVle: 10,655,280 rows x 6 columns
vle: 6,364 rows x 6 columns

Dataset Summary:
  assessments        :        206 rows x 6 columns
  courses            :         22 rows x 3 columns
  studentAssessment  :    173,912 rows x 5 columns
  studentInfo        :     32,593 rows x 12 columns
  studentRegistration:     32,593 rows x 5 columns
  studentVle         : 10,655,280 rows x 6 columns
  vle                :      6,364 rows x 6 columns


**Data Understanding for each dataset**

In [53]:
# assessments - Detailed Data Understanding
# Seven-part profile of the `assessments` table: size, dtypes, sample rows,
# missing values, numeric summary, categorical summary and basic quality checks.
print("ASSESSMENTS - Detailed Data Understanding")

# [1] Table size
print("\n[1. Data Scale]")
print(f"  Rows: {len(assessments):,}")
print(f"  Columns: {len(assessments.columns)}")

# [2] Column dtypes rendered as plain text
print("\n[2. Column Names and Data Types]")
print(assessments.dtypes.to_string())

# [3] Sample rows
print("\n[3. First 5 Rows]")
display(assessments.head())

# [4] Per-column null counts and their share of all rows
print("\n[4. Missing Value Analysis]")
missing = assessments.isna().sum()
missing_pct = missing.div(len(assessments)).mul(100).round(2)
if missing.sum() == 0:
    print("  No missing values")
else:
    missing_df = pd.concat(
        [missing.rename('Missing Count'), missing_pct.rename('Percentage %')],
        axis=1,
    )
    # Show only the columns that actually contain nulls
    display(missing_df[missing_df['Missing Count'] > 0])

# [5] describe() over the int64 / float64 columns
print("\n[5. Numeric Column Statistics]")
numeric_cols = assessments.select_dtypes(include=['int64', 'float64']).columns
if numeric_cols.empty:
    print("  No numeric columns")
else:
    display(assessments[numeric_cols].describe().round(2))

# [6] Unique counts for object columns; value counts only when there are at most 10 values
print("\n[6. Categorical Column Analysis]")
cat_cols = assessments.select_dtypes(include=['object']).columns
for col in cat_cols:
    n_unique = assessments[col].nunique()
    print(f"  {col}: {n_unique} unique values")
    if n_unique <= 10:
        print(f"    Distribution: {assessments[col].value_counts().to_dict()}")

# [7] Fully duplicated rows, distinct assessment ids, deep memory footprint in MB
print("\n[7. Data Quality Check]")
print(f"  Duplicate Rows: {assessments.duplicated().sum()}")
print(f"  Unique Assessments: {assessments['id_assessment'].nunique()}")
print(f"  Memory Usage: {assessments.memory_usage(deep=True).sum() / (1024 * 1024):.2f} MB")


ASSESSMENTS - Detailed Data Understanding

[1. Data Scale]
  Rows: 206
  Columns: 6

[2. Column Names and Data Types]
code_module           object
code_presentation     object
id_assessment          int64
assessment_type       object
date                 float64
weight               float64

[3. First 5 Rows]


Unnamed: 0,code_module,code_presentation,id_assessment,assessment_type,date,weight
0,AAA,2013J,1752,TMA,19.0,10.0
1,AAA,2013J,1753,TMA,54.0,20.0
2,AAA,2013J,1754,TMA,117.0,20.0
3,AAA,2013J,1755,TMA,166.0,20.0
4,AAA,2013J,1756,TMA,215.0,30.0



[4. Missing Value Analysis]


Unnamed: 0,Missing Count,Percentage %
date,11,5.34



[5. Numeric Column Statistics]


Unnamed: 0,id_assessment,date,weight
count,206.0,195.0,206.0
mean,26473.98,145.01,20.87
std,10098.63,76.0,30.38
min,1752.0,12.0,0.0
25%,15023.25,71.0,0.0
50%,25364.5,152.0,12.5
75%,34891.75,222.0,24.25
max,40088.0,261.0,100.0



[6. Categorical Column Analysis]
  code_module: 7 unique values
    Distribution: {'FFF': 52, 'BBB': 42, 'DDD': 35, 'GGG': 30, 'CCC': 20, 'EEE': 15, 'AAA': 12}
  code_presentation: 4 unique values
    Distribution: {'2014J': 57, '2014B': 57, '2013J': 53, '2013B': 39}
  assessment_type: 3 unique values
    Distribution: {'TMA': 106, 'CMA': 76, 'Exam': 24}

[7. Data Quality Check]
  Duplicate Rows: 0
  Unique Assessments: 206
  Memory Usage: 0.04 MB


### Data Understanding — Assessments 表问题总结
**1. 完整性**

* `date` 缺失 11 条（5.34%）
  可能影响时序分析，建议填补而非删除

**2. 类型与语义**

* `date` 为 `float`（整数列因含缺失值被 pandas 读为浮点），取值看起来是相对课程开始的天数而非日历日期
  → 建议处理缺失后转 `int`，或直接使用可空整数类型 `Int64`；不宜转为 `datetime`
* `id_assessment` 为 ID，不应作为数值特征参与建模

**3. 数值异常**

* `weight` 存在大量 0（25% 分位数为 0）
  → 需确认是否合理或应视为缺失
* `weight` 最大值 100 合理，但需验证每门课程权重总和是否为 100

**4. 分布问题**

* `assessment_type` 不均衡（Exam 样本最少）
* `code_module` 各模块样本量差异较大

**5. 一致性风险**

建议检查：

* 每门课程是否都有 Exam
* 同一课程下权重总和是否为 100

In [54]:
# courses - Detailed Data Understanding
# Seven-part profile of the `courses` table: size, dtypes, sample rows,
# missing values, numeric summary, categorical summary and basic quality checks.
print("COURSES - Detailed Data Understanding")

# [1] Table size
print("\n[1. Data Scale]")
print(f"  Rows: {len(courses):,}")
print(f"  Columns: {len(courses.columns)}")

# [2] Column dtypes rendered as plain text
print("\n[2. Column Names and Data Types]")
print(courses.dtypes.to_string())

# [3] Sample rows
print("\n[3. First 5 Rows]")
display(courses.head())

# [4] Per-column null counts; the percentage is only computed when nulls exist
print("\n[4. Missing Value Analysis]")
missing = courses.isna().sum()
if missing.sum() == 0:
    print("  No missing values")
else:
    missing_pct = missing.div(len(courses)).mul(100).round(2)
    missing_df = pd.concat(
        [missing.rename('Missing Count'), missing_pct.rename('Percentage %')],
        axis=1,
    )
    # Show only the columns that actually contain nulls
    display(missing_df[missing_df['Missing Count'] > 0])

# [5] describe() over the int64 / float64 columns
print("\n[5. Numeric Column Statistics]")
numeric_cols = courses.select_dtypes(include=['int64', 'float64']).columns
if numeric_cols.empty:
    print("  No numeric columns")
else:
    display(courses[numeric_cols].describe().round(2))

# [6] Unique counts for object columns; value counts only when there are at most 10 values
print("\n[6. Categorical Column Analysis]")
cat_cols = courses.select_dtypes(include=['object']).columns
for col in cat_cols:
    n_unique = courses[col].nunique()
    print(f"  {col}: {n_unique} unique values")
    if n_unique <= 10:
        print(f"    Distribution: {courses[col].value_counts().to_dict()}")

# [7] Fully duplicated rows and deep memory footprint in MB
print("\n[7. Data Quality Check]")
print(f"  Duplicate Rows: {courses.duplicated().sum()}")
print(f"  Memory Usage: {courses.memory_usage(deep=True).sum() / (1024 * 1024):.2f} MB")


COURSES - Detailed Data Understanding

[1. Data Scale]
  Rows: 22
  Columns: 3

[2. Column Names and Data Types]
code_module                   object
code_presentation             object
module_presentation_length     int64

[3. First 5 Rows]


Unnamed: 0,code_module,code_presentation,module_presentation_length
0,AAA,2013J,268
1,AAA,2014J,269
2,BBB,2013J,268
3,BBB,2014J,262
4,BBB,2013B,240



[4. Missing Value Analysis]
  No missing values

[5. Numeric Column Statistics]


Unnamed: 0,module_presentation_length
count,22.0
mean,255.55
std,13.65
min,234.0
25%,241.0
50%,261.5
75%,268.0
max,269.0



[6. Categorical Column Analysis]
  code_module: 7 unique values
    Distribution: {'BBB': 4, 'DDD': 4, 'FFF': 4, 'EEE': 3, 'GGG': 3, 'AAA': 2, 'CCC': 2}
  code_presentation: 4 unique values
    Distribution: {'2014J': 7, '2013J': 6, '2014B': 6, '2013B': 3}

[7. Data Quality Check]
  Duplicate Rows: 0
  Memory Usage: 0.00 MB


### Data Understanding — Courses 表问题总结

**1. 完整性**

* 无缺失值
  → 数据完整性良好

**2. 类型与语义**

* `module_presentation_length` 为数值型，语义合理
* `code_module`、`code_presentation` 为类别变量，类型正确

**3. 数值分布**

* 课程周期范围：234–269 天
* 标准差 13.65
  → 波动较小，无明显异常值

**4. 分布问题**

* 不同 module 数量分布不均（如 AAA、CCC 较少）
* 不同 presentation 数量略有差异（2013B 最少）

**5. 数据质量**

* 无重复值
* 结构清晰，可直接用于后续建模或关联分析


In [55]:
# studentAssessment - Detailed Data Understanding
# Seven-part profile of the `studentAssessment` table: size, dtypes, sample rows,
# missing values, numeric summary, categorical summary and basic quality checks.
print("STUDENTASSESSMENT - Detailed Data Understanding")

# [1] Table size
print("\n[1. Data Scale]")
print(f"  Rows: {len(studentAssessment):,}")
print(f"  Columns: {len(studentAssessment.columns)}")

# [2] Column dtypes rendered as plain text
print("\n[2. Column Names and Data Types]")
print(studentAssessment.dtypes.to_string())

# [3] Sample rows
print("\n[3. First 5 Rows]")
display(studentAssessment.head())

# [4] Per-column null counts and their share of all rows
print("\n[4. Missing Value Analysis]")
missing = studentAssessment.isna().sum()
missing_pct = missing.div(len(studentAssessment)).mul(100).round(2)
if missing.sum() == 0:
    print("  No missing values")
else:
    missing_df = pd.concat(
        [missing.rename('Missing Count'), missing_pct.rename('Percentage %')],
        axis=1,
    )
    # Show only the columns that actually contain nulls
    display(missing_df[missing_df['Missing Count'] > 0])

# [5] describe() over the int64 / float64 columns
print("\n[5. Numeric Column Statistics]")
numeric_cols = studentAssessment.select_dtypes(include=['int64', 'float64']).columns
if numeric_cols.empty:
    print("  No numeric columns")
else:
    display(studentAssessment[numeric_cols].describe().round(2))

# [6] Unique counts for object columns, or a notice when the table has none
print("\n[6. Categorical Column Analysis]")
cat_cols = studentAssessment.select_dtypes(include=['object']).columns
if cat_cols.empty:
    print("  No categorical columns")
else:
    for col in cat_cols:
        print(f"  {col}: {studentAssessment[col].nunique()} unique values")

# [7] Fully duplicated rows, distinct student / assessment ids, deep memory footprint in MB
print("\n[7. Data Quality Check]")
print(f"  Duplicate Rows: {studentAssessment.duplicated().sum()}")
print(f"  Unique Students: {studentAssessment['id_student'].nunique():,}")
print(f"  Unique Assessments: {studentAssessment['id_assessment'].nunique()}")
print(f"  Memory Usage: {studentAssessment.memory_usage(deep=True).sum() / (1024 * 1024):.2f} MB")


STUDENTASSESSMENT - Detailed Data Understanding

[1. Data Scale]
  Rows: 173,912
  Columns: 5

[2. Column Names and Data Types]
id_assessment       int64
id_student          int64
date_submitted      int64
is_banked           int64
score             float64

[3. First 5 Rows]


Unnamed: 0,id_assessment,id_student,date_submitted,is_banked,score
0,1752,11391,18,0,78.0
1,1752,28400,22,0,70.0
2,1752,31604,17,0,72.0
3,1752,32885,26,0,69.0
4,1752,38053,19,0,79.0



[4. Missing Value Analysis]


Unnamed: 0,Missing Count,Percentage %
score,173,0.1



[5. Numeric Column Statistics]


Unnamed: 0,id_assessment,id_student,date_submitted,is_banked,score
count,173912.0,173912.0,173912.0,173912.0,173739.0
mean,26553.8,705150.72,116.03,0.01,75.8
std,8829.78,552395.19,71.48,0.1,18.8
min,1752.0,6516.0,-11.0,0.0,0.0
25%,15022.0,504429.0,51.0,0.0,65.0
50%,25359.0,585208.0,116.0,0.0,80.0
75%,34883.0,634498.0,173.0,0.0,90.0
max,37443.0,2698588.0,608.0,1.0,100.0



[6. Categorical Column Analysis]
  No categorical columns

[7. Data Quality Check]
  Duplicate Rows: 0
  Unique Students: 23,369
  Unique Assessments: 188
  Memory Usage: 6.63 MB


### Data Understanding — StudentAssessment 表问题总结

**1. 完整性**

* `score` 缺失 173 条（0.1%）
  → 缺失比例极低，可直接删除或简单填补

**2. 类型与语义**

* `id_assessment`、`id_student` 为标识符
  → 不应作为连续数值特征
* `date_submitted` 为数值型时间变量
  → 建议明确单位（天数）后使用

**3. 数值异常**

* `date_submitted` 最小值为 **-11**
  → 存在异常（提前提交或数据错误）
* `date_submitted` 最大值为 **608**，明显超过课程最长周期（269 天）
  → 可能为延迟提交或记录错误，需结合 `assessments.date` 核查
* `score` 范围 0–100
  → 合理，但需关注 0 分是否为缺考或异常

**4. 分布特征**

* `is_banked` 几乎全为 0（均值 0.01）
  → 该特征信息量极低，建模价值有限

**5. 数据质量**

* 无重复记录
* 学生数：23,369
* assessment 数：188
* 数据规模较大，适合做统计与建模分析

In [56]:
# studentInfo - Detailed Data Understanding
# Seven-part profile of the `studentInfo` table: size, dtypes, sample rows,
# missing values, numeric summary, categorical summary and basic quality checks.
print("STUDENTINFO - Detailed Data Understanding")

# [1] Table size
print("\n[1. Data Scale]")
print(f"  Rows: {len(studentInfo):,}")
print(f"  Columns: {len(studentInfo.columns)}")

# [2] Column dtypes rendered as plain text
print("\n[2. Column Names and Data Types]")
print(studentInfo.dtypes.to_string())

# [3] Sample rows
print("\n[3. First 5 Rows]")
display(studentInfo.head())

# [4] Per-column null counts and their share of all rows
print("\n[4. Missing Value Analysis]")
missing = studentInfo.isna().sum()
missing_pct = missing.div(len(studentInfo)).mul(100).round(2)
if missing.sum() == 0:
    print("  No missing values")
else:
    missing_df = pd.concat(
        [missing.rename('Missing Count'), missing_pct.rename('Percentage %')],
        axis=1,
    )
    # Show only the columns that actually contain nulls
    display(missing_df[missing_df['Missing Count'] > 0])

# [5] describe() over the int64 / float64 columns
print("\n[5. Numeric Column Statistics]")
numeric_cols = studentInfo.select_dtypes(include=['int64', 'float64']).columns
if numeric_cols.empty:
    print("  No numeric columns")
else:
    display(studentInfo[numeric_cols].describe().round(2))

# [6] Unique counts for object columns; value counts only when there are at most 10 values
print("\n[6. Categorical Column Analysis]")
cat_cols = studentInfo.select_dtypes(include=['object']).columns
for col in cat_cols:
    n_unique = studentInfo[col].nunique()
    print(f"  {col}: {n_unique} unique values")
    if n_unique <= 10:
        print(f"    Distribution: {studentInfo[col].value_counts().to_dict()}")

# [7] Fully duplicated rows, distinct student ids, deep memory footprint in MB
print("\n[7. Data Quality Check]")
print(f"  Duplicate Rows: {studentInfo.duplicated().sum()}")
print(f"  Unique Students: {studentInfo['id_student'].nunique():,}")
print(f"  Memory Usage: {studentInfo.memory_usage(deep=True).sum() / (1024 * 1024):.2f} MB")


STUDENTINFO - Detailed Data Understanding

[1. Data Scale]
  Rows: 32,593
  Columns: 12

[2. Column Names and Data Types]
code_module             object
code_presentation       object
id_student               int64
gender                  object
region                  object
highest_education       object
imd_band                object
age_band                object
num_of_prev_attempts     int64
studied_credits          int64
disability              object
final_result            object

[3. First 5 Rows]


Unnamed: 0,code_module,code_presentation,id_student,gender,region,highest_education,imd_band,age_band,num_of_prev_attempts,studied_credits,disability,final_result
0,AAA,2013J,11391,M,East Anglian Region,HE Qualification,90-100%,55<=,0,240,N,Pass
1,AAA,2013J,28400,F,Scotland,HE Qualification,20-30%,35-55,0,60,N,Pass
2,AAA,2013J,30268,F,North Western Region,A Level or Equivalent,30-40%,35-55,0,60,Y,Withdrawn
3,AAA,2013J,31604,F,South East Region,A Level or Equivalent,50-60%,35-55,0,60,N,Pass
4,AAA,2013J,32885,F,West Midlands Region,Lower Than A Level,50-60%,0-35,0,60,N,Pass



[4. Missing Value Analysis]


Unnamed: 0,Missing Count,Percentage %
imd_band,1111,3.41



[5. Numeric Column Statistics]


Unnamed: 0,id_student,num_of_prev_attempts,studied_credits
count,32593.0,32593.0,32593.0
mean,706687.67,0.16,79.76
std,549167.31,0.48,41.07
min,3733.0,0.0,30.0
25%,508573.0,0.0,60.0
50%,590310.0,0.0,60.0
75%,644453.0,0.0,120.0
max,2716795.0,6.0,655.0



[6. Categorical Column Analysis]
  code_module: 7 unique values
    Distribution: {'BBB': 7909, 'FFF': 7762, 'DDD': 6272, 'CCC': 4434, 'EEE': 2934, 'GGG': 2534, 'AAA': 748}
  code_presentation: 4 unique values
    Distribution: {'2014J': 11260, '2013J': 8845, '2014B': 7804, '2013B': 4684}
  gender: 2 unique values
    Distribution: {'M': 17875, 'F': 14718}
  region: 13 unique values
  highest_education: 5 unique values
    Distribution: {'A Level or Equivalent': 14045, 'Lower Than A Level': 13158, 'HE Qualification': 4730, 'No Formal quals': 347, 'Post Graduate Qualification': 313}
  imd_band: 10 unique values
    Distribution: {'20-30%': 3654, '30-40%': 3539, '10-20': 3516, '0-10%': 3311, '40-50%': 3256, '50-60%': 3124, '60-70%': 2905, '70-80%': 2879, '80-90%': 2762, '90-100%': 2536}
  age_band: 3 unique values
    Distribution: {'0-35': 22944, '35-55': 9433, '55<=': 216}
  disability: 2 unique values
    Distribution: {'N': 29429, 'Y': 3164}
  final_result: 4 unique values
    Distr

### Data Understanding — StudentInfo 表问题总结

**1. 完整性（Completeness）**

* `imd_band` 存在缺失：1111 条（3.41%）
  → 建议将缺失作为单独类别（`Unknown`），避免信息损失

**2. 类型与语义（Validity）**

* `id_student` 为标识符
  → 不应作为数值特征
* 其余字段类型与语义基本一致，但 `imd_band` 的取值标签不统一（`10-20` 缺少 `%`，其余均为 `xx-xx%` 形式）
  → 建议统一为 `10-20%`

**3. 数值字段问题（Numeric）**

* `num_of_prev_attempts` 极度偏斜（75% 分位数为 0，即至少 75% 的记录取值为 0）
  → 可能需离散化（0 vs ≥1）
* `studied_credits` 存在极端值（最大 655）
  → 需检查是否为异常或跨课程累积

**4. 分布问题（整体层面）**

* module 与 presentation 分布不均
  → 课程与批次效应可能干扰分析

**5. 数据质量**

* 无重复记录
* 数据规模充足，适合建模分析


**6. 类别变量专项总结（Categorical-focused）**

**核心结论：多个类别字段存在“长尾 + 不均衡 + 高基数”问题，直接 one-hot 可能引入噪声与不稳定性。**

### 主要风险点

* 极小类别（不稳定）：

  * `AAA` module
  * `55<=` age_band
  * `Post Graduate Qualification` / `No Formal quals`
  * `Distinction`
* 明显不均衡：

  * `final_result`
  * `disability`
  * `age_band`
* 高基数：

  * `region`（13 类）
  * `imd_band`（10 类）

### 建议处理策略

* 合并长尾类别 → Other
* 小样本类别在报告中明确说明局限性
* 建模时使用 class weight 或分层采样
* `imd_band` 缺失单独建类
* 检查 `region`、`imd_band`、`highest_education` 的相关性，防止信息冗余

In [57]:
# studentRegistration - Detailed Data Understanding
# Seven-part profile of the `studentRegistration` table: size, dtypes, sample rows,
# missing values, numeric summary, categorical summary and basic quality checks.
print("STUDENTREGISTRATION - Detailed Data Understanding")

# [1] Table size
print("\n[1. Data Scale]")
print(f"  Rows: {len(studentRegistration):,}")
print(f"  Columns: {len(studentRegistration.columns)}")

# [2] Column dtypes rendered as plain text
print("\n[2. Column Names and Data Types]")
print(studentRegistration.dtypes.to_string())

# [3] Sample rows
print("\n[3. First 5 Rows]")
display(studentRegistration.head())

# [4] Per-column null counts and their share of all rows
print("\n[4. Missing Value Analysis]")
missing = studentRegistration.isna().sum()
missing_pct = missing.div(len(studentRegistration)).mul(100).round(2)
if missing.sum() == 0:
    print("  No missing values")
else:
    missing_df = pd.concat(
        [missing.rename('Missing Count'), missing_pct.rename('Percentage %')],
        axis=1,
    )
    # Show only the columns that actually contain nulls
    display(missing_df[missing_df['Missing Count'] > 0])

# [5] describe() over the int64 / float64 columns
print("\n[5. Numeric Column Statistics]")
numeric_cols = studentRegistration.select_dtypes(include=['int64', 'float64']).columns
if numeric_cols.empty:
    print("  No numeric columns")
else:
    display(studentRegistration[numeric_cols].describe().round(2))

# [6] Unique counts for object columns; value counts only when there are at most 10 values
print("\n[6. Categorical Column Analysis]")
cat_cols = studentRegistration.select_dtypes(include=['object']).columns
for col in cat_cols:
    n_unique = studentRegistration[col].nunique()
    print(f"  {col}: {n_unique} unique values")
    if n_unique <= 10:
        print(f"    Distribution: {studentRegistration[col].value_counts().to_dict()}")

# [7] Fully duplicated rows, distinct student ids, deep memory footprint in MB
print("\n[7. Data Quality Check]")
print(f"  Duplicate Rows: {studentRegistration.duplicated().sum()}")
print(f"  Unique Students: {studentRegistration['id_student'].nunique():,}")
print(f"  Memory Usage: {studentRegistration.memory_usage(deep=True).sum() / (1024 * 1024):.2f} MB")


STUDENTREGISTRATION - Detailed Data Understanding

[1. Data Scale]
  Rows: 32,593
  Columns: 5

[2. Column Names and Data Types]
code_module             object
code_presentation       object
id_student               int64
date_registration      float64
date_unregistration    float64

[3. First 5 Rows]


Unnamed: 0,code_module,code_presentation,id_student,date_registration,date_unregistration
0,AAA,2013J,11391,-159.0,
1,AAA,2013J,28400,-53.0,
2,AAA,2013J,30268,-92.0,12.0
3,AAA,2013J,31604,-52.0,
4,AAA,2013J,32885,-176.0,



[4. Missing Value Analysis]


Unnamed: 0,Missing Count,Percentage %
date_registration,45,0.14
date_unregistration,22521,69.1



[5. Numeric Column Statistics]


Unnamed: 0,id_student,date_registration,date_unregistration
count,32593.0,32548.0,10072.0
mean,706687.67,-69.41,49.76
std,549167.31,49.26,82.46
min,3733.0,-322.0,-365.0
25%,508573.0,-100.0,-2.0
50%,590310.0,-57.0,27.0
75%,644453.0,-29.0,109.0
max,2716795.0,167.0,444.0



[6. Categorical Column Analysis]
  code_module: 7 unique values
    Distribution: {'BBB': 7909, 'FFF': 7762, 'DDD': 6272, 'CCC': 4434, 'EEE': 2934, 'GGG': 2534, 'AAA': 748}
  code_presentation: 4 unique values
    Distribution: {'2014J': 11260, '2013J': 8845, '2014B': 7804, '2013B': 4684}

[7. Data Quality Check]
  Duplicate Rows: 0
  Unique Students: 28,785
  Memory Usage: 4.04 MB


### Data Understanding — StudentRegistration 表问题总结

**1. 完整性（Completeness）**

* `date_registration` 缺失 45 条（0.14%）
  → 比例极低，可直接删除或简单填补
* `date_unregistration` 缺失 **69.10%**
  → 很可能代表“未退课学生”，属于**有业务含义的缺失**，不应简单填补

**2. 类型与语义（Validity）**

* `id_student` 为标识符
  → 不作为数值特征
* `date_registration` / `date_unregistration` 为时间变量
  → 建议后续转为相对天数差或持续时间特征

**3. 数值异常与逻辑问题（Critical）**

* `date_registration` 大量为负数（min = -322）
  → 表示在课程开始前注册，语义合理
* `date_unregistration` 存在负值（min = -365）
  → 表示课程开始前即退课，需要确认是否合理
* `date_unregistration` 最大值 444
  → 存在非常晚退课行为，可能为异常或延迟记录

> 这一表最重要的问题：**时间变量的业务解释必须明确，否则后续特征工程会出错。**

**4. 分布问题**

* module 与 presentation 分布仍然不均
  → 与 StudentInfo 表一致，说明不是抽样问题而是原始结构如此

**5. 数据质量**

* 无重复记录
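* 28,785 名唯一学生，少于 32,593 行记录
  → 同一学生可对应多条注册记录（不同 module / presentation），关联时应使用 `(code_module, code_presentation, id_student)` 作为键，而非仅用 `id_student`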
* 数据规模良好，可与 StudentInfo / Assessment 关联建模

**核心结论**

> StudentRegistration 最大风险不在“缺失”，而在于**时间变量的业务语义与边界合理性需要验证**。
> 它是后续构建「注册时长」「是否提前退课」「学习坚持度」等核心特征的关键数据源。

In [58]:
# studentVle - Detailed Data Understanding
# Seven-part profile of the `studentVle` table: size, dtypes, sample rows,
# missing values, numeric summary, categorical summary and basic quality checks.
print("STUDENTVLE - Detailed Data Understanding")

# [1] Table size
print("\n[1. Data Scale]")
print(f"  Rows: {len(studentVle):,}")
print(f"  Columns: {len(studentVle.columns)}")

# [2] Column dtypes rendered as plain text
print("\n[2. Column Names and Data Types]")
print(studentVle.dtypes.to_string())

# [3] Sample rows
print("\n[3. First 5 Rows]")
display(studentVle.head())

# [4] Per-column null counts and their share of all rows
print("\n[4. Missing Value Analysis]")
missing = studentVle.isna().sum()
missing_pct = missing.div(len(studentVle)).mul(100).round(2)
if missing.sum() == 0:
    print("  No missing values")
else:
    missing_df = pd.concat(
        [missing.rename('Missing Count'), missing_pct.rename('Percentage %')],
        axis=1,
    )
    # Show only the columns that actually contain nulls
    display(missing_df[missing_df['Missing Count'] > 0])

# [5] describe() over the int64 / float64 columns
print("\n[5. Numeric Column Statistics]")
numeric_cols = studentVle.select_dtypes(include=['int64', 'float64']).columns
if numeric_cols.empty:
    print("  No numeric columns")
else:
    display(studentVle[numeric_cols].describe().round(2))

# [6] Unique counts for object columns (no value-count breakdown in this cell)
print("\n[6. Categorical Column Analysis]")
cat_cols = studentVle.select_dtypes(include=['object']).columns
for col in cat_cols:
    print(f"  {col}: {studentVle[col].nunique()} unique values")

# [7] Fully duplicated rows, distinct student ids, deep memory footprint in MB
print("\n[7. Data Quality Check]")
print(f"  Duplicate Rows: {studentVle.duplicated().sum():,}")
print(f"  Unique Students: {studentVle['id_student'].nunique():,}")
print(f"  Memory Usage: {studentVle.memory_usage(deep=True).sum() / (1024 * 1024):.2f} MB")


STUDENTVLE - Detailed Data Understanding

[1. Data Scale]
  Rows: 10,655,280
  Columns: 6

[2. Column Names and Data Types]
code_module          object
code_presentation    object
id_student            int64
id_site               int64
date                  int64
sum_click             int64

[3. First 5 Rows]


Unnamed: 0,code_module,code_presentation,id_student,id_site,date,sum_click
0,AAA,2013J,28400,546652,-10,4
1,AAA,2013J,28400,546652,-10,1
2,AAA,2013J,28400,546652,-10,1
3,AAA,2013J,28400,546614,-10,11
4,AAA,2013J,28400,546714,-10,1



[4. Missing Value Analysis]
  No missing values

[5. Numeric Column Statistics]


Unnamed: 0,id_student,id_site,date,sum_click
count,10655280.0,10655280.0,10655280.0,10655280.0
mean,733333.57,738323.42,95.17,3.72
std,582705.98,131219.62,76.07,8.85
min,6516.0,526721.0,-25.0,1.0
25%,507743.0,673519.0,25.0,1.0
50%,588236.0,730069.0,86.0,2.0
75%,646484.0,877030.0,156.0,3.0
max,2698588.0,1049562.0,269.0,6977.0



[6. Categorical Column Analysis]
  code_module: 7 unique values
  code_presentation: 4 unique values

[7. Data Quality Check]
  Duplicate Rows: 787,170
  Unique Students: 26,074
  Memory Usage: 1402.31 MB


### Data Understanding — StudentVLE 表问题总结

**1. 数据规模（Scalability Risk）**

* 行数：**10,655,280**
* 内存占用：**1402 MB**
  → 数据量极大，后续分析需先做**聚合（aggregation）**，否则难以直接建模

**2. 完整性（Completeness）**

* 无缺失值
  → 表面质量良好，但需关注“重复数据”和“异常值”

**3. 类型与语义（Validity）**

* `id_student`、`id_site` 为标识符
  → 不应直接作为数值特征
* `date` 为相对天数变量
  → 存在负值（min = -25），表示课程开始前行为，语义合理
* `sum_click` 为行为强度指标
  → 是该表最重要的有效特征

**4. 数值异常（Critical Issues）**

* `sum_click` 最大值 = **6977**
  → 存在极端活跃行为，分布高度右偏
* 大部分点击量很低：

  * 25% 分位 = 1
  * 中位数 = 2
    → 行为数据高度稀疏，典型“长尾行为数据”

建议：

* 使用对数变换（log）
* 或按学生进行聚合（如总点击数、周均点击、峰值点击）

**5. 数据质量问题**

* 存在大量重复行：**787,170 条**
  → 需确认：

  * 是真实重复记录？
  * 还是同一学生同一天同一页面的多次记录？
* 不建议直接删除，应基于：
  `(code_module, code_presentation, id_student, id_site, date)` 聚合为 sum_click 总和
  （同一学生可能出现在多个 module / presentation 中，分组键需包含课程标识）

**6. 分布问题**

* module 与 presentation 分布不均
  → 与前几张表一致，说明是数据结构本身特征

**核心结论**

> StudentVLE 不是“是否有问题”，而是**必须重构后才能使用**。
> 正确使用方式应是：
> **先聚合 → 再构造行为特征 → 再进入建模**

典型可构造特征包括：

* 每个学生总点击量
* 平均每日点击量
* 活跃天数
* 前两周点击量
* 是否存在早期流失（连续多天无点击）

In [59]:
# vle - Detailed Data Understanding
# Seven-part profile of the `vle` table: size, dtypes, sample rows,
# missing values, numeric summary, categorical summary and basic quality checks.
print("VLE - Detailed Data Understanding")

# [1] Table size
print("\n[1. Data Scale]")
print(f"  Rows: {len(vle):,}")
print(f"  Columns: {len(vle.columns)}")

# [2] Column dtypes rendered as plain text
print("\n[2. Column Names and Data Types]")
print(vle.dtypes.to_string())

# [3] Sample rows
print("\n[3. First 5 Rows]")
display(vle.head())

# [4] Per-column null counts and their share of all rows
print("\n[4. Missing Value Analysis]")
missing = vle.isna().sum()
missing_pct = missing.div(len(vle)).mul(100).round(2)
if missing.sum() == 0:
    print("  No missing values")
else:
    missing_df = pd.concat(
        [missing.rename('Missing Count'), missing_pct.rename('Percentage %')],
        axis=1,
    )
    # Show only the columns that actually contain nulls
    display(missing_df[missing_df['Missing Count'] > 0])

# [5] describe() over the int64 / float64 columns
print("\n[5. Numeric Column Statistics]")
numeric_cols = vle.select_dtypes(include=['int64', 'float64']).columns
if numeric_cols.empty:
    print("  No numeric columns")
else:
    display(vle[numeric_cols].describe().round(2))

# [6] Unique counts for object columns; value counts only when there are at most 10 values
print("\n[6. Categorical Column Analysis]")
cat_cols = vle.select_dtypes(include=['object']).columns
for col in cat_cols:
    n_unique = vle[col].nunique()
    print(f"  {col}: {n_unique} unique values")
    if n_unique <= 10:
        print(f"    Distribution: {vle[col].value_counts().to_dict()}")

# [7] Fully duplicated rows and deep memory footprint in MB
print("\n[7. Data Quality Check]")
print(f"  Duplicate Rows: {vle.duplicated().sum()}")
print(f"  Memory Usage: {vle.memory_usage(deep=True).sum() / (1024 * 1024):.2f} MB")


VLE - Detailed Data Understanding

[1. Data Scale]
  Rows: 6,364
  Columns: 6

[2. Column Names and Data Types]
id_site                int64
code_module           object
code_presentation     object
activity_type         object
week_from            float64
week_to              float64

[3. First 5 Rows]


Unnamed: 0,id_site,code_module,code_presentation,activity_type,week_from,week_to
0,546943,AAA,2013J,resource,,
1,546712,AAA,2013J,oucontent,,
2,546998,AAA,2013J,resource,,
3,546888,AAA,2013J,url,,
4,547035,AAA,2013J,resource,,



[4. Missing Value Analysis]


Unnamed: 0,Missing Count,Percentage %
week_from,5243,82.39
week_to,5243,82.39



[5. Numeric Column Statistics]


Unnamed: 0,id_site,week_from,week_to
count,6364.0,1121.0,1121.0
mean,726099.09,15.2,15.21
std,128315.14,8.79,8.78
min,526721.0,0.0,0.0
25%,661592.75,8.0,8.0
50%,730096.5,15.0,15.0
75%,814016.25,22.0,22.0
max,1077905.0,29.0,29.0



[6. Categorical Column Analysis]
  code_module: 7 unique values
    Distribution: {'FFF': 1967, 'DDD': 1708, 'BBB': 1154, 'CCC': 419, 'AAA': 413, 'GGG': 367, 'EEE': 336}
  code_presentation: 4 unique values
    Distribution: {'2013J': 1772, '2014B': 1671, '2014J': 1670, '2013B': 1251}
  activity_type: 20 unique values

[7. Data Quality Check]
  Duplicate Rows: 0
  Memory Usage: 1.13 MB


### Data Understanding — VLE 表问题总结

**1. 完整性（Completeness）**

* `week_from` 与 `week_to` 缺失率高达 **82.39%**
  → 属于**结构性缺失**，大多数活动无明确周区间，时间信息利用价值有限

**2. 类型与语义（Validity）**

* `id_site` 为标识符
  → 不应作为数值特征
* `week_from` / `week_to` 表示活动开放周区间
  → 但仅对少数记录有效，需谨慎使用

**3. 数值分布特征**

* 有效周区间范围：0–29 周
  → 符合课程周期长度
* `week_from` 与 `week_to` 统计几乎一致
  → 提示多数活动的起止周相同（开放窗口通常仅为单周），需逐行比较 `week_from == week_to` 确认

**4. 类别变量问题**

* `activity_type` 高基数（20 类）
  → One-hot 后维度较大，可能引入噪声
* module 与 presentation 分布不均
  → 与前几表一致，存在结构性偏斜

**5. 数据质量**

* 无重复值
* 数据规模适中（6,364 行）
* 表本身质量尚可，但**时间字段信息缺失限制了可用性**

**核心结论**

> VLE 表的关键问题是：
> **大部分活动缺乏有效时间区间（week_from / week_to），导致其在行为时序建模中的价值受限。**
> 更适合作为：
>
> * 活动类型特征
> * 资源结构辅助表
> * 与 StudentVLE 进行结构关联（而非独立建模）
