<a href="https://colab.research.google.com/github/Lexi-Zhou/stats201-project-zzz/blob/main/Code/W3_1_Revise_rating_groups.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Load data

In [None]:
import os
import re

import numpy as np
import pandas as pd

# Widen pandas' display limits for this notebook: show up to 200 columns
# and allow 140 characters per output line.
pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 140)

In [None]:
# Mount Google Drive at /content/drive; the CSV paths used below live under
# that mount point. Relies on the google.colab package.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Locations of the two Week_2 CSV inputs on the mounted Drive.
main_path = "/content/drive/MyDrive/Colab Notebooks/stats201_final_project/Week_2/09_RMP_prof_gender_manual_updated.csv"
merge_path = "/content/drive/MyDrive/Colab Notebooks/stats201_final_project/Week_2/RMP_merged.csv"

# Read both files in one pass, keeping the original variable names.
main, merged = (pd.read_csv(csv_path) for csv_path in (main_path, merge_path))

for label, frame in (("Main", main), ("Merged", merged)):
    print(f"{label} shape:", frame.shape)

# Preview the first two rows of the main table.
main.head(2)

Main shape: (19685, 10)
Merged shape: (19685, 23)


Unnamed: 0,professor_name,school_name,department_name_clean,stu_tags,student_star,comments,rating_group,tag_list,comment_gender_signal,prof_gender_label
0,Lynn Ketter,Northwest Florida State College,Education,RESPECTED CARING GIVES GOOD FEEDBACK,5.0,Wonderful teacher. Basically all work is done ...,positive,"['RESPECTED', 'CARING', 'GIVES GOOD FEEDBACK']",female,female
1,Lynn Ketter,Northwest Florida State College,Education,PARTICIPATION MATTERS,2.0,Discussion boards are mandatory. Use book for ...,negative,['PARTICIPATION MATTERS'],female,female


## 2. Preprocessing

In [None]:
# Column checks: stop here with a clear message if either frame lacks a
# column listed as required, instead of computing the gaps and ignoring them.
required_main_cols = ["comments", "student_star", "rating_group"]
required_merge_cols = ["comments", "star_rating"]

missing_main = [c for c in required_main_cols if c not in main.columns]
missing_merge = [c for c in required_merge_cols if c not in merged.columns]

if missing_main or missing_merge:
    raise KeyError(
        f"Missing required columns - main: {missing_main}, merged: {missing_merge}"
    )

# Report how many rows have no comment text in each frame.
print("Main comments missing:", main["comments"].isna().sum())
print("Merged comments missing:", merged["comments"].isna().sum())


Main comments missing: 101
Merged comments missing: 101


In [None]:
# normalize comments
def normalize_comment(s: pd.Series) -> pd.Series:
    """Build a whitespace-normalized text key from a column of comments.

    Each run of whitespace is collapsed to a single space and leading and
    trailing whitespace is removed.

    Missing entries stay missing. Casting them to ``str`` alone would turn
    them into the literal text "nan" / "None", which makes every row without
    a comment share one key.

    Args:
        s: Series holding the raw comment values.

    Returns:
        A Series with the same index as ``s`` containing the normalized text,
        with missing values preserved at the positions where ``s`` is missing.
    """
    cleaned = (
        s.astype(str)
         .str.replace(r"\s+", " ", regex=True)
         .str.strip()
    )
    # Restore missing values where the input had none to normalize.
    return cleaned.where(s.notna())

# Attach the normalized comment text to both frames as the join key.
for frame in (main, merged):
    frame["_comment_key"] = normalize_comment(frame["comments"])

# Only the key and the rating are needed from the merged table.
merged_small = merged.loc[:, ["_comment_key", "star_rating"]].copy()

dup_n = merged_small["_comment_key"].duplicated().sum()
print("Duplicates in merged by comment key:", dup_n)

# Collapse repeated keys to one row each by averaging their star_rating.
rating_groups = merged_small.groupby("_comment_key", as_index=False)["star_rating"]
merged_small = rating_groups.mean()
print("Merged_small shape after dedup:", merged_small.shape)


Duplicates in merged by comment key: 601
Merged_small shape after dedup: (19084, 2)


## 3. Merge star_rating into main

In [None]:

# Left-join the per-key star_rating onto main. validate="many_to_one" makes
# pandas raise if a key occurs more than once in merged_small, which would
# otherwise silently duplicate rows of main.
df = main.merge(
    merged_small,
    on="_comment_key",
    how="left",
    validate="many_to_one",
)

print("After merge shape:", df.shape)
print("star_rating missing after merge:", df["star_rating"].isna().sum())

# Show a few of the rows that did not receive a star_rating, if any.
df.loc[df["star_rating"].isna(), ["comments"]].head(5)


After merge shape: (19685, 12)
star_rating missing after merge: 0


Unnamed: 0,comments


## 4. Recode rating groups

In [None]:

# Remove the previous rating_group labels and the temporary merge key.
# errors="ignore" skips columns that are already gone, so re-running this
# cell does not raise a KeyError.
df = df.drop(columns=["rating_group", "_comment_key"], errors="ignore")

df.head(2)


Unnamed: 0,professor_name,school_name,department_name_clean,stu_tags,student_star,comments,tag_list,comment_gender_signal,prof_gender_label,star_rating
0,Lynn Ketter,Northwest Florida State College,Education,RESPECTED CARING GIVES GOOD FEEDBACK,5.0,Wonderful teacher. Basically all work is done ...,"['RESPECTED', 'CARING', 'GIVES GOOD FEEDBACK']",female,female,3.7
1,Lynn Ketter,Northwest Florida State College,Education,PARTICIPATION MATTERS,2.0,Discussion boards are mandatory. Use book for ...,['PARTICIPATION MATTERS'],female,female,3.7


In [None]:

# 3.5 <= x <= 5.0 -> good
# 2.5 <= x <  3.5 -> average
# 1.0 <= x <  2.5 -> poor

def to_rating_category(x):
    """Map a star value to "good", "average" or "poor".

    Bands (after converting ``x`` to float):
        3.5 <= x <= 5.0 -> "good"
        2.5 <= x <  3.5 -> "average"
        1.0 <= x <  2.5 -> "poor"

    Returns ``np.nan`` when ``x`` is missing, cannot be converted to a float,
    or lies outside the 1.0-5.0 range.
    """
    if pd.isna(x):
        return np.nan

    try:
        score = float(x)
    except Exception:
        return np.nan

    # Written as a negated range test so that a NaN score (every comparison
    # False) is rejected here rather than falling through to a label.
    if not (1.0 <= score <= 5.0):
        return np.nan

    if score >= 3.5:
        return "good"
    if score >= 2.5:
        return "average"
    return "poor"

# Derive one category column per star column (target column -> source column).
category_sources = {
    "star_rating_category": "star_rating",
    "student_star_category": "student_star",
}
for category_col, source_col in category_sources.items():
    df[category_col] = df[source_col].apply(to_rating_category)

# Tabulate each new column, counting missing categories as well.
star_counts = df["star_rating_category"].value_counts(dropna=False)
student_counts = df["student_star_category"].value_counts(dropna=False)

print("star_rating_category:\n", star_counts)
print("\nstudent_star_category:\n", student_counts)


star_rating_category:
 star_rating_category
good       13552
average     4010
poor        2123
Name: count, dtype: int64

student_star_category:
 student_star_category
good       13615
poor        4114
average     1943
NaN           13
Name: count, dtype: int64


In [None]:

# Probe to_rating_category at the band edges, on a missing value and on
# values outside the 1.0-5.0 range, and show input next to result.
test_vals = pd.Series([1.0, 2.4, 2.5, 3.4, 3.5, 5.0, np.nan, 0.0, 5.2])
test_categories = test_vals.apply(to_rating_category)

pd.concat(
    [test_vals.rename("value"), test_categories.rename("category")],
    axis=1,
)


Unnamed: 0,value,category
0,1.0,poor
1,2.4,poor
2,2.5,average
3,3.4,average
4,3.5,good
5,5.0,good
6,,
7,0.0,
8,5.2,


In [None]:
# save output

OUTPUT_DIR = "/content/drive/MyDrive/Colab Notebooks/stats201_final_project/Week_3/"
OUT_CSV = os.path.join(OUTPUT_DIR, "10_RMP_prof_gender_with_star_rating_revised.csv")

# to_csv does not create missing folders, so make sure the target directory
# exists before writing; exist_ok=True makes this a no-op when it already does.
os.makedirs(OUTPUT_DIR, exist_ok=True)

df.to_csv(OUT_CSV, index=False)

print("Saved to:", OUT_CSV)


Saved to: /content/drive/MyDrive/Colab Notebooks/stats201_final_project/Week_3/10_RMP_prof_gender_with_star_rating_revised.csv
