<a href="https://colab.research.google.com/github/Lexi-Zhou/stats201-project-zzz/blob/main/Code/W3_2_Data_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Import and load data

In [None]:
import re
import html
import unicodedata
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS

# Display setting: raise pandas' max column width to 160 characters so the
# long free-text comment columns are not cut off as early in DataFrame previews.
pd.set_option("display.max_colwidth", 160)

In [None]:
# Colab-only step: mount Google Drive at /content/drive so that the dataset
# paths used below (which all live under /content/drive/MyDrive) resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
FILE_PATH = "/content/drive/MyDrive/Colab Notebooks/stats201_final_project/Week_3/10_RMP_prof_gender_with_star_rating_revised.csv"

df = pd.read_csv(FILE_PATH)

# Keep an untouched string copy of the comment text for later comparison.
# Missing comments are filled with "" first: astype(str) alone would turn a
# missing value into the literal text "nan", which would then be tokenized as a
# real word and would not be counted by the empty-comment check further down.
df["comments_raw"] = df["comments"].fillna("").astype(str)

df.head()

Unnamed: 0,professor_name,school_name,department_name_clean,stu_tags,student_star,comments,tag_list,comment_gender_signal,prof_gender_label,star_rating,star_rating_category,student_star_category,comments_raw
0,Lynn Ketter,Northwest Florida State College,Education,RESPECTED CARING GIVES GOOD FEEDBACK,5.0,"Wonderful teacher. Basically all work is done in class. The grading is very clear, plus she gives feedback on every assignment, and lets you turn it in agai...","['RESPECTED', 'CARING', 'GIVES GOOD FEEDBACK']",female,female,3.7,good,good,"Wonderful teacher. Basically all work is done in class. The grading is very clear, plus she gives feedback on every assignment, and lets you turn it in agai..."
1,Lynn Ketter,Northwest Florida State College,Education,PARTICIPATION MATTERS,2.0,Discussion boards are mandatory. Use book for tests. Personally found her a little rude via e-mail and D2L. Seems very informative with some aspects of the ...,['PARTICIPATION MATTERS'],female,female,3.7,good,poor,Discussion boards are mandatory. Use book for tests. Personally found her a little rude via e-mail and D2L. Seems very informative with some aspects of the ...
2,Lynn Ketter,Northwest Florida State College,Education,GIVES GOOD FEEDBACK LOTS OF HOMEWORK PARTICIPATION MATTERS,3.5,"Honestly, I didn\'t learn anything in this class but Ketter is amazing at giving great feedback and allowing resubmissions. Textbook is absolutely pointless...","['GIVES GOOD FEEDBACK', 'LOTS OF HOMEWORK', 'PARTICIPATION MATTERS']",unknown,female,3.7,good,good,"Honestly, I didn\'t learn anything in this class but Ketter is amazing at giving great feedback and allowing resubmissions. Textbook is absolutely pointless..."
3,Lynn Ketter,Northwest Florida State College,Education,GIVES GOOD FEEDBACK LOTS OF HOMEWORK SKIP CLASS? YOU WON\'T PASS.,4.0,She comes off a little cold at first but she\'s actually really nice. She has a lot of passion for teaching future educators. No tests in her class but quit...,"['GIVES GOOD FEEDBACK', 'LOTS OF HOMEWORK', ""SKIP CLASS? YOU WON\\'T PASS.""]",female,female,3.7,good,good,She comes off a little cold at first but she\'s actually really nice. She has a lot of passion for teaching future educators. No tests in her class but quit...
4,Lynn Ketter,Northwest Florida State College,Education,,5.0,I thought she was a great teacher and always gave helpful feedback. Just don\'t get on her bad side!,[],female,female,3.7,good,good,I thought she was a great teacher and always gave helpful feedback. Just don\'t get on her bad side!


## 2. Preprocessing

In [None]:
# Contraction / negation expansion
#   "doesn't" -> "does not", "she's" -> "she is"

CONTRACTIONS = {
    # Irregular negations: the stem changes, so the generic "<stem>n't" rule in
    # expand_contractions() would produce a wrong stem ("wo not", "sha not", ...).
    "can't": "can not",
    "cannot": "can not",
    "won't": "will not",
    "shan't": "shall not",
    "ain't": "is not",  # approximation; may also stand for "am not" / "are not"
    # Only matches a free-standing "n't" token (e.g. pre-tokenized "do n't").
    # Inside a word the leading \b of the pattern cannot match, so attached
    # forms such as "doesn't" are handled by the residual rule instead.
    "n't": " not",

    # common forms
    "i'm": "i am",
    "you're": "you are",
    "we're": "we are",
    "they're": "they are",
    "he's": "he is",
    "she's": "she is",
    "it's": "it is",
    "that's": "that is",
    "there's": "there is",
    "here's": "here is",
    "what's": "what is",
    "who's": "who is",
    "where's": "where is",
    "when's": "when is",
    "why's": "why is",
    "how's": "how is",
    "let's": "let us",

    "i've": "i have",
    "you've": "you have",
    "we've": "we have",
    "they've": "they have",

    "i'll": "i will",
    "you'll": "you will",
    "we'll": "we will",
    "they'll": "they will",
    "he'll": "he will",
    "she'll": "she will",
    "it'll": "it will",

    "i'd": "i would",
    "you'd": "you would",
    "we'd": "we would",
    "they'd": "they would",
    "he'd": "he would",
    "she'd": "she would",
    "it'd": "it would",
}

# Whole-word alternation over the dictionary keys, longest key first so that a
# longer contraction always wins over a shorter one starting at the same place.
_contraction_pattern = re.compile(
    r"\b(" + "|".join(map(re.escape, sorted(CONTRACTIONS.keys(), key=len, reverse=True))) + r")\b",
    flags=re.IGNORECASE
)

# Apostrophe look-alikes that are folded to the plain ASCII apostrophe before
# matching (right/left single quotes, modifier apostrophe, acute accent, backtick).
_APOSTROPHE_TABLE = str.maketrans({
    "\u2019": "'",
    "\u2018": "'",
    "\u02bc": "'",
    "\u00b4": "'",
    "`": "'",
})

# Regular negations not listed in CONTRACTIONS: "doesn't" -> "does not".
_RESIDUAL_NT = re.compile(r"(\w+)n't\b")

# Suffixes whose expansion does not depend on the subject ("should've", "who'll").
# "'s" and "'d" are deliberately left out because they are ambiguous.
_GENERIC_SUFFIXES = {"ve": " have", "ll": " will", "re": " are"}
_RESIDUAL_SUFFIX = re.compile(r"(?<=\w)'(ve|ll|re)\b")


def expand_contractions(text: str) -> str:
    """
    Lowercase `text` and expand English contractions.

    Steps, in order:
      1. fold apostrophe look-alikes to "'" and lowercase the text;
      2. replace whole-word contractions listed in CONTRACTIONS;
      3. rewrite any remaining "<stem>n't" as "<stem> not";
      4. rewrite remaining "'ve" / "'ll" / "'re" suffixes.

    Args:
        text: input string (any casing).

    Returns:
        The lowercased string with contractions expanded. Punctuation and
        whitespace are otherwise left untouched.
    """
    lowered = text.translate(_APOSTROPHE_TABLE).lower()

    def _repl(m):
        key = m.group(0).lower()
        return CONTRACTIONS.get(key, key)

    expanded = _contraction_pattern.sub(_repl, lowered)
    expanded = _RESIDUAL_NT.sub(r"\1 not", expanded)
    expanded = _RESIDUAL_SUFFIX.sub(lambda m: _GENERIC_SUFFIXES[m.group(1)], expanded)

    return expanded


### Basic cleaning
  - unescape \" \' \\n etc.
  - normalize unicode   
  - remove html entities
  - normalize whitespace


In [None]:

# Literal backslash escapes that can appear in the scraped text (e.g. don\'t, \n).
_SIMPLE_ESCAPES = {"n": "\n", "r": "\r", "t": "\t", "'": "'", '"': '"', "\\": "\\"}
_ESCAPE_PATTERN = re.compile(r"\\(u[0-9a-fA-F]{4}|x[0-9a-fA-F]{2}|[nrt'\"\\])")


def _decode_escape(match) -> str:
    """Return the character that one matched backslash escape stands for."""
    body = match.group(1)
    if len(body) > 1:  # \uXXXX or \xXX
        code_point = int(body[1:], 16)
        # A lone surrogate cannot be encoded as UTF-8 later on, so drop it.
        if 0xD800 <= code_point <= 0xDFFF:
            return ""
        return chr(code_point)
    return _SIMPLE_ESCAPES[body]


def basic_clean(text: str) -> str:
    """
    Light, lossless-as-possible cleanup of one raw comment.

    Steps, in order: decode HTML entities, decode literal backslash escapes
    (\\' \\" \\\\ \\n \\r \\t \\xXX \\uXXXX), apply NFKC unicode normalization,
    and collapse all whitespace runs to a single space.

    The escapes are decoded with a targeted regex rather than a
    encode("utf-8") / decode("unicode_escape") round-trip: that round-trip
    interprets the UTF-8 bytes as Latin-1 and therefore corrupts every
    non-ASCII character (accented letters, curly apostrophes, ...).

    Args:
        text: raw comment; None and float NaN are treated as missing.

    Returns:
        The cleaned string ("" for missing input). Casing is preserved.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    t = str(text)

    # Decode HTML entities
    t = html.unescape(t)

    # Decode literal escape sequences such as don\'t or \n
    t = _ESCAPE_PATTERN.sub(_decode_escape, t)

    # Normalize unicode
    t = unicodedata.normalize("NFKC", t)

    # Normalize whitespace (\s also covers newlines, carriage returns and tabs)
    t = re.sub(r"\s+", " ", t).strip()

    return t


## 2.2 stopwords

- use basic ENGLISH_STOP_WORDS
- Include gender pronouns (e.g., she, her, he) as a temporary measure. May also remove them in the future if needed.

In [None]:

# Stopword configuration

# Negation words must survive filtering (IMPORTANT): they carry the polarity
# that the negation-marking step relies on.
NEGATION_WORDS = {"no", "not", "never", "nor", "without"}

# Optional: degree words that can matter in evaluations are kept as well.
INTENSIFIERS = {"very", "really", "so", "too", "quite", "extremely"}

# Start from scikit-learn's English list and carve out the protected words.
stopwords = set(ENGLISH_STOP_WORDS).difference(NEGATION_WORDS, INTENSIFIERS)

len(stopwords), sorted(NEGATION_WORDS)


(310, ['never', 'no', 'nor', 'not', 'without'])

## 2.3 Tokenization + negation marking + punctuation removal
   Required processing order:
   - basic cleaning
   - lowercase (done in contraction expansion)
   - expand contractions
   - punctuation removed
   - A) tokenization + lemmatization + ngrams + negation marking

IMPORTANT DETAIL:
 We mark negation scope BEFORE we "delete punctuation" conceptually,
 because punctuation helps define boundaries. We do this by sentence-splitting.

In [None]:

# Split point: any whitespace run that directly follows ".", "!" or "?".
_sentence_split = re.compile(r"(?<=[.!?])\s+")

# A word token is a run of ASCII lowercase letters; digits, punctuation and
# everything else act as separators.
_word_pattern = re.compile(r"[a-z]+")


def sentence_split(text: str):
    """Cut `text` into sentence-like pieces, dropping pieces that are blank."""
    trimmed = (piece.strip() for piece in _sentence_split.split(text))
    return [piece for piece in trimmed if piece]


def tokenize_words(text: str):
    """Return every lowercase-letter run in `text`, in order of appearance.

    Expects text that has already been lowercased and contraction-expanded.
    """
    return [hit.group(0) for hit in _word_pattern.finditer(text)]

def mark_negation(tokens, scope=1, prefix="NEG_", negation_words=None):
    """
    Prefix the tokens that follow a negation word.

    The negation word itself is kept unchanged. Each negation word (re)starts
    the scope, so in "not no fun" only "fun" is marked.
    Example: ["not","good","teacher"] -> ["not","NEG_good","teacher"]

    Args:
        tokens: iterable of token strings for ONE sentence (the scope is not
            reset inside this function).
        scope: how many tokens after a negation word get the prefix;
            0 or less marks nothing.
        prefix: string prepended to each marked token.
        negation_words: container of words that trigger marking. Defaults to
            the module-level NEGATION_WORDS, which keeps existing callers
            unchanged while allowing a custom vocabulary.

    Returns:
        A new list with the same length and order as `tokens`.
    """
    if negation_words is None:
        negation_words = NEGATION_WORDS

    out = []
    neg_left = 0  # number of upcoming tokens that still need the prefix
    for tok in tokens:
        if tok in negation_words:
            out.append(tok)
            neg_left = scope
        elif neg_left > 0:
            out.append(prefix + tok)
            neg_left -= 1
        else:
            out.append(tok)
    return out

def remove_stopwords(tokens):
    """Return the tokens, in order, that are not in the module-level `stopwords` set.

    The comparison is on the exact token string.
    """
    kept = []
    for token in tokens:
        if token in stopwords:
            continue
        kept.append(token)
    return kept


## 2.4 Lemmatization

In [None]:

def build_lemmatizer():
    """
    Build a token-level lemmatizer.

    Tries to set up spaCy's rule-based English lemmatizer. If anything in that
    setup fails (spaCy missing, initialization error, ...), falls back to a
    function that returns the token unchanged and emits a RuntimeWarning, so
    the fallback is visible instead of silent.

    Returns:
        (lemmatize, mode) where `lemmatize(tok: str) -> str` maps one token to
        its lemma and `mode` is "spacy_rule" or "identity_fallback".
    """
    import warnings
    from functools import lru_cache

    try:
        import spacy  # imported only to fail early when spaCy is not installed
        from spacy.lang.en import English

        nlp = English()
        nlp.add_pipe("lemmatizer", config={"mode": "rule"})
        # NOTE(review): initialization presumably needs lemmatizer lookup tables
        # that a bare install may not ship; the recorded run of this notebook
        # ended up in the fallback branch - confirm the environment.
        nlp.initialize()
    except Exception as exc:
        warnings.warn(
            "spaCy lemmatizer unavailable (%s: %s); tokens will NOT be lemmatized."
            % (type(exc).__name__, exc),
            RuntimeWarning,
        )

        def _identity(tok: str) -> str:
            return tok

        return _identity, "identity_fallback"

    # The same tokens recur throughout the corpus, so each distinct token is
    # pushed through the spaCy pipeline only once.
    @lru_cache(maxsize=None)
    def _lemma(tok: str) -> str:
        doc = nlp(tok)
        if len(doc) == 0:
            return tok
        lemma = doc[0].lemma_
        # Empty lemma or the "-PRON-" placeholder: keep the surface form.
        if not lemma or lemma == "-PRON-":
            return tok
        return lemma

    return _lemma, "spacy_rule"

lemmatize_token, LEMMA_MODE = build_lemmatizer()
LEMMA_MODE


'identity_fallback'

## 2.5 Full preprocessing

In [None]:

# Outputs:
#   - comments_basic: basic_clean result
#   - comments_expanded: lowercase + expanded contractions
#   - comments_tokens: token list after stopword removal + negation marking + lemmatization
#   - comments_final: final string for vectorizers/models (punctuation already removed)


def preprocess_comment(text: str, neg_scope=1):
    """
    Run the full preprocessing pipeline on one comment.

    Args:
        text: raw comment text.
        neg_scope: number of tokens after a negation word that get the
            "NEG_" prefix (passed to mark_negation as `scope`).

    Returns:
        dict with keys "comments_basic" (str), "comments_expanded" (str),
        "comments_tokens" (list of str) and "comments_final" (str, the tokens
        joined by single spaces).
    """
    # 1) basic clean
    basic = basic_clean(text)

    # 2) lower + expand contractions (includes negation expansion)
    # Both intermediate results are computed once and reused in the return value.
    expanded = expand_contractions(basic)

    # 3) sentence split (use punctuation boundaries)
    sents = sentence_split(expanded) if expanded else []

    all_tokens = []
    for s in sents:
        # tokenize words only (punctuation removed by regex extraction)
        toks = tokenize_words(s)

        # 4) stopword removal BEFORE negation marking (negations are kept by the
        # stopword set). Otherwise the mark lands on a stopword ("not a good"
        # -> "NEG_a"), the prefixed token escapes the stopword filter, and the
        # content word that was actually negated stays unmarked.
        toks = remove_stopwords(toks)

        # negation marking within the sentence (boundary resets each sentence)
        toks = mark_negation(toks, scope=neg_scope, prefix="NEG_")

        all_tokens.extend(toks)

    # 5) lemmatization (lemmatize the word behind a NEG_ prefix, then re-attach it)
    lemmatized = []
    for tok in all_tokens:
        if tok.startswith("NEG_"):
            lemmatized.append("NEG_" + lemmatize_token(tok[4:]))
        else:
            lemmatized.append(lemmatize_token(tok))

    # final string (already lowercase; punctuation gone; space-separated tokens)
    final = " ".join(tok for tok in lemmatized if tok)

    return {
        "comments_basic": basic,
        "comments_expanded": expanded,
        "comments_tokens": lemmatized,
        "comments_final": final,
    }


## 2.6 Apply preprocessing to dataset

In [None]:

# One result dict per row, produced by the full pipeline.
processed = df["comments_raw"].apply(preprocess_comment)

# Fan the dicts out into one DataFrame column per output key.
for _col in ("comments_basic", "comments_expanded", "comments_tokens", "comments_final"):
    df[_col] = processed.apply(lambda rec, key=_col: rec[key])

# Side-by-side preview of raw text and final model input.
df[["comments_raw", "comments_final"]].head(8)


Unnamed: 0,comments_raw,comments_final
0,"Wonderful teacher. Basically all work is done in class. The grading is very clear, plus she gives feedback on every assignment, and lets you turn it in agai...",wonderful teacher basically work class grading very clear plus gives feedback assignment lets turn points
1,Discussion boards are mandatory. Use book for tests. Personally found her a little rude via e-mail and D2L. Seems very informative with some aspects of the ...,discussion boards mandatory use book tests personally little rude e mail d l very informative aspects class changes things d l content so check stuff day
2,"Honestly, I didn\'t learn anything in this class but Ketter is amazing at giving great feedback and allowing resubmissions. Textbook is absolutely pointless...",honestly did not NEG_learn class ketter amazing giving great feedback allowing resubmissions textbook absolutely pointless
3,She comes off a little cold at first but she\'s actually really nice. She has a lot of passion for teaching future educators. No tests in her class but quit...,comes little cold actually really nice lot passion teaching future educators no NEG_tests class quite papers project end semester really simple journal fiel...
4,I thought she was a great teacher and always gave helpful feedback. Just don\'t get on her bad side!,thought great teacher gave helpful feedback just not NEG_get bad
5,Worst teacher ever,worst teacher
6,Dr. Ketter is an AWESOME teacher. I have had her for several classes both lower division and now upper division. I believe she is one of my easiest classes....,dr ketter awesome teacher classes lower division upper division believe easiest classes yes asks lot quite assignments lays tells exactly expected class thi...
7,"In the classroom, she treats you like you are 3 grade students. This is kind of helpful for teaching techniques, but when she grades its your work she is ex...",classroom treats like grade students kind helpful teaching techniques grades work extremely tough does lot feedback lower division easy upper division insan...


# 3.1 check

In [None]:

# How many comments are blank before and after preprocessing?
raw_is_blank = df["comments_raw"].str.strip().eq("")
final_is_blank = df["comments_final"].str.strip().eq("")

print("Total rows:", len(df))
print("Empty raw comments:", raw_is_blank.sum())
print("Empty final comments:", final_is_blank.sum())

# Distribution of the number of tokens kept per comment.
df["n_tokens_final"] = df["comments_tokens"].map(len)
df["n_tokens_final"].describe()


Total rows: 19685
Empty raw comments: 0
Empty final comments: 12


Unnamed: 0,n_tokens_final
count,19685.0
mean,19.635712
std,9.6211
min,0.0
25%,12.0
50%,21.0
75%,28.0
max,65.0


In [None]:

# Document-term matrix over unigrams, bigrams and trigrams of the final text.
#   min_df=2    -> an n-gram must occur in at least 2 comments
#   max_df=0.95 -> n-grams occurring in more than 95% of comments are dropped
# NOTE(review): CountVectorizer lowercases by default, so "NEG_" tokens
# presumably show up as "neg_..." in the vocabulary - confirm if casing matters.
vectorizer = CountVectorizer(ngram_range=(1, 3), min_df=2, max_df=0.95)

X = vectorizer.fit_transform(df["comments_final"])
vocab_size = len(vectorizer.get_feature_names_out())

print(f"DTM shape: {X.shape}")
print(f"Vocab size: {vocab_size}")


DTM shape: (19685, 65476)
Vocab size: 65476


In [None]:

# Debugging aid: the 30 n-grams with the highest total count in the corpus.

import numpy as np

terms = vectorizer.get_feature_names_out()

# Column sums of the document-term matrix, flattened to one count per n-gram.
counts = np.ravel(np.asarray(X.sum(axis=0)))

# Indices of the n-grams ordered by descending count, truncated to the top 30.
top_idx = np.argsort(counts)[::-1][:30]

top_ngrams = pd.DataFrame(dict(ngram=terms[top_idx], count=counts[top_idx]))

top_ngrams


Unnamed: 0,ngram,count
0,class,16196
1,not,12726
2,very,8328
3,professor,5048
4,teacher,4637
5,easy,4012
6,great,3799
7,so,3704
8,really,3529
9,good,3158


In [None]:

# Save processed dataset


OUTPUT_DIR = "/content/drive/MyDrive/Colab Notebooks/stats201_final_project/Week_3/"
OUT_CSV = f"{OUTPUT_DIR}11_RMP_prof_gender_preprocessed.csv"

# Write every column (original + derived) without the pandas index.
# NOTE(review): the list-valued comments_tokens column is presumably stored as
# its string representation in the CSV - parse it back when reloading.
df.to_csv(OUT_CSV, index=False)

print(f"Saved to: {OUT_CSV}")

Saved to: /content/drive/MyDrive/Colab Notebooks/stats201_final_project/Week_3/11_RMP_prof_gender_preprocessed.csv
