### 1. Import Libraries

In [41]:
import numpy as np
import pandas as pd
import torch
import tqdm
import transformers
import string
import re
import nltk
import contractions
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     /data/home/huixian/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /data/home/huixian/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /data/home/huixian/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

### 2. Read CSV

In [95]:
data = pd.read_csv('/data/home/huixian/Documents/Sentiment_Analysis_of_RateMyProfessor_Reviews/data/new_data/original.csv')
data = data.rename(columns={'Professor ID':'professor_id', 'Comment': 'comment', 'Star Rating': 'star_rating', 'Course Difficulty': 'course_difficulty'})
print(data.columns)


Index(['professor_id', 'comment', 'star_rating', 'course_difficulty',
       'gives_good_feedback', 'caring', 'respected', 'participation_matters',
       'clear_grading_criteria', 'skip_class', 'amazing_lectures',
       'inspirational', 'tough_grader', 'hilarious', 'get_ready_to_read',
       'lots_of_homework', 'accessible_outside_class', 'lecture_heavy',
       'extra_credit', 'graded_by_few_things', 'group_projects',
       'would_take_again', 'skip_class_you_wont_pass', 'test_heavy',
       'so_many_papers', 'beware_of_pop_quizzes', 'tests_are_tough',
       'IsCourseOnline', 'noTag'],
      dtype='object')


### 3. Split by `noTags`

In [96]:
rows_with_tags = data[data['noTag'] == 0]
print(f'Rows with tags: {rows_with_tags.shape[0]}\n')
rows_with_notags = data[data['noTag'] == 1]
print(f'Rows with no tags: {rows_with_notags.shape[0]}\n')

Rows with tags: 34440

Rows with no tags: 89234



In [97]:
rows_with_tags = rows_with_tags.drop(columns=['professor_id', 'skip_class', 'IsCourseOnline', 'noTag'])
print(f'Rows with tags: {rows_with_tags.shape[0]}\n')
rows_with_notags = rows_with_notags.drop(columns=['professor_id', 'skip_class', 'IsCourseOnline', 'noTag'])
print(f'Rows with no tags: {rows_with_notags.shape[0]}\n')

Rows with tags: 34440

Rows with no tags: 89234



In [98]:
rows_with_tags = rows_with_tags.drop(rows_with_tags[rows_with_tags.drop(columns=['comment', 'star_rating', 'course_difficulty']).sum(axis=1) == 0].index) # dropping the 14 rows with no tags
print(f'Rows with tags: {rows_with_tags.shape[0]}\n')

Rows with tags: 34426



### 4. Preprocess Comments

In [99]:
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

wanted_stopword = ['not', 'no']
for ws in wanted_stopword:
    stop_words.discard(ws)

# Define a function to preprocess comments
def preprocess_text(text):
    text = contractions.fix(str(text))

    # 1. Convert to lowercase
    text = str(text).lower()

    # 2. Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # 3. Tokenize words
    words = word_tokenize(text)

    # 4. Remove stopwords
    # words = [word for word in words if word not in stop_words]

    # 5. Perform stemming (optional)
    words = [stemmer.stem(word) for word in words]

    # 6. Join words back into a single string
    processed_text = ' '.join(words)

    return processed_text



In [100]:
print(rows_with_tags.columns)

Index(['comment', 'star_rating', 'course_difficulty', 'gives_good_feedback',
       'caring', 'respected', 'participation_matters',
       'clear_grading_criteria', 'amazing_lectures', 'inspirational',
       'tough_grader', 'hilarious', 'get_ready_to_read', 'lots_of_homework',
       'accessible_outside_class', 'lecture_heavy', 'extra_credit',
       'graded_by_few_things', 'group_projects', 'would_take_again',
       'skip_class_you_wont_pass', 'test_heavy', 'so_many_papers',
       'beware_of_pop_quizzes', 'tests_are_tough'],
      dtype='object')


#### rows_with_tags

In [101]:
rows_with_tags = rows_with_tags[rows_with_tags['comment'] != 'No Comments']
rows_with_tags = rows_with_tags[rows_with_tags['comment'] != '']
rows_with_tags['comment'] = rows_with_tags['comment'].apply(preprocess_text)
rows_with_tags = rows_with_tags[rows_with_tags['comment'].apply(len) >= 4]
print(f'Rows after dropping empty comments: {rows_with_tags.shape[0]}\n')

Rows after dropping empty comments: 34360



In [102]:
rows_with_tags = rows_with_tags[rows_with_tags['comment'].str.strip() != ""] # dropping ''
rows_with_tags = rows_with_tags.dropna(subset=['comment']) # dropping Nones
print(f'Rows after dropping empty comments: {rows_with_tags.shape[0]}\n')

Rows after dropping empty comments: 34360



#### rows_with_notags

In [103]:
rows_with_notags = rows_with_notags[rows_with_notags['comment'] != 'No Comments']
rows_with_notags = rows_with_notags[rows_with_notags['comment'] != '']
rows_with_notags['comment'] = rows_with_notags['comment'].apply(preprocess_text)
rows_with_notags = rows_with_notags[rows_with_notags['comment'].apply(len) >= 4]
print(f'Rows after dropping empty comments: {rows_with_notags.shape[0]}\n')

Rows after dropping empty comments: 84532



In [104]:
rows_with_notags = rows_with_notags[rows_with_notags['comment'].str.strip() != ""] # dropping ''
rows_with_notags = rows_with_notags.dropna(subset=['comment']) # dropping Nones
print(f'Rows after dropping empty comments: {rows_with_notags.shape[0]}\n')

Rows after dropping empty comments: 84532



#### Mixed_sets

In [110]:
sampled_rows = rows_with_notags.sample(n = int(0.02 * rows_with_tags.shape[0]))
complete_set = pd.concat([rows_with_tags, sampled_rows], ignore_index=True)
print(f'Rows of mixed_set: {complete_set.shape[0]}\n')

Rows of mixed_set: 35047



In [111]:
print(f'Rows of mixed_set: {sampled_rows.shape[0]}\n')

Rows of mixed_set: 687



### 5. Export

In [112]:
# preprocessed with tags
rows_with_tags.to_csv('rows_with_tags.csv', index=False)    
# preprocessed with notags
rows_with_notags.to_csv('rows_with_notags.csv', index=False)
# preprocessed with tags + 2% from notags
complete_set.to_csv('complete_set.csv', index=False)


### 6. Andy's


In [114]:
andy = rows_with_notags.merge(sampled_rows, how='outer', indicator=True).query('_merge == "left_only"').drop(columns=['_merge'])
print(f'Rows of andy\'s: {andy.shape[0]}\n')
andy.to_csv('andy.csv', index=False)

Rows of andy's: 83770

