In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Feature Engineering

In [2]:
df = pd.read_csv("dataset/overall.csv")
# df = pd.read_excel("dataset/training_data.xlsx")
del df["Unnamed: 0"]

df.dropna(inplace = True)

df['year'] = df['term'].apply(lambda x: x[0:7])
df['term'] = df['term'].apply(lambda x: x[8:])
df["before_process_vacancy"].replace(0, np.nan, inplace = True)

sections = df.groupby(['course_code', 'year', 'term'])[['session']].count().reset_index()
sections = sections.rename({'session' : 'num_sections'}, axis = 1)

df = df.merge(sections, how = 'left', on = ['course_code', 'year', 'term'])

df.loc[df['before_process_vacancy'].isnull(), 'before_process_vacancy'] = 0
df.loc[df['before_process_vacancy'] < 0, 'before_process_vacancy'] = 0
df.loc[df['after_process_vacancy'] < 0, 'after_process_vacancy'] = 0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 242894 entries, 0 to 242893
Data columns (total 18 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   term                    242894 non-null  object 
 1   session                 242894 non-null  object 
 2   bidding_window          242894 non-null  object 
 3   course_code             242894 non-null  object 
 4   description             242894 non-null  object 
 5   section                 242894 non-null  object 
 6   vacancy                 242894 non-null  float64
 7   opening_vacancy         242894 non-null  float64
 8   before_process_vacancy  150638 non-null  float64
 9   dice                    242894 non-null  float64
 10  after_process_vacancy   242894 non-null  float64
 11  enrolled_students       242894 non-null  float64
 12  median_bid              242894 non-null  float64
 13  min_bid                 242894 non-null  float64
 14  instructor          

### Save the training dataset

In [35]:
### Save the training dataset
df.to_excel("dataset/training_data.xlsx", engine = 'openpyxl')

#### Save a subset of the training data: Incoming Freshman Round 1 Window 1

In [12]:
### Set the course codes for courses you are interested in
course_codes = ['COR-STAT1203', 'ECON104', 'COR-IS1702', 'IS211', 'COR-COMM1304', 'COR-IS1702', 'COR-2217',
                     'COR-2221','COR-2222', 'COR2100', 'COR1100', 'CS103', 'IS211']
## Filter according to round
# subset_df = df[(df['bidding_window'] == "Incoming Freshmen Rnd 1 Win 1") | 
#                (df['bidding_window'] == "Incoming Freshmen Rnd 1 Win ") ]
subset_course_df = df[df['course_code'].isin(course_codes)]
subset_course_df.to_excel("dataset/subsets/fm_r1w1_training_data.xlsx", engine = 'openpyxl')
subset_course_df.head()

Unnamed: 0,term,session,bidding_window,course_code,description,section,vacancy,opening_vacancy,before_process_vacancy,dice,after_process_vacancy,enrolled_students,median_bid,min_bid,instructor,school,year,num_sections
106066,Term 1,Regular Academic Session,Round 2A Window 3,IS211,Interaction Design and Prototyping,G1,45.0,45.0,1.0,0.0,1.0,44.0,0.0,0.0,BENJAMIN GAN KOK SIEW,SIS,2018-19,171
106067,Term 1,Regular Academic Session,Round 2A Window 3,IS211,Interaction Design and Prototyping,G2,45.0,45.0,2.0,0.0,2.0,43.0,0.0,0.0,BENJAMIN GAN KOK SIEW,SIS,2018-19,171
106068,Term 1,Regular Academic Session,Round 2A Window 3,IS211,Interaction Design and Prototyping,G3,45.0,45.0,0.0,0.0,0.0,45.0,0.0,0.0,KOTARO HARA,SIS,2018-19,171
106069,Term 1,Regular Academic Session,Round 2A Window 3,IS211,Interaction Design and Prototyping,G4,45.0,45.0,1.0,0.0,1.0,44.0,0.0,0.0,KOTARO HARA,SIS,2018-19,171
106070,Term 1,Regular Academic Session,Round 2A Window 3,IS211,Interaction Design and Prototyping,G5,45.0,45.0,1.0,0.0,1.0,44.0,0.0,0.0,KOTARO HARA,SIS,2018-19,171
