In [2]:
import pandas as pd

# Read the course-completion dataset (raw or already cleaned) from disk.
# The path is relative to the notebook's working directory; adjust if needed.
df = pd.read_csv(
    filepath_or_buffer='../data/online_course_completion.csv',
)

# Show the first five rows as a quick sanity check of the load.
df.head(5)


Unnamed: 0,age,continent,country,education_level,hours_per_week,num_logins_last_month,videos_watched_pct,assignments_submitted,discussion_posts,is_working_professional,preferred_device,completed_course,height_cm,weight_kg,num_siblings,has_pet,favorite_color,birth_month
0,56,Asia,Japan,Bachelors,9.116071,23,0.507679,7,1,1,mobile,0,175.0,54.1,2,0,red,May
1,46,Africa,Kenya,No Formal Education,0.454712,19,0.631424,1,1,1,desktop,0,168.6,57.7,3,0,black,Sep
2,32,South America,Brazil,Bachelors,4.946724,22,0.424817,2,2,0,console,0,176.5,80.2,2,1,green,Jul
3,25,Africa,Nigeria,Bachelors,5.05396,24,0.318874,3,0,1,console,0,185.2,76.3,1,0,purple,Jun
4,38,Asia,India,Bachelors,9.870424,28,1.0,6,0,0,mobile,1,167.7,76.3,2,0,orange,Nov


In [3]:
# Columns removed before modelling (treated as irrelevant or unrelated here).
columns_to_drop = [
    'birth_month',
    'favorite_color',
    'height_cm',
    'weight_kg',
    'num_siblings',
    'has_pet',
    'country',
]

# Drop only the labels that are still present, so re-running this cell after
# the columns have already been removed is a harmless no-op.
df = df.drop(columns=[name for name in columns_to_drop if name in df.columns])

# Preview the remaining columns.
df.head()

Unnamed: 0,age,continent,education_level,hours_per_week,num_logins_last_month,videos_watched_pct,assignments_submitted,discussion_posts,is_working_professional,preferred_device,completed_course
0,56,Asia,Bachelors,9.116071,23,0.507679,7,1,1,mobile,0
1,46,Africa,No Formal Education,0.454712,19,0.631424,1,1,1,desktop,0
2,32,South America,Bachelors,4.946724,22,0.424817,2,2,0,console,0
3,25,Africa,Bachelors,5.05396,24,0.318874,3,0,1,console,0
4,38,Asia,Bachelors,9.870424,28,1.0,6,0,0,mobile,1


In [4]:
# Impute missing values, touching only columns that are still present.
#
# The result of fillna() is assigned back to the column instead of calling
# `df[col].fillna(..., inplace=True)`: the in-place form operates on an
# intermediate Series, which emits a FutureWarning in pandas 2.x and does not
# update `df` once Copy-on-Write is enabled.

# Categorical columns: fill with the most frequent value.
for col in ('preferred_device', 'education_level'):
    if col in df.columns:
        modes = df[col].mode()
        # mode() returns an empty Series when every value is missing; skip the
        # fill in that case rather than failing on the positional lookup.
        if not modes.empty:
            df[col] = df[col].fillna(modes.iloc[0])

# Numeric column: fill with the median.
if 'videos_watched_pct' in df.columns:
    df['videos_watched_pct'] = df['videos_watched_pct'].fillna(
        df['videos_watched_pct'].median()
    )

# Confirm no more missing values
df.isnull().sum()

age                        0
continent                  0
education_level            0
hours_per_week             0
num_logins_last_month      0
videos_watched_pct         0
assignments_submitted      0
discussion_posts           0
is_working_professional    0
preferred_device           0
completed_course           0
dtype: int64

In [7]:
# Categorical columns to one-hot encode, restricted to those still present in
# the frame (so the cell can be re-run after encoding has already happened).
categorical_cols = [
    col
    for col in ('preferred_device', 'education_level', 'continent')
    if col in df.columns
]

# Replace each categorical column with indicator columns; drop_first=True
# omits the first level of every category.
df = pd.get_dummies(data=df, columns=categorical_cols, drop_first=True)

# Preview the encoded frame.
df.head()

Unnamed: 0,age,hours_per_week,num_logins_last_month,videos_watched_pct,assignments_submitted,discussion_posts,is_working_professional,completed_course,preferred_device_desktop,preferred_device_mobile,...,education_level_Diploma,education_level_High School,education_level_Masters,education_level_No Formal Education,education_level_PhD,continent_Antarctica,continent_Asia,continent_Europe,continent_North America,continent_South America
0,56,9.116071,23,0.507679,7,1,1,0,0,1,...,0,0,0,0,0,0,1,0,0,0
1,46,0.454712,19,0.631424,1,1,1,0,1,0,...,0,0,0,1,0,0,0,0,0,0
2,32,4.946724,22,0.424817,2,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,25,5.05396,24,0.318874,3,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,38,9.870424,28,1.0,6,0,0,1,0,1,...,0,0,0,0,0,0,1,0,0,0


In [8]:
#Feature Scaling
from sklearn.preprocessing import StandardScaler

# Split the frame into the target vector (y) and the feature matrix (X).
y = df['completed_course']
X = df.drop(columns='completed_course')

# Select the feature columns stored as int64 or float64; columns with any
# other dtype are left out of scaling.
# NOTE(review): the displayed output shows the 0/1 flag
# `is_working_professional` is standardized as well — confirm this is intended.
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns

# Standardize the selected columns and write them back into X.
# NOTE(review): the scaler is fitted on every row of X; if a train/test split
# follows, consider fitting on the training rows only — confirm against the
# later cells.
scaler = StandardScaler()
scaler.fit(X[numeric_cols])
X[numeric_cols] = scaler.transform(X[numeric_cols])

# Inspect the scaled features.
X.head()

Unnamed: 0,age,hours_per_week,num_logins_last_month,videos_watched_pct,assignments_submitted,discussion_posts,is_working_professional,preferred_device_desktop,preferred_device_mobile,preferred_device_pager,...,education_level_Diploma,education_level_High School,education_level_Masters,education_level_No Formal Education,education_level_PhD,continent_Antarctica,continent_Asia,continent_Europe,continent_North America,continent_South America
0,1.441802,1.672753,0.667011,-0.474764,0.871476,-0.706268,0.816065,0,1,0,...,0,0,0,0,0,0,1,0,0,0
1,0.616912,-1.864788,-0.226442,0.174015,-1.218736,-0.706268,0.816065,1,0,0,...,0,0,0,1,0,0,0,0,0,0
2,-0.537933,-0.030125,0.443648,-0.909203,-0.870367,0.00082,-1.225393,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,-1.115356,0.013673,0.890374,-1.464646,-0.521998,-1.413356,0.816065,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,-0.042999,1.980852,1.783827,2.106415,0.523108,-1.413356,-1.225393,0,1,0,...,0,0,0,0,0,0,1,0,0,0
