In [None]:
import pandas as pd

# Step 1: Load the data
file_path = '/Users/seyed/Git/Hub/Learning-Style-Questionnaire/Raw_Data.csv'
df = pd.read_csv(file_path)

# Step 2: Show basic info
print("Shape of dataset:", df.shape)
print("\nColumns:", df.columns.tolist())

# Step 3: Check for missing or empty values
missing_values = df.isnull().sum()
print("\nMissing values per column:\n", missing_values)


Shape of dataset: (58, 21)

Columns: ['ID', 'Start time', 'Completion time', 'Email', 'Name', 'Last modified time', 'Age?', 'Gender?', 'Year of study?', 'From the scale of 1 to 10 how familiar are you with the concept of learning styles?', 'Which learning style best describes you?', 'How many hours do you use visual learning aids (e.g. diagram, charts ,videos)?', 'How many hours do you spend on audio-based learning ( Lectures, podcasts, discussions)?', 'How many hours do you spend on hands-on or practical learning ( experiments, \xa0case studies)?', 'How many hours do you study using written materials (Books, articles, notes)?', 'From the scale of 1 to 10 how much LSBU teaching methods support your learning style?', 'Which LSBU learning method do you find most effective?', "How often do you use LSBU's digital learning resources (Moodle, recorded lectures, online library)?", "How effective do you find LSBU's current teaching methods?", 'What is your average assessment results(%)?', 'Fro

In [3]:
# Step 4: Drop unnecessary columns
columns_to_drop = ['Email', 'Name', 'Last modified time']
df_cleaned = df.drop(columns=columns_to_drop)

# Save the cleaned version 
df_cleaned.to_csv('/Users/seyed/Git/Hub/Learning-Style-Questionnaire/Cleaned_Data.csv', index=False)

In [5]:
# Step 5: Check for exact duplicate rows
duplicate_rows = df_cleaned[df_cleaned.duplicated()]
print("Duplicate rows:")
print(duplicate_rows)


Duplicate rows:
Empty DataFrame
Columns: [ID, Start time, Completion time, Age?, Gender?, Year of study?, From the scale of 1 to 10 how familiar are you with the concept of learning styles?, Which learning style best describes you?, How many hours do you use visual learning aids (e.g. diagram, charts ,videos)?, How many hours do you spend on audio-based learning ( Lectures, podcasts, discussions)?, How many hours do you spend on hands-on or practical learning ( experiments,  case studies)?, How many hours do you study using written materials (Books, articles, notes)?, From the scale of 1 to 10 how much LSBU teaching methods support your learning style?, Which LSBU learning method do you find most effective?, How often do you use LSBU's digital learning resources (Moodle, recorded lectures, online library)?, How effective do you find LSBU's current teaching methods?, What is your average assessment results(%)?, From the scale of 1 to 10 how much do you believe  that LSBU have necessary 

In [18]:
import pandas as pd


# 1. Rename columns to shorter names
rename_map = {
    'ID': 'ID',
    'Start time': 'Start',
    'Completion time': 'End',
    'Age?': 'Age',
    'Gender?': 'Gender',
    'Year of study?': 'Year',
    'From the scale of 1 to 10 how familiar are you with the concept of learning styles?': 'Familiarity',
    'Which learning style best describes you?': 'PrefStyle',
    'How many hours do you use visual learning aids (e.g. diagram, charts ,videos)?': 'VisualHrs',
    'How many hours do you spend on audio-based learning ( Lectures, podcasts, discussions)?': 'AudioHrs',
    'How many hours do you spend on hands-on or practical learning ( experiments,  case studies)?': 'PracticalHrs',
    'How many hours do you study using written materials (Books, articles, notes)?': 'WrittenHrs',
    'From the scale of 1 to 10 how much LSBU teaching methods support your learning style?': 'SupportRating',
    'Which LSBU learning method do you find most effective?': 'BestMethod',
    "How often do you use LSBU's digital learning resources (Moodle, recorded lectures, online library)?": 'DigitalFreq',
    "How effective do you find LSBU's current teaching methods?": 'MethodEffectiveness',
    'What is your average assessment results(%)?': 'AssessmentCat',
    'From the scale of 1 to 10 how much do you believe  that LSBU have necessary resources matching your learning style? ': 'ResourceRating'
}
df_prep = df_cleaned.rename(columns=rename_map)

# 2. Convert Start/End to elapsed minutes
df_prep['Start'] = pd.to_datetime(df_prep['Start'])
df_prep['End']   = pd.to_datetime(df_prep['End'])
df_prep['ElapsedMins'] = (df_prep['End'] - df_prep['Start']).dt.total_seconds() / 60

# 3. Convert Year of study to numeric
year_map = {
    'First year':  1,
    'Second year': 2,
    'Third year':  3
    }
df_prep['YearNum'] = df_prep['Year'].map(year_map)


# 4. Cast numeric columns
num_cols = ['Age', 'YearNum', 'Familiarity', 'VisualHrs', 'AudioHrs', 'PracticalHrs',
             'WrittenHrs', 'SupportRating', 'DigitalFreq', 'MethodEffectiveness', 'ResourceRating']
df_prep[num_cols] = df_prep[num_cols].apply(pd.to_numeric, errors='coerce')

# 5. Mapping style from long text to short category
pref_map = {
    'Reading/ Writing ( Text-based learning)': 'Reading',
    'Mix of styles':                       'Mixed',
    'Visual (Images, charts)':             'Visual',
    'Auditory ( Listening, discussions)':  'Auditory',
    'Kinesthetic ( Hands-on activities)':  'Kinesthetic'
}
df_prep['PrefStyleShort'] = df_prep['PrefStyle'].map(pref_map)

# 6. Recode AssessmentCat into ordered categories or numeric mid-points
#    For example, map to midpoint values:
mapping = {
    'less than 40': 20,
    '41 to 60':     50,
    '60 to 70':     65,
    'Above 70':     85
}
df_prep['AssessmentScore'] = df_prep['AssessmentCat'].map(mapping)

# 7. Create new features
df_prep['TotalStudyHrs'] = df_prep[['VisualHrs','AudioHrs','PracticalHrs','WrittenHrs']].sum(axis=1)

# Check modal alignment—for each row the max‐hours mode vs. PrefStyle
def dominant_mode(row):
    modes = {
        'Visual': row['VisualHrs'],
        'Auditory': row['AudioHrs'],
        'Practical': row['PracticalHrs'],
        'Written': row['WrittenHrs']
    }
    return max(modes, key=modes.get)

df_prep['DominantMode'] = df_prep.apply(dominant_mode, axis=1)
df_prep['ModeMatchesPreference'] = (df_prep['DominantMode'] == df_prep['PrefStyleShort'])

# 8. Save to new CSV (this will become your “Sheet 2” Pre-processed Data)
df_prep.to_csv('/Users/seyed/Git/Hub/Learning-Style-Questionnaire/Preprocessed_Data.csv', index=False)

# 9. Drop redundant columns
# List of columns to remove now that they're redundant
to_drop = [
    'PrefStyle',
    'AssessmentCat',
    'Start',
    'End',
    'Year'
]
# Drop them
df_prep = df_prep.drop(columns=to_drop)

# 10. Quick check
print("Final shape:", df_prep.shape)
print("Any missing in numeric cols?\n", df_prep[num_cols + ['AssessmentScore']].isnull().sum())
print("Mode vs Pref count:\n", df_prep['ModeMatchesPreference'].value_counts())


Final shape: (58, 20)
Any missing in numeric cols?
 Age                    0
YearNum                0
Familiarity            0
VisualHrs              0
AudioHrs               0
PracticalHrs           0
WrittenHrs             0
SupportRating          0
DigitalFreq            0
MethodEffectiveness    0
ResourceRating         0
AssessmentScore        0
dtype: int64
Mode vs Pref count:
 ModeMatchesPreference
False    45
True     13
Name: count, dtype: int64


  df_prep['Start'] = pd.to_datetime(df_prep['Start'])
  df_prep['End']   = pd.to_datetime(df_prep['End'])


In [19]:
print(df_prep.info())
print(df_prep.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58 entries, 0 to 57
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   ID                     58 non-null     int64  
 1   Age                    58 non-null     int64  
 2   Gender                 58 non-null     object 
 3   Familiarity            58 non-null     int64  
 4   VisualHrs              58 non-null     int64  
 5   AudioHrs               58 non-null     int64  
 6   PracticalHrs           58 non-null     int64  
 7   WrittenHrs             58 non-null     int64  
 8   SupportRating          58 non-null     int64  
 9   BestMethod             58 non-null     object 
 10  DigitalFreq            58 non-null     int64  
 11  MethodEffectiveness    58 non-null     int64  
 12  ResourceRating         58 non-null     int64  
 13  ElapsedMins            58 non-null     float64
 14  YearNum                58 non-null     int64  
 15  PrefStyl

In [20]:
# Filter to the “False” cases
false_df = df_prep[df_prep['ModeMatchesPreference'] == False]

# Compute the stats
mean_elapsed = false_df['ElapsedMins'].mean()
min_elapsed  = false_df['ElapsedMins'].min()
max_elapsed  = false_df['ElapsedMins'].max()

print(f"Elapsed time when ModeMatchesPreference is False:")
print(f"  Mean: {mean_elapsed:.1f} minutes")
print(f"  Min : {min_elapsed:.1f} minutes")
print(f"  Max : {max_elapsed:.1f} minutes")


Elapsed time when ModeMatchesPreference is False:
  Mean: 5.6 minutes
  Min : 0.1 minutes
  Max : 137.1 minutes
