## Data Preprocessing

In [1]:
import os
import pandas as pd
import numpy as np
import openpyxl
# Base directory for the CSV/Excel files this notebook reads and writes
# (relative path, so it resolves against the notebook's working directory).
data_dir = '../data'

### Generating Courses Data

In [None]:
# Build the course catalogue: one row per course in a stable order.
#
# The rename runs BEFORE the sort so the cell works on both shapes of
# courses.csv: the raw file (header `course name`) and the file rewritten by
# the save cell below (header `course_name`). With the sort first, re-running
# this cell after the save raised KeyError: 'course name'.
courses = (
    pd.read_csv(os.path.join(data_dir, 'courses.csv'))
    .rename(columns={"course name": "course_name"})
    # Categories in reverse alphabetical order, course names A-Z within each.
    .sort_values(by=['category', 'course_name'], ascending=[False, True])
    .reset_index(drop=True)
)
# Provisional 0-based id taken from the row position after sorting.
courses["course_id"] = courses.index
courses = courses[["course_id", "course_name", "category"]]

In [None]:
# Finalise the catalogue with 1-based ids and write it back to disk.
#
# The id is derived from the index in a single `assign`, not by writing into
# the column and then `+= 1`: the frame comes from a column selection, so
# in-place writes on it can trigger pandas' SettingWithCopyWarning. Deriving
# the id from the index also keeps the cell idempotent when re-run.
courses = (
    courses
    .rename(columns={"course name": "course_name"})
    .assign(course_id=lambda frame: frame.index + 1)
    [["course_id", "course_name", "category"]]
)
# NOTE: this overwrites the same courses.csv that the catalogue was read from.
courses.to_csv(os.path.join(data_dir, "courses.csv"), index=False)

In [3]:
# Read the catalogue back from disk and preview the first rows.
courses_path = os.path.join(data_dir, 'courses.csv')
courses = pd.read_csv(courses_path)
courses.head()

Unnamed: 0,course_id,course_name,category
0,1,Art of living,Spiritual
1,2,Introduction to Bhagvadgita,Spiritual
2,3,Pranvidya,Spiritual
3,4,Yog Vidya,Spiritual
4,5,Mentoring School Children (Abhyudaya),Social


### Combine all sheets into a single CSV file


In [14]:
# Start from an empty frame that only fixes the column layout of the
# combined enrolment table.
final_columns = ['email', 'name', 'uid', 'branch', 'sem', 'course']
final = pd.DataFrame(columns=final_columns)
final.head()

Unnamed: 0,email,name,uid,branch,sem,course


In [15]:
# Load the four exported sheet CSVs and stack them into `final`.
# NOTE(review): the names sem3..sem6 are kept as-is, but the `sem` column of
# the merged frame holds many different semester values, so each file is
# presumably one registration period rather than one semester - confirm.
sheet_files = ['sheet-1.csv', 'sheet-2.csv', 'sheet-3.csv', 'sheet-4.csv']
sem3, sem4, sem5, sem6 = (
    pd.read_csv(os.path.join(data_dir, sheet_file)) for sheet_file in sheet_files
)

# ignore_index gives the combined frame a fresh 0..n-1 index.
final = pd.concat([sem3, sem4, sem5, sem6], ignore_index=True)
final.head()

Unnamed: 0,email,name,uid,branch,sem,course,category
0,shaun.dsouza@spit.ac.in,Shaun D'Souza,2021300031,COMP,1,Basics of Fire Safety,Intellectual
1,ganesh.chaudhari@spit.ac.in,Ganesh Chaudhari,2021510007,MCA,1,Basics of Fire Safety,Intellectual
2,chetan.dhandge@spit.ac.in,Chetan Dhandge,2021510010,MCA,1,Basics of Fire Safety,Intellectual
3,sudhir.gomase@spit.ac.in,Sudhir Narayan Gomase,2021510018,MCA,1,Basics of Fire Safety,Intellectual
4,vishal.padme@spit.ac.in,Vishal Devidas Padme,2021510041,MCA,1,Basics of Fire Safety,Intellectual


In [16]:
# Show which distinct semester values occur in the combined frame.
distinct_sems = final['sem'].unique()
print(distinct_sems)


[1 3 5 7 4 6 8 2]


In [17]:
# Persist the combined frame without the index column.
final_csv_path = os.path.join(data_dir, 'final.csv')
final.to_csv(final_csv_path, index=False)

In [22]:
# Reload the saved file and sanity-check it: dimensions first, then the
# number of missing values in each column.
final_csv = os.path.join(data_dir, "final.csv")
df = pd.read_csv(final_csv)
print(df.shape)
df.isna().sum()

(5377, 7)


email       0
name        0
uid         0
branch      0
sem         0
course      0
category    0
dtype: int64

In [12]:
# Open the four original registration workbooks and report how many sheets
# each one contains.
orig_dir = '../data/original'
workbook_files = (
    'odd sem 2022-23.xlsx',
    'even sem 2022-23.xlsx',
    'odd sem 2023-24.xlsx',
    'even sem 2023-24.xlsx',
)
wb1, wb2, wb3, wb4 = (
    openpyxl.load_workbook(os.path.join(orig_dir, workbook_file))
    for workbook_file in workbook_files
)

# One sheet count per workbook, in the same order as the files above.
c1, c2, c3, c4 = (len(workbook.sheetnames) for workbook in (wb1, wb2, wb3, wb4))

sheet_report = (
    ("ODD SEM 2022-23: ", c1),
    ("EVEN SEM 2022-23: ", c2),
    ("ODD SEM 2023-24: ", c3),
    ("EVEN SEM 2023-24: ", c4),
)
for period_label, sheet_count in sheet_report:
    print(period_label, sheet_count, "sheets")

ODD SEM 2022-23:  21 sheets
EVEN SEM 2022-23:  21 sheets
ODD SEM 2023-24:  27 sheets
EVEN SEM 2023-24:  33 sheets


In [86]:
# Append the odd-semester 2022-23 registration sheets to `final`.
#
# NOTE(review): range(0, 20) covers 20 sheets, while the sheet-count output
# earlier in this notebook reports 21 sheets for this workbook - confirm the
# last sheet is skipped on purpose.
# NOTE(review): the workbook is read from data_dir here, but the sheet-count
# cell opens it from '../data/original' - confirm which location is intended.
odd_22_23_path = os.path.join(data_dir, "odd sem 2022-23.xlsx")

# A list of sheet indices makes pandas parse the workbook once and return a
# {sheet index: DataFrame} dict, instead of re-opening the file per sheet.
odd_22_23_sheets = pd.read_excel(odd_22_23_path, sheet_name=list(range(0, 20)))

odd_22_23_frames = []
for sheet_no, df in odd_22_23_sheets.items():
    df.columns = ["email", "name", "uid", "class", "branch", "course", "sem"]
    if sheet_no == 0:
        # Row label 16 of the first sheet is removed before merging.
        # NOTE(review): presumably a stray or invalid row - confirm.
        df = df.drop(16)
    # Selecting the target columns also discards the unused "class" column.
    odd_22_23_frames.append(df[["email", "name", "uid", "branch", "sem", "course"]])

# Single concat at the end, not one concat per sheet inside the loop.
final = pd.concat([final, *odd_22_23_frames], ignore_index=True)

In [87]:
# Append the even-semester 2022-23 registration sheets to `final`.
#
# NOTE(review): the workbook is read from data_dir here, but the sheet-count
# cell opens it from '../data/original' - confirm which location is intended.
even_22_23_path = os.path.join(data_dir, "even sem 2022-23.xlsx")

# A list of sheet indices makes pandas parse the workbook once and return a
# {sheet index: DataFrame} dict, instead of re-opening the file per sheet.
even_22_23_sheets = pd.read_excel(even_22_23_path, sheet_name=list(range(0, 21)))

even_22_23_frames = []
for sheet_no, df in even_22_23_sheets.items():
    if sheet_no == 11:
        # This sheet has one extra trailing column; drop it so the eight
        # column names below line up.
        df = df.drop(columns=[df.columns[-1]])
    df.columns = ["timestamp", "email", "name", "uid", "class", "branch", "course", "sem"]
    # Selecting the target columns also discards "timestamp" and "class".
    even_22_23_frames.append(df[["email", "name", "uid", "branch", "sem", "course"]])

# Single concat at the end, not one concat per sheet inside the loop.
final = pd.concat([final, *even_22_23_frames], ignore_index=True)

In [None]:
# Append the odd-semester 2023-24 registration sheets to `final`.
#
# NOTE(review): the workbook is read from data_dir here, but the sheet-count
# cell opens it from '../data/original' - confirm which location is intended.
odd_23_24_path = os.path.join(data_dir, 'odd sem 2023-24.xlsx')

# A list of sheet indices makes pandas parse the workbook once and return a
# {sheet index: DataFrame} dict, instead of re-opening the file per sheet.
odd_23_24_sheets = pd.read_excel(odd_23_24_path, sheet_name=list(range(0, 27)))

odd_23_24_frames = []
for sheet_no, df in odd_23_24_sheets.items():
    df.columns = ['email', 'name', 'uid', 'class', 'branch', 'course', 'sem']
    # Selecting the target columns also discards the unused 'class' column.
    odd_23_24_frames.append(df[['email', 'name', 'uid', 'branch', 'sem', 'course']])

# Single concat at the end, not one concat per sheet inside the loop.
final = pd.concat([final, *odd_23_24_frames], ignore_index=True)

In [None]:
# Append the even-semester 2023-24 registration sheets to `final`.
#
# NOTE(review): the workbook is read from data_dir here, but the sheet-count
# cell opens it from '../data/original' - confirm which location is intended.
even_23_24_path = os.path.join(data_dir, 'even sem 2023-24.xlsx')

# Sheet 0 is the nptel sheet and is skipped, so indices start at 1.
# A list of sheet indices makes pandas parse the workbook once and return a
# {sheet index: DataFrame} dict, instead of re-opening the file per sheet.
even_23_24_sheets = pd.read_excel(even_23_24_path, sheet_name=list(range(1, 33)))

even_23_24_frames = []
for sheet_no, df in even_23_24_sheets.items():
    df.columns = ['email', 'name', 'uid', 'class', 'branch', 'course', 'sem']
    # Selecting the target columns also discards the unused 'class' column.
    even_23_24_frames.append(df[['email', 'name', 'uid', 'branch', 'sem', 'course']])

# Single concat at the end, not one concat per sheet inside the loop.
final = pd.concat([final, *even_23_24_frames], ignore_index=True)

Normalizing branch names in the final sheet:

In [13]:
import pandas as pd
import os
# Load the enrolment table and print its column dtypes / non-null counts.
enrolments_path = os.path.join(data_dir, 'enrolments.csv')
df = pd.read_csv(enrolments_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5377 entries, 0 to 5376
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        0 non-null      float64
 1   email     5377 non-null   object 
 2   name      5377 non-null   object 
 3   uid       5377 non-null   int64  
 4   branch    5377 non-null   object 
 5   sem       5377 non-null   int64  
 6   course    5377 non-null   object 
 7   category  5377 non-null   object 
 8   period    5377 non-null   object 
dtypes: float64(1), int64(2), object(6)
memory usage: 378.2+ KB


In [9]:
# Print each distinct branch label on its own line, in order of first
# appearance, to spot spelling variants.
for branch_label in df['branch'].drop_duplicates():
    print(branch_label)

COMP
MCA
AI-ML
CSE-DS
ETRX
EXTC
IT
CSE-AIML
CS-AIML
CS-DS
EXTC-A
CSE


In [10]:
def normalize_branch(branch):
    """Map a branch label to its canonical spelling.

    'AI-ML' and 'CS-AIML' become 'CSE-AIML', 'CS-DS' becomes 'CSE-DS' and
    'EXTC-A' becomes 'EXTC'. Any other value is returned unchanged.
    """
    # (accepted spellings, canonical label) pairs, checked in order.
    alias_groups = (
        (('AI-ML', 'CS-AIML'), 'CSE-AIML'),
        (('CS-DS',), 'CSE-DS'),
        (('EXTC-A',), 'EXTC'),
    )
    for aliases, canonical in alias_groups:
        if branch in aliases:
            return canonical
    return branch

# Rewrite the branch column element by element with the canonical labels.
df['branch'] = df['branch'].map(normalize_branch)

In [11]:
# Print the distinct branch labels again to verify the normalisation.
for branch_label in df['branch'].drop_duplicates():
    print(branch_label)

COMP
MCA
CSE-AIML
CSE-DS
ETRX
EXTC
IT
CSE


In [12]:
# Write the frame to enrolments.csv without the index column.
enrolments_csv = os.path.join(data_dir, 'enrolments.csv')
df.to_csv(enrolments_csv, index=False)

In [16]:
# Preview the first ten enrolment rows.
# The drop below is disabled; when enabled it removes the `id` column in place.
# NOTE(review): decide whether to keep this commented-out step or delete it.
# df.drop(columns='id', inplace=True)
df.head(10)

Unnamed: 0,email,name,uid,branch,sem,course,category,period
0,shaun.dsouza@spit.ac.in,Shaun D'Souza,2021300031,COMP,1,Basics of Fire Safety,Intellectual,ODD_SEM_22_23
1,ganesh.chaudhari@spit.ac.in,Ganesh Chaudhari,2021510007,MCA,1,Basics of Fire Safety,Intellectual,ODD_SEM_22_23
2,chetan.dhandge@spit.ac.in,Chetan Dhandge,2021510010,MCA,1,Basics of Fire Safety,Intellectual,ODD_SEM_22_23
3,sudhir.gomase@spit.ac.in,Sudhir Narayan Gomase,2021510018,MCA,1,Basics of Fire Safety,Intellectual,ODD_SEM_22_23
4,vishal.padme@spit.ac.in,Vishal Devidas Padme,2021510041,MCA,1,Basics of Fire Safety,Intellectual,ODD_SEM_22_23
5,pratik.parale@spit.ac.in,Pratik Parale,2021510045,MCA,1,Basics of Fire Safety,Intellectual,ODD_SEM_22_23
6,amit.rathod@spit.ac.in,Amit Manohar Rathod,2021510053,MCA,1,Basics of Fire Safety,Intellectual,ODD_SEM_22_23
7,sandesh.shivane@spit.ac.in,Sandesh Shivane,2021510063,MCA,1,Basics of Fire Safety,Intellectual,ODD_SEM_22_23
8,prashant.singh2@spit.ac.in,Prashant Singh,2021510064,MCA,1,Basics of Fire Safety,Intellectual,ODD_SEM_22_23
9,rahul.shinde@spit.ac.in,Rahul Pandharinath Shinde,2021600061,CSE-AIML,3,Basics of Fire Safety,Intellectual,ODD_SEM_22_23


course
Art of living                             27
Astitva                                   40
Basic Cooking                            116
Basics of Fire Safety                    286
Basics of Keyboard playing               129
Basics of Music Composition               35
Cinematography                            40
Design Thinking                          374
Film Appreciation                        337
Fundamentals of Photography              291
Idea Lab                                  43
Indian Knowledge System                   49
Innovation and Creativity                294
Integrated Personality Development       198
Introduction to Bhagvadgita              123
Jeevan Vidya (Work Life Balance)          38
Kathak                                    14
Mentoring School Children (Abhyudaya)    225
Mentorship to Juniors                     74
NSS                                      126
Physical Fitness                          37
Pranvidya                                 87
Pri

In [38]:
import plotly.express as px

# Horizontal bar chart of the number of enrolled students per course,
# coloured by course category.
#
# The course catalogue is loaded here from data_dir. The cell previously
# relied on `a`, which is only assigned in a later cell (from an absolute path
# on one machine), so it raised NameError on a fresh kernel run.
# NOTE(review): assumes data_dir/courses.csv is the same file as that absolute
# path - confirm.
course_catalog = pd.read_csv(os.path.join(data_dir, 'courses.csv'))

# Count enrolment rows per course; the Series name becomes the column name
# after the merge.
x = df.groupby('course')['uid'].count()
x.name = 'Count of students'

# Attach course_id / category by matching the course name, then sort so the
# smallest courses are listed first.
x = pd.merge(x, course_catalog, left_index=True, right_on='course_name').sort_values(by='Count of students', ascending=True)

# NOTE: the saved output of this cell shows that inline rendering requires
# nbformat>=4.2.0 to be installed in the kernel environment.
px.bar(x, x='Count of students', y='course_name', orientation='h', color='category')

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [29]:
# For every category, collect the distinct course names that appear in it.
y = df['course'].groupby(df['category']).unique()

In [30]:
# Display `y` as the cell output.
y

category
Emotional       [Integrated Personality Development, Jeevan Vi...
Intellectual    [Basics of Fire Safety, Basics of Keyboard pla...
Physical        [Kathak, Physical Fitness, Self Defense for Wo...
Social          [Mentoring School Children (Abhyudaya), Mentor...
Spiritual       [Pranvidya, Yog Vidya, Introduction to Bhagvad...
Name: course, dtype: object

In [33]:
# Load the course catalogue via data_dir instead of an absolute path under one
# user's Desktop, so the cell runs on any machine.
# NOTE(review): assumes data_dir/courses.csv is the file the absolute path
# pointed at - confirm.
a = pd.read_csv(os.path.join(data_dir, 'courses.csv'))

Unnamed: 0,Count of students,course_id,course_name,category
10,6,11,Volunteering at Dr. Amte's Anandvan,Social
7,12,8,Principle Centered Leadership,Social
14,14,15,Kathak,Physical
41,16,42,Universal Human Value,Emotional
36,16,37,Social Psychology,Intellectual
38,17,39,Technology Entrepreneurship,Intellectual
21,17,22,Swimming,Physical
0,27,1,Art of living,Spiritual
9,27,10,Terrace Gardening,Social
27,35,28,Basics of Music Composition,Intellectual


In [32]:
# Load the course catalogue via data_dir instead of an absolute path under one
# user's Desktop, so the cell runs on any machine, then preview it.
# NOTE(review): assumes data_dir/courses.csv is the file the absolute path
# pointed at - confirm.
a = pd.read_csv(os.path.join(data_dir, 'courses.csv'))
a.head()

Unnamed: 0,course_id,course_name,category
0,1,Art of living,Spiritual
1,2,Introduction to Bhagvadgita,Spiritual
2,3,Pranvidya,Spiritual
3,4,Yog Vidya,Spiritual
4,5,Mentoring School Children (Abhyudaya),Social
