# *Task 1: Data Handling with NumPy & Pandas*

<p style='font-size:20px;'><i>The aim of this task is to build a strong foundation in data loading, data cleaning, and data manipulation using the core Python libraries pandas and NumPy.</i></p>

## *Data Loading and Cleaning*

In [1]:
# Import the necessary libraries
import pandas as pd
import numpy as np

In [2]:
# Read the student-performance CSV (path is relative to the working directory) into a pandas dataframe
df = pd.read_csv(filepath_or_buffer='Student Performance.csv')

In [3]:
# Preview the first five rows to sanity-check the load
df.head(n=5)

Unnamed: 0,Student_ID,Age,Gender,Study_Hours_per_Week,Preferred_Learning_Style,Online_Courses_Completed,Participation_in_Discussions,Assignment_Completion_Rate (%),Exam_Score (%),Attendance_Rate (%),Use_of_Educational_Tech,Self_Reported_Stress_Level,Time_Spent_on_Social_Media (hours/week),Sleep_Hours_per_Night,Final_Grade
0,S00001,18,Female,48,Kinesthetic,14,Yes,100,69,66,Yes,High,9,8,C
1,S00002,29,Female,30,Reading/Writing,20,No,71,40,57,Yes,Medium,28,8,D
2,S00003,20,Female,47,Kinesthetic,11,No,60,43,79,Yes,Low,13,7,D
3,S00004,23,Female,13,Auditory,0,Yes,63,70,60,Yes,Low,24,10,B
4,S00005,19,Female,24,Auditory,19,Yes,59,63,93,Yes,Medium,26,8,C


In [4]:
# Dimensions of the dataframe as a (rows, columns) tuple
df.shape

(10000, 15)

In [5]:
# List the column labels of the dataframe
df.columns

Index(['Student_ID', 'Age', 'Gender', 'Study_Hours_per_Week',
       'Preferred_Learning_Style', 'Online_Courses_Completed',
       'Participation_in_Discussions', 'Assignment_Completion_Rate (%)',
       'Exam_Score (%)', 'Attendance_Rate (%)', 'Use_of_Educational_Tech',
       'Self_Reported_Stress_Level', 'Time_Spent_on_Social_Media (hours/week)',
       'Sleep_Hours_per_Night', 'Final_Grade'],
      dtype='object')

In [6]:
# Print a summary of the dataframe: per-column non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 15 columns):
 #   Column                                   Non-Null Count  Dtype 
---  ------                                   --------------  ----- 
 0   Student_ID                               10000 non-null  object
 1   Age                                      10000 non-null  int64 
 2   Gender                                   10000 non-null  object
 3   Study_Hours_per_Week                     10000 non-null  int64 
 4   Preferred_Learning_Style                 10000 non-null  object
 5   Online_Courses_Completed                 10000 non-null  int64 
 6   Participation_in_Discussions             10000 non-null  object
 7   Assignment_Completion_Rate (%)           10000 non-null  int64 
 8   Exam_Score (%)                           10000 non-null  int64 
 9   Attendance_Rate (%)                      10000 non-null  int64 
 10  Use_of_Educational_Tech                  10000 non-null  ob

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numerical features
df.describe()

Unnamed: 0,Age,Study_Hours_per_Week,Online_Courses_Completed,Assignment_Completion_Rate (%),Exam_Score (%),Attendance_Rate (%),Time_Spent_on_Social_Media (hours/week),Sleep_Hours_per_Night
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,23.4788,27.1303,10.0079,74.922,70.1889,75.0851,14.9365,6.9793
std,3.461986,13.002547,6.136726,14.675437,17.649447,14.749251,9.022639,1.996965
min,18.0,5.0,0.0,50.0,40.0,50.0,0.0,4.0
25%,20.0,16.0,5.0,62.0,55.0,62.0,7.0,5.0
50%,23.0,27.0,10.0,75.0,70.0,75.0,15.0,7.0
75%,27.0,38.0,15.0,88.0,85.0,88.0,23.0,9.0
max,29.0,49.0,20.0,100.0,100.0,100.0,30.0,10.0


In [8]:
# Count the missing values in each column
df.isna().sum()

Student_ID                                 0
Age                                        0
Gender                                     0
Study_Hours_per_Week                       0
Preferred_Learning_Style                   0
Online_Courses_Completed                   0
Participation_in_Discussions               0
Assignment_Completion_Rate (%)             0
Exam_Score (%)                             0
Attendance_Rate (%)                        0
Use_of_Educational_Tech                    0
Self_Reported_Stress_Level                 0
Time_Spent_on_Social_Media (hours/week)    0
Sleep_Hours_per_Night                      0
Final_Grade                                0
dtype: int64

In [9]:
# Count fully duplicated rows (every occurrence after the first is flagged)
df.duplicated(keep='first').sum()

0

## *Data Manipulation*

### *Basic Statistics*

In [10]:
# Pull the numeric columns out of the dataframe as a 2-D NumPy array
# (one row per dataframe row, one column per numeric feature)
numeric_data = df.select_dtypes(include='number').to_numpy()
numeric_data

array([[18, 48, 14, ..., 66,  9,  8],
       [29, 30, 20, ..., 57, 28,  8],
       [20, 47, 11, ..., 79, 13,  7],
       ...,
       [26, 23,  3, ..., 70, 20,  8],
       [18, 41,  7, ..., 90,  6,  8],
       [24,  8, 20, ..., 81, 17,  4]], dtype=int64)

In [21]:
# Column-wise mean: reducing over axis 0 (the rows) leaves one value per feature
numeric_data_mean = numeric_data.mean(axis=0)
print('Mean of Each Numerical Feature:\n', numeric_data_mean.tolist())

Mean of Each Numerical Feature:
 [23.4788, 27.1303, 10.0079, 74.922, 70.1889, 75.0851, 14.9365, 6.9793]


In [22]:
# Column-wise median: one value per numeric feature
numeric_data_median = np.median(a=numeric_data, axis=0)
print('Median of Each Numerical Feature:\n', numeric_data_median.tolist())

Median of Each Numerical Feature:
 [23.0, 27.0, 10.0, 75.0, 70.0, 75.0, 15.0, 7.0]


In [23]:
# Column-wise standard deviation: one value per numeric feature.
# ddof=0 (NumPy's default) gives the population standard deviation, which is why
# these values come out slightly smaller than the 'std' row reported by df.describe().
numeric_data_std = numeric_data.std(axis=0, ddof=0)
print('Standard Deviation of Each Numerical Feature:\n', numeric_data_std.tolist())

Standard Deviation of Each Numerical Feature:
 [3.461813189644987, 13.00189685815112, 6.136418954895436, 14.674703267868827, 17.648564156610586, 14.748513755290736, 9.022187525761144, 1.9968654211037857]


In [24]:
# Assemble the summary table: one row per statistic, one column per numeric feature,
# with every value rounded to two decimal places
feature_names = df.select_dtypes(include='number').columns
feature_stats = pd.DataFrame(
    data=np.round(
        [numeric_data_mean, numeric_data_median, numeric_data_std],
        decimals=2,
    ),
    index=['Mean', 'Median', 'Standard Deviation'],
    columns=feature_names,
)

In [25]:
# Show the summary table (rows: statistics, columns: numeric features)
feature_stats

Unnamed: 0,Age,Study_Hours_per_Week,Online_Courses_Completed,Assignment_Completion_Rate (%),Exam_Score (%),Attendance_Rate (%),Time_Spent_on_Social_Media (hours/week),Sleep_Hours_per_Night
Mean,23.48,27.13,10.01,74.92,70.19,75.09,14.94,6.98
Median,23.0,27.0,10.0,75.0,70.0,75.0,15.0,7.0
Standard Deviation,3.46,13.0,6.14,14.67,17.65,14.75,9.02,2.0


### *NumPy Array Operations*

In [41]:
# Locate the 'Exam_Score (%)' column within feature_names
index = feature_names.get_loc('Exam_Score (%)')
# Slice that column out of numeric_data as a float64 copy, leaving the integer source array untouched
Exam_Score = numeric_data[:, index].astype(float)
# Scale every score by 1.02, i.e. raise it by 2 percent
# NOTE(review): the scaled values are not capped, so a score of 100 becomes 102 — confirm that is intended
Exam_Score_increased = 1.02 * Exam_Score

In [42]:
# Display the exam scores after the 2 percent increase
Exam_Score_increased

array([70.38, 40.8 , 43.86, ..., 40.8 , 45.9 , 40.8 ])

In [43]:
# Standardize each numeric feature to a z-score: (value - column mean) / column standard deviation.
# A constant column has a standard deviation of 0, which would turn the division into NaN/inf;
# dividing by 1 instead leaves such a column at 0 after centering. Columns with a non-zero
# standard deviation are unaffected by this guard.
safe_std = np.where(numeric_data_std == 0, 1.0, numeric_data_std)
normalized_numeric_data = (numeric_data - numeric_data_mean) / safe_std

In [44]:
# Display the normalized numeric data (each column centered on its mean and divided by its standard deviation)
normalized_numeric_data

array([[-1.58263884,  1.60512733,  0.65055858, ..., -0.61600105,
        -0.65798898,  0.51115112],
       [ 1.59488675,  0.22071395,  1.62832754, ..., -1.22623203,
         1.44793045,  0.51115112],
       [-1.00490691,  1.52821548,  0.1616741 , ...,  0.2654437 ,
        -0.21463752,  0.01036625],
       ...,
       [ 0.72828887, -0.31766903, -1.14201785, ..., -0.34478728,
         0.56122753,  0.51115112],
       [-1.58263884,  1.06674435, -0.49017188, ...,  1.01128156,
        -0.99050258,  0.51115112],
       [ 0.15055694, -1.47134685,  1.62832754, ...,  0.40105058,
         0.22871393, -1.49198838]])