In [None]:
# Raw grades for 22 students, kept as a plain Python list
data = [
    50, 50, 47, 97, 49, 3, 53, 42, 26, 74, 82,
    62, 37, 15, 70, 27, 36, 35, 48, 52, 63, 64,
]
print(data)

Load the data into a NumPy array.

In [None]:
import numpy as np

# Wrap the Python list in a NumPy array so it supports element-wise maths
grades = np.array(data)
print(grades)

Differences between a list and a NumPy array:
the two data types behave differently; an expression that multiplies each of them by 2 shows this.

In [None]:
# Multiplying a list by 2 repeats its contents; multiplying an array by 2
# doubles every element.
print(type(data), 'x 2:', data * 2)
print('---')
print(type(grades), 'x 2:', grades * 2)

In [None]:
# Dimensions of the array, returned as a tuple
grades.shape

In [None]:
# First element of the array (indexing is zero-based)
grades[0]

You can apply aggregations across the elements in the array

In [None]:
# Arithmetic mean of all the grades
grades.mean()

Add a second set of data for the same students. This time, we'll record the typical number of hours per week they devoted to studying.

In [None]:
# Weekly study hours for the same students, as a plain Python list
study_hours = [
    10.0, 11.5, 9.0, 16.0, 9.25, 1.0, 11.5, 9.0, 8.5, 14.5, 15.5,
    13.75, 9.0, 8.0, 15.5, 8.0, 9.0, 6.0, 10.0, 12.0, 12.5, 12.0,
]

# Combine both sequences into one 2D array: row 0 holds the study hours,
# row 1 holds the grades
student_data = np.array([study_hours, grades])

# Display the array
student_data

In [None]:
# Show the shape of the 2D array as a (rows, columns) tuple
student_data.shape

In [None]:
# Show the first element of the first row, using a single (row, column) index
student_data[0, 0]

In [None]:
# Average each row of the 2D array: row 0 is study hours, row 1 is grades
avg_study = np.mean(student_data[0])
avg_grade = np.mean(student_data[1])

print(
    'Average study hours: {:.2f}\nAverage grade: {:.2f}'.format(avg_study, avg_grade)
)

# Exploring tabular data with Pandas

Create a DataFrame with three columns. The first column is a list of student names, and the second and third columns are the NumPy arrays containing the study time and grade data.

In [None]:
import pandas as pd

# Build a three-column table: the student names plus the two rows of the
# 2D array (row 0 = study hours, row 1 = grades)
df_students = pd.DataFrame({
    'Name': [
        'Dan', 'Joann', 'Pedro', 'Rosie', 'Ethan', 'Vicky', 'Frederic', 'Jimmie',
        'Rhonda', 'Giovanni', 'Francesca', 'Rajab', 'Naiyana', 'Kian', 'Jenny',
        'Jakeem', 'Helena', 'Ismat', 'Anila', 'Skye', 'Daniel', 'Aisha',
    ],
    'StudyHours': student_data[0],
    'Grade': student_data[1],
})

df_students

# Finding and filtering data in a DataFrame

In [None]:
# Get the row whose index label is 5
df_students.loc[5]

In [None]:
# Get the rows with index labels 0 through 5 (loc slices include the end label)
df_students.loc[0:5]

In [None]:
# Get the first five rows by position (iloc slices exclude the end position)
df_students.iloc[0:5]

In [None]:
# iloc also accepts column positions: this returns the values of the columns
# in positions 1 and 2 for the row in position 0
df_students.iloc[0,[1,2]]

In [None]:
# In the absence of an explicit index column, the rows in our DataFrame are
# indexed as integer values, but the columns are identified by name.
# Here loc looks up row label 0 and the column named 'Grade'.
df_students.loc[0,'Grade']

In [None]:
# loc also accepts a boolean expression: keep only the rows where Name is 'Aisha'
df_students.loc[df_students['Name']=='Aisha']

In [None]:
# You don't need to use loc explicitly for this: indexing the DataFrame with
# a boolean filtering expression selects the same rows.
# display() is used because a notebook cell only renders its last expression;
# without it this result would be computed and silently discarded.
display(df_students[df_students['Name']=='Aisha'])

# For good measure, the query method expresses the same filter as a string
df_students.query('Name=="Aisha"')

In [None]:
# You can specify the column name as a named index value (as in df_students['Name']),
# or access the column as an attribute of the DataFrame, like this:
df_students[df_students.Name == 'Aisha']

## Loading a DataFrame from a file

In [None]:
# replace the student grades DataFrame with the contents of a text file.
!wget https://raw.githubusercontent.com/MicrosoftDocs/mslearn-introduction-to-machine-learning/main/Data/ml-basics/grades.csv
df_students = pd.read_csv('grades.csv',delimiter=',',header='infer')
df_students.head()

## Handling missing values

You can use the isnull method to identify which individual values are null:

In [None]:
# Flag every individual value that is null (True = missing).
# display() is used because a notebook cell only renders its last expression;
# without it this result would be computed and silently discarded.
display(df_students.isnull())

# Get the number of missing values in each column
df_students.isnull().sum()

In [None]:
# Filter the DataFrame to include only the rows in which any column is null
# (any(axis=1) evaluates the null flags across the columns of each row).
df_students[df_students.isnull().any(axis=1)]

In [None]:
# Impute the missing StudyHours values: fillna() replaces each null with the
# mean of the column
df_students['StudyHours'] = df_students['StudyHours'].fillna(df_students['StudyHours'].mean())
df_students

In [None]:
# You can drop rows or columns that contain null values by using the dropna method.
# Here axis=0 drops rows, and how='any' drops a row if any of its values is null.
df_students = df_students.dropna(axis=0, how='any')
df_students

In [None]:
# Explore the data in the DataFrame by comparing the mean study hours and grades.

# Get the mean study hours using the column name as an index
mean_study = df_students['StudyHours'].mean()

# Get the mean grade using the column name as a property (just to make the point!)
mean_grade = df_students.Grade.mean()

# Print the mean study hours and mean grade
print('Average weekly study hours: {:.2f}\nAverage grade: {:.2f}'.format(mean_study, mean_grade))

In [None]:
# Filter the DataFrame to find only the students who studied for more than the average amount of time.

# Get students who studied for more than the mean number of hours (strict >,
# so a student exactly at the mean is excluded)
df_students[df_students.StudyHours > mean_study]


In [None]:
# Average grade of the students whose study time is above the mean:
# select the matching rows and the Grade column in one loc lookup
df_students.loc[df_students.StudyHours > mean_study, 'Grade'].mean()


Assume that the passing grade for the course is 60.
We'll create a Pandas Series containing the pass/fail indicator
(True or False), and then we'll concatenate that series as a
new column (axis 1) in the DataFrame.

In [None]:
# Boolean Series: True where the grade reaches the passing mark of 60
passes = df_students['Grade'] >= 60

# Concatenate the Series as a new "Pass" column (axis=1).
# Any existing "Pass" column is dropped first so that re-running this cell
# does not append a duplicate column (which would make df_students.Pass a
# DataFrame and break the groupby cells that follow).
df_students = pd.concat(
    [df_students.drop(columns='Pass', errors='ignore'), passes.rename("Pass")],
    axis=1,
)

df_students

Use the groupby method to group the student data into groups based on the Pass column you added previously and to count the number of names in each group. In other words, you can determine how many students passed and failed.

In [None]:
# Count the student names in each Pass group (False = failed, True = passed)
print(df_students.groupby('Pass')['Name'].count())

You can aggregate multiple fields in a group using any available aggregation function. For example, you can find the mean study time and grade for the groups of students who passed and failed the course.

In [None]:
# Mean study hours and mean grade for the students who failed and who passed.
# Several columns must be selected with a list (double brackets): indexing a
# GroupBy with a bare tuple of names, ['StudyHours', 'Grade'], was deprecated
# and raises an error in pandas 2.0 and later.
print(df_students.groupby(df_students.Pass)[['StudyHours', 'Grade']].mean())

In [None]:
# Reorder the rows from the highest grade to the lowest
df_students = df_students.sort_values(by='Grade', ascending=False)

# Display the sorted DataFrame
df_students