--- 
**Author** : Huzaifa Ali

**Email** : *huzaifaa0303@gmail.com*

---

In [50]:
# Install the dependencies first if they are missing: pip install -r requirements.txt

1. Create a DataFrame from a dictionary.

In [51]:
import pandas as pd

# Sample student records, one list per column. The last student has no
# Age or Marks, so each of those columns holds exactly one missing value.
student_columns = {
    'Name': ['Huzaifa', 'Amina', 'Ali', 'Izza', 'Fizza'],
    'Age': [23, 25, 26, 27, None],
    'Subject': ['Computer', 'Arts', 'Physics', 'Maths', 'Chemistry'],
    'Marks': [90, 70, 60, 50, None],
}
data = pd.DataFrame(student_columns)

---

 2. Read a CSV into a DataFrame.


In [52]:
# Write the DataFrame to disk first so that there is a CSV file to read back.
# index=False keeps the row labels out of the file; they are not needed there
# because read_csv assigns a default integer index when the file is loaded.
csv_path = 'sample_data.csv'
data.to_csv(csv_path, index=False)

# Load the file that was just written back into a DataFrame.
data = pd.read_csv(csv_path)

--- 

3. Display first 5 rows.

In [53]:
# Preview the first 5 rows of the table.
print(data.head(n=5))

      Name   Age    Subject  Marks
0  Huzaifa  23.0   Computer   90.0
1    Amina  25.0       Arts   70.0
2      Ali  26.0    Physics   60.0
3     Izza  27.0      Maths   50.0
4    Fizza   NaN  Chemistry    NaN


---

4. Fill missing values with column mean.

In [54]:
# Count the missing entries in every column before filling anything.
data.isna().sum()

Name       0
Age        1
Subject    0
Marks      1
dtype: int64

In [55]:
# Fill the gaps in the numeric columns with a value derived from each
# column's mean. NOTE: int() drops the fractional part, so the fill value is
# the truncated mean (Age 25.25 -> 25, Marks 67.5 -> 67), not the exact mean.
fill_values = {
    column: int(data[column].mean())
    for column in ('Age', 'Marks')
}

# fillna returns a new DataFrame by default, so `data` itself is left as is.
cleaned_data = data.fillna(value=fill_values)

print(cleaned_data)

      Name   Age    Subject  Marks
0  Huzaifa  23.0   Computer   90.0
1    Amina  25.0       Arts   70.0
2      Ali  26.0    Physics   60.0
3     Izza  27.0      Maths   50.0
4    Fizza  25.0  Chemistry   67.0


--- 

5. Sort DataFrame by a column.


In [56]:
# Order the rows alphabetically by student name (ascending order).
sorted_data = cleaned_data.sort_values(by='Name', ascending=True)

print(sorted_data)

      Name   Age    Subject  Marks
2      Ali  26.0    Physics   60.0
1    Amina  25.0       Arts   70.0
4    Fizza  25.0  Chemistry   67.0
0  Huzaifa  23.0   Computer   90.0
3     Izza  27.0      Maths   50.0


---

 6. Filter rows (marks > 80).


In [57]:
# Keep only the rows whose Marks are strictly greater than 80.
scored_above_80 = sorted_data['Marks'] > 80
filtered_data = sorted_data[scored_above_80]
print(filtered_data)

      Name   Age   Subject  Marks
0  Huzaifa  23.0  Computer   90.0


---

 7. Add pass/fail column.

In [58]:
# Label every row: Marks above 50 are a pass, Marks of 50 or below are a fail.
# The 'Result' column is added to cleaned_data itself rather than to a copy.
scored_above_50 = cleaned_data['Marks'].gt(50)
scored_50_or_less = cleaned_data['Marks'].le(50)
cleaned_data.loc[scored_above_50, 'Result'] = 'Pass'
cleaned_data.loc[scored_50_or_less, 'Result'] = 'Fail'

print(f'New Data : \n{cleaned_data}')

New Data : 
      Name   Age    Subject  Marks Result
0  Huzaifa  23.0   Computer   90.0   Pass
1    Amina  25.0       Arts   70.0   Pass
2      Ali  26.0    Physics   60.0   Pass
3     Izza  27.0      Maths   50.0   Fail
4    Fizza  25.0  Chemistry   67.0   Pass


---

 8. Group by column and compute mean.

In [59]:
# Restrict the table to its numeric columns, then average Marks within each
# Age group.
numeric_data = cleaned_data.select_dtypes(include=['number'])
marks_by_age = numeric_data.groupby(by='Age')['Marks']
mean_of_numeric_data = marks_by_age.mean()

print(f'Mean of Data Value : \n{mean_of_numeric_data}')

Mean of Data Value : 
Age
23.0    90.0
25.0    68.5
26.0    60.0
27.0    50.0
Name: Marks, dtype: float64
