In [1]:
import pandas as pd

# On the given 5 data sets, perform the following analyses

# 1. Basic Information and Exploration
- What are the first and last 5 rows of the dataset?

- What are the summary statistics of the numerical columns (mean, std, min, max, etc.)?
- Are there any missing values in the dataset? If so, how many in each column?

# 2. Data Cleaning
- Are there any duplicate rows in the DataFrame?
- How would you handle missing data? Should you drop, fill, or leave them as is?
- How would you handle incorrect or invalid data entries (e.g., incorrect dates or non-numeric values)?
- How can you convert a column into a different data type (e.g., from string to datetime)?
- How would you deal with duplicate rows in the DataFrame?

# 3. Data Transformation
- How can you create new columns derived from existing ones (e.g., by performing arithmetic operations)?
- How can you filter rows based on a specific condition in a column (e.g., only rows where salary > 50000)?
- How can you replace specific values in a column (e.g., replacing 'NaN' with a default value)?
- How can you apply a custom function to each value in a column?

# 4. Data Aggregation
- How can you calculate summary statistics like sum, mean, or count for a specific column or group of columns?
- How would you group the data by a particular column (e.g., by department) and calculate aggregated values (sum, mean)?

# 5. Data Transformation
- Perform split and merge
- Pivot and unpivot columns  

# 6. Data Analysis		
- perform masking
- groupby some columns
- create one new conditional column

# 7. Exporting and Saving Data
- How would you save the DataFrame to a CSV file or Excel file?
- How can you save the DataFrame with specific columns or without the index?
- How can you export a DataFrame to a different format (e.g., JSON, HDF5)?
- How would you import data from an external file into a DataFrame?

In [2]:
# 1. Student Grades Dataset  

# Column-oriented records: the three lists are parallel, one entry per student.
students = dict(
    Student=[
        "Alice Johnson",
        "Bob Smith",
        "Charlie Lee",
        "Diana White",
        "Ethan Brown",
        "Fiona Green",
        "George Davis",
        "Hannah Taylor",
        "Ian Scott",
        "Jack Moore",
        "Kelly Fox",
        "Liam Young",
        "Mia Hill",
        "Noah Clark",
        "Olivia Allen",
    ],
    Grade=[
        90, 85, 88, 92, 76,
        95, 89, 78, 84, 91,
        87, 93, 85, 90, 88,
    ],
    Course=[
        "Math", "Science", "History", "Math", "Science",
        "History", "Math", "History", "Math", "Science",
        "History", "Math", "Science", "History", "Math",
    ],
)

# Tabular view of the records; the cells below all operate on this frame.
stud = pd.DataFrame(data=students)

### 1. Basic Information and Exploration


In [3]:
# What are the first and last 5 rows of the dataset?
# The head is printed as plain text; the tail is the cell's final expression,
# so the notebook renders it as the cell's output.
print(stud.head(n=5))
stud.tail(n=5)

         Student  Grade   Course
0  Alice Johnson     90     Math
1      Bob Smith     85  Science
2    Charlie Lee     88  History
3    Diana White     92     Math
4    Ethan Brown     76  Science


Unnamed: 0,Student,Grade,Course
10,Kelly Fox,87,History
11,Liam Young,93,Math
12,Mia Hill,85,Science
13,Noah Clark,90,History
14,Olivia Allen,88,Math


In [4]:
# What is the data type of each column?
# info() prints the index range, per-column non-null counts, dtypes and memory usage.
stud.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Student  15 non-null     object
 1   Grade    15 non-null     int64 
 2   Course   15 non-null     object
dtypes: int64(1), object(2)
memory usage: 492.0+ bytes


In [5]:
# What are the summary statistics of the numerical columns (mean, std, min, max, etc.)?
# With default arguments describe() summarises only the numeric columns (here: Grade).
stud.describe()

Unnamed: 0,Grade
count,15.0
mean,87.4
std,5.220837
min,76.0
25%,85.0
50%,88.0
75%,90.5
max,95.0


In [6]:
# Are there any missing values in the dataset? If so, how many in each column?
# Flag every missing cell, then add the flags up column by column.
stud.isna().sum()

Student    0
Grade      0
Course     0
dtype: int64

### 2. Data Cleaning

In [7]:
# Are there any duplicate rows in the DataFrame?
# duplicated() marks each row that repeats an earlier one; summing the marks counts them.
stud.duplicated().sum()

# How would you handle incorrect or invalid data entries (e.g., incorrect dates or non-numeric values)?
# How can you convert a column into a different data type (e.g., from string to datetime)?
# How would you deal with duplicate rows in the DataFrame?

np.int64(0)

In [8]:
# How would you handle missing data? Should you drop, fill, or leave them as is?
# There are no missing values in the dataset, so nothing needs to be dropped or filled.

In [9]:
# - How would you handle incorrect or invalid data entries (e.g., incorrect dates or non-numeric values)?
# No incorrect data was found.
# - How can you convert a column into a different data type (e.g., from string to datetime)?
# With the help of astype() or pd.to_datetime().
# - How would you deal with duplicate rows in the DataFrame?
# Just use drop_duplicates().

### 3. Data Transformation

In [10]:
# How can you create new columns derived from existing ones (e.g., by performing arithmetic operations)?
# How can you apply a custom function to each value in a column?
def gradestatus(x):
    """Map a numeric grade to its result band.

    Bands:
      * 90 to 100 (inclusive)  -> 'Distinct'
      * 80 up to (not incl.) 90 -> 'First Class'
      * 40 up to (not incl.) 80 -> 'Pass'
      * anything else (below 40, above 100, NaN, non-numeric) -> 'Fail'

    Interval comparisons are used rather than ``x in range(...)`` because
    range membership only matches whole numbers, which would send a
    fractional grade such as 89.5 straight to 'Fail'.

    Parameters
    ----------
    x : int or float
        The grade to classify.

    Returns
    -------
    str
        One of 'Distinct', 'First Class', 'Pass' or 'Fail'.
    """
    try:
        if 90 <= x <= 100:
            return 'Distinct'
        if 80 <= x < 90:
            return 'First Class'
        if 40 <= x < 80:
            return 'Pass'
    except TypeError:
        # Non-numeric input (e.g. None or a string) cannot be ordered against
        # the thresholds; classify it as 'Fail' instead of raising.
        pass
    return 'Fail'
# Derive the GradeStatus column by running gradestatus over each Grade value.
stud['GradeStatus'] = stud['Grade'].apply(func=gradestatus)




In [11]:
# Show the frame, which now includes the GradeStatus column.
stud

Unnamed: 0,Student,Grade,Course,GradeStatus
0,Alice Johnson,90,Math,Distinct
1,Bob Smith,85,Science,First Class
2,Charlie Lee,88,History,First Class
3,Diana White,92,Math,Distinct
4,Ethan Brown,76,Science,Pass
5,Fiona Green,95,History,Distinct
6,George Davis,89,Math,First Class
7,Hannah Taylor,78,History,Pass
8,Ian Scott,84,Math,First Class
9,Jack Moore,91,Science,Distinct


In [12]:
# How can you filter rows based on a specific condition in a column (e.g., only rows where salary > 50000)?
# Build the boolean mask first, then keep only the rows where it is True.
scored_above_90 = stud['Grade'] > 90
stud.loc[scored_above_90]

Unnamed: 0,Student,Grade,Course,GradeStatus
3,Diana White,92,Math,Distinct
5,Fiona Green,95,History,Distinct
9,Jack Moore,91,Science,Distinct
11,Liam Young,93,Math,Distinct


In [13]:
# How can you replace specific values in a column (e.g., replacing 'NaN' with a default value)?
# With the help of fillna() (to replace NaN with a default value) or replace() (to swap specific values).

### 4. Data Aggregation

In [14]:
# How can you calculate summary statistics like sum, mean, or count for a specific column or group of columns?
# describe() reports count, mean, std, min, the quartiles and max in one call.
# It does not include the sum; stud['Grade'].sum() or
# stud['Grade'].agg(['sum', 'mean', 'count']) would provide that.
stud['Grade'].describe()



count    15.000000
mean     87.400000
std       5.220837
min      76.000000
25%      85.000000
50%      88.000000
75%      90.500000
max      95.000000
Name: Grade, dtype: float64

In [15]:
# How would you group the data by a particular column (e.g., by department) and calculate aggregated values (sum, mean)?
# One group per Course; total the Grade values inside each group.
stud.groupby('Course')['Grade'].agg('sum')

Course
History    438
Math       536
Science    337
Name: Grade, dtype: int64

### 5. Data Transformation
- Perform split and merge
- Pivot and unpivot columns 

In [16]:
# One (product, price, category) tuple per catalogue item.
_product_rows = [
    ("Laptop", 1200, "Electronics"),
    ("Phone", 800, "Electronics"),
    ("Tablet", 600, "Electronics"),
    ("Smartwatch", 250, "Electronics"),
    ("Headphones", 150, "Electronics"),
    ("Keyboard", 100, "Accessories"),
    ("Mouse", 30, "Accessories"),
    ("Monitor", 350, "Electronics"),
    ("Charger", 20, "Accessories"),
    ("Speakers", 120, "Accessories"),
    ("Webcam", 75, "Accessories"),
    ("Printer", 150, "Accessories"),
    ("Router", 200, "Electronics"),
    ("Hard Drive", 100, "Accessories"),
    ("Flash Drive", 25, "Accessories"),
]

# Unzip the rows into the three parallel column lists.
_names, _prices, _categories = (list(column) for column in zip(*_product_rows))

products = {
    "Product": _names,
    "Price": _prices,
    "Category": _categories,
}

# Tabular view of the catalogue used by the cells that follow.
product = pd.DataFrame(data=products)

In [17]:
# First five rows of the product data.
product.head(n=5)

Unnamed: 0,Product,Price,Category
0,Laptop,1200,Electronics
1,Phone,800,Electronics
2,Tablet,600,Electronics
3,Smartwatch,250,Electronics
4,Headphones,150,Electronics


In [18]:
# Last five rows of the product data.
product.tail(n=5)

Unnamed: 0,Product,Price,Category
10,Webcam,75,Accessories
11,Printer,150,Accessories
12,Router,200,Electronics
13,Hard Drive,100,Accessories
14,Flash Drive,25,Accessories


In [19]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric column(s).
product.describe()

Unnamed: 0,Price
count,15.0
mean,278.0
std,337.288558
min,20.0
25%,87.5
50%,150.0
75%,300.0
max,1200.0


In [20]:
# Column names, non-null counts, dtypes and memory usage of the product data.
product.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Product   15 non-null     object
 1   Price     15 non-null     int64 
 2   Category  15 non-null     object
dtypes: int64(1), object(2)
memory usage: 492.0+ bytes


In [21]:
# Number of missing values in each column of the product data.
product.isna().sum()

Product     0
Price       0
Category    0
dtype: int64

### 2. Data Cleaning

In [22]:
# Count the rows that exactly repeat an earlier row in the product data.
product.duplicated().sum()

np.int64(0)

In [23]:
# No Duplicates found
