**Title**: Understanding the Dataset <br>

**Task**: For the given datasets below, identify the data types and dimensions.<br>

Task 1: Employee Dataset<br>
Columns: Employee ID , Name , Age , Department , Salary , Joining Date<br>
Task 2: Product Sales Dataset<br>
Columns: Product ID , Product Name , Price , Quantity Sold , Category , Sales Date<br>
Task 3: Student Grades Dataset<br>
Columns: Student ID , Student Name , Math Score , Science Score , English Score , Year<br>

Instructions:<br>

Identify which columns are numerical (continuous or discrete) and which are categorical
(nominal or ordinal).<br>
Note down the dimensions (number of rows and columns) of the dataset.

In [None]:
import pandas as pd

# Placeholder frames: each one carries only its column schema and has zero
# rows. Replace these with real data loading once the datasets are available.
employee_df = pd.DataFrame(
    columns=[
        'Employee ID',
        'Name',
        'Age',
        'Department',
        'Salary',
        'Joining Date',
    ],
)
product_df = pd.DataFrame(
    columns=[
        'Product ID',
        'Product Name',
        'Price',
        'Quantity Sold',
        'Category',
        'Sales Date',
    ],
)
student_df = pd.DataFrame(
    columns=[
        'Student ID',
        'Student Name',
        'Math Score',
        'Science Score',
        'English Score',
        'Year',
    ],
)

# Hand-curated classification for the three exercise datasets, stored as
# (numerical columns, categorical columns). Identifier and date columns are
# deliberately listed as categorical even though they may look numeric.
_KNOWN_COLUMN_KINDS = {
    "Employee Dataset": (
        ('Age', 'Salary'),
        ('Employee ID', 'Name', 'Department', 'Joining Date'),
    ),
    "Product Sales Dataset": (
        ('Price', 'Quantity Sold'),
        ('Product ID', 'Product Name', 'Category', 'Sales Date'),
    ),
    "Student Grades Dataset": (
        ('Math Score', 'Science Score', 'English Score', 'Year'),
        ('Student ID', 'Student Name'),
    ),
}


def analyze_dataset(df, dataset_name):
    """Print the dimensions of ``df`` and its numerical/categorical columns.

    Columns are classified in two steps:

    1. If ``dataset_name`` is one of the curated datasets, columns listed
       there keep their curated classification.
    2. Any other column (including every column of an unrecognized dataset)
       is classified from its dtype: numeric dtypes are numerical, the rest
       categorical.

    Only columns that actually exist in ``df`` are reported, in the order
    they appear in ``df.columns``.

    Args:
        df: The pandas DataFrame to describe.
        dataset_name: Human-readable dataset label; also the lookup key for
            the curated classification.

    Returns:
        None. The report is written to standard output.
    """
    print(f"--- {dataset_name} ---")
    print("Dimensions (rows, columns):", df.shape)

    known_numerical, known_categorical = _KNOWN_COLUMN_KINDS.get(
        dataset_name, ((), ())
    )
    # Fallback for columns without a curated entry: trust the loaded dtype.
    numeric_by_dtype = set(df.select_dtypes(include="number").columns)

    numerical = []
    categorical = []
    for column in df.columns:
        if column in known_numerical:
            is_numerical = True
        elif column in known_categorical:
            is_numerical = False
        else:
            is_numerical = column in numeric_by_dtype
        (numerical if is_numerical else categorical).append(column)

    print("Numerical columns (continuous/discrete):", numerical)
    print("Categorical columns (nominal/ordinal):", categorical)
    print("\n")

# Report dimensions and column kinds for every dataset, one after another.
for frame, label in (
    (employee_df, "Employee Dataset"),
    (product_df, "Product Sales Dataset"),
    (student_df, "Student Grades Dataset"),
):
    analyze_dataset(frame, label)


**Title**: Checking for Missing Values<br>

**Task**: Identify and count the number of missing values in each dataset.<br>

Instructions:<br>
Use Python or any data manipulation tool to check for missing values in each column of the datasets. <br>Report the columns that have missing values, along with their counts.

In [None]:
import pandas as pd

# Small illustrative frames with deliberately missing entries (None) so the
# missing-value check has something to report. Replace with real data loading.
employee_df = pd.DataFrame.from_dict(
    {
        'Employee ID': [1, 2, 3, None],
        'Name': ['Alice', 'Bob', None, 'David'],
        'Age': [25, 30, None, 45],
        'Department': ['HR', None, 'IT', 'Finance'],
        'Salary': [50000, 60000, 55000, None],
        'Joining Date': ['2020-01-01', '2019-05-15', '2021-07-20', '2018-03-10'],
    },
    orient='columns',
)

product_df = pd.DataFrame.from_dict(
    {
        'Product ID': [101, 102, 103, 104],
        'Product Name': ['Pen', 'Notebook', None, 'Marker'],
        'Price': [1.5, 3.0, 2.5, None],
        'Quantity Sold': [100, None, 200, 150],
        'Category': ['Stationery', 'Stationery', 'Stationery', None],
        'Sales Date': ['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04'],
    },
    orient='columns',
)

student_df = pd.DataFrame.from_dict(
    {
        'Student ID': [1001, 1002, 1003, 1004],
        'Student Name': ['John', 'Jane', None, 'Doe'],
        'Math Score': [88, 92, 85, None],
        'Science Score': [90, None, 88, 95],
        'English Score': [85, 87, None, 93],
        'Year': [2022, 2022, 2023, 2023],
    },
    orient='columns',
)

def check_missing_values(df, name):
    """Print the per-column count of missing values in ``df``.

    Only columns with at least one missing entry are listed. When no column
    has missing data, a single "No missing values found." line is printed.

    Args:
        df: The pandas DataFrame to inspect.
        name: Dataset label used in the printed header.

    Returns:
        None. The report is written to standard output.
    """
    print(f"--- Missing values in {name} ---")

    null_counts = df.isna().sum()
    # Keep just the columns that actually contain gaps.
    affected = null_counts.loc[null_counts.gt(0)]

    if not affected.empty:
        print(affected, "\n")
        return
    print("No missing values found.\n")

# Run the missing-value report on each dataset in turn.
for frame, label in (
    (employee_df, "Employee Dataset"),
    (product_df, "Product Sales Dataset"),
    (student_df, "Student Grades Dataset"),
):
    check_missing_values(frame, label)


**Title**: Handling Outliers<br>

**Task**: Detect and propose handling methods for outliers in the numerical columns of the datasets.<br>

Task 1: Age in Employee Dataset<br>
Task 2: Price in Product Sales Dataset<br>
Task 3: Math Score in Student Grades Dataset<br>

Instructions:<br>

Use box plots to visualize potential outliers.<br>
Suggest methods to handle them, such as removal or transformation.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Single-column sample frames, each seeded with one extreme value so the
# outlier detection has something to find. Replace with your actual datasets.
employee_df = pd.Series(
    [22, 25, 27, 120, 30, 28, 26, 24, 29], name='Age'
).to_frame()
product_df = pd.Series(
    [10, 15, 20, 1000, 18, 22, 17, 19, 16], name='Price'
).to_frame()
student_df = pd.Series(
    [70, 75, 85, 90, 95, 200, 88, 82, 78], name='Math Score'
).to_frame()

# Function to plot boxplot and detect outliers
def plot_outliers(df, column, dataset_name):
    """Show a box plot of one column and print its IQR-based outliers.

    A value is reported as an outlier when it lies more than 1.5 times the
    interquartile range below the first quartile or above the third quartile.

    Args:
        df: The pandas DataFrame holding the data.
        column: Name of the numerical column to examine.
        dataset_name: Dataset label used in the plot title and printed header.

    Returns:
        None. The figure is shown and the outlier values are printed.
    """
    values = df[column]

    plt.figure(figsize=(6, 4))
    sns.boxplot(x=values)
    plt.title(f'Boxplot of {column} in {dataset_name}')
    plt.show()

    # Fences of the 1.5 * IQR rule.
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile
    fence_low = first_quartile - 1.5 * spread
    fence_high = third_quartile + 1.5 * spread

    beyond_fences = values.lt(fence_low) | values.gt(fence_high)
    flagged = values[beyond_fences]

    print(f"Outliers detected in {column} of {dataset_name}:")
    print(flagged.values)
    print()

# One box plot plus outlier listing per task: (frame, column, dataset label).
for frame, column_name, label in (
    (employee_df, 'Age', 'Employee Dataset'),
    (product_df, 'Price', 'Product Sales Dataset'),
    (student_df, 'Math Score', 'Student Grades Dataset'),
):
    plot_outliers(frame, column_name, label)


**Title**: Visualizing Data Distributions<br>

**Task**: Create visualizations for data distributions.<br>

Task 1: Histogram for Age in Employee Dataset<br>
Task 2: Distribution plot for Price in Product Sales Dataset<br>
Task 3: Histogram for Math Score in Student Grades Dataset

Instructions:<br>

Use matplotlib or seaborn in Python to create the plots.<br>
Comment on the skewness or normality of the distributions.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Single-column sample frames for the distribution plots; each contains one
# extreme value (120, 1000, 200). Replace with your real datasets.
employee_df = pd.Series(
    [22, 25, 27, 30, 28, 26, 24, 29, 35, 40, 120], name='Age'
).to_frame()
product_df = pd.Series(
    [10, 15, 20, 18, 22, 17, 19, 16, 1000], name='Price'
).to_frame()
student_df = pd.Series(
    [70, 75, 85, 90, 95, 88, 82, 78, 80, 85, 200], name='Math Score'
).to_frame()

# One entry per task:
# (values, plot title, x-axis label, printed heading, printed remarks).
# The remarks are hand-written observations about the sample data, not
# statistics computed from it.
distribution_specs = (
    # Task 1: Histogram for Age in Employee Dataset
    (
        employee_df['Age'],
        'Histogram of Age in Employee Dataset',
        'Age',
        "Age Distribution:",
        (
            "- The distribution shows a right skew, likely due to the extreme age value (120).",
            "- Without the outlier, the data appears approximately normally distributed.\n",
        ),
    ),
    # Task 2: Distribution plot for Price in Product Sales Dataset
    (
        product_df['Price'],
        'Distribution Plot of Price in Product Sales Dataset',
        'Price',
        "Price Distribution:",
        (
            "- The distribution is heavily right skewed due to a large outlier price (1000).",
            "- Most prices are clustered in the lower range.\n",
        ),
    ),
    # Task 3: Histogram for Math Score in Student Grades Dataset
    (
        student_df['Math Score'],
        'Histogram of Math Score in Student Grades Dataset',
        'Math Score',
        "Math Score Distribution:",
        (
            "- The distribution is roughly normal but with a strong right skew caused by the outlier (200).",
            "- Removing or handling the outlier would improve normality.\n",
        ),
    ),
)

for values, plot_title, axis_label, heading, remarks in distribution_specs:
    # Histogram with a KDE curve overlaid.
    plt.figure(figsize=(8, 4))
    sns.histplot(values, bins=10, kde=True)
    plt.title(plot_title)
    plt.xlabel(axis_label)
    plt.ylabel('Frequency')
    plt.show()

    # Comment on skewness/normality.
    print(heading)
    for remark in remarks:
        print(remark)


**Title**: Finding Relationships Between Features<br>

**Task**: Identify relationships between pairs of features in the datasets.<br>

Task 1: Salary vs Age in Employee Dataset<br>
Task 2: Price vs Quantity Sold in Product Sales Dataset<br>
Task 3: Math Score vs Science Score in Student Grades Dataset

Instructions: <br>

Use scatter plots or correlation coefficients to analyze the relationships.<br>
Describe any insights or patterns observed.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Paired sample observations, written one row per observation so that each
# (x, y) pair reads together. Replace with your actual datasets.
employee_df = pd.DataFrame(
    [
        (22, 40000), (25, 45000), (27, 48000), (30, 52000), (28, 50000),
        (26, 47000), (24, 43000), (29, 49000), (35, 60000), (40, 65000),
    ],
    columns=['Age', 'Salary'],
)

product_df = pd.DataFrame(
    [
        (10, 100), (15, 80), (20, 70), (18, 90), (22, 65),
        (17, 85), (19, 75), (16, 95), (25, 60), (30, 55),
    ],
    columns=['Price', 'Quantity Sold'],
)

student_df = pd.DataFrame(
    [
        (70, 68), (75, 72), (85, 84), (90, 88), (95, 93),
        (88, 85), (82, 80), (78, 77), (80, 79), (85, 83),
    ],
    columns=['Math Score', 'Science Score'],
)

def analyze_relationship(df, x_col, y_col, dataset_name):
    """Show a scatter plot of two columns and print their correlation.

    The correlation is taken from ``DataFrame.corr()`` with its default
    settings and printed rounded to two decimal places.

    Args:
        df: The pandas DataFrame holding both columns.
        x_col: Column plotted on the x-axis.
        y_col: Column plotted on the y-axis.
        dataset_name: Dataset label used in the plot title and printed line.

    Returns:
        None. The figure is shown and the coefficient is printed.
    """
    figure_title = f'Scatter Plot: {x_col} vs {y_col} in {dataset_name}'

    plt.figure(figsize=(6, 4))
    sns.scatterplot(data=df, x=x_col, y=y_col)
    plt.title(figure_title)
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.show()

    # 2x2 correlation matrix of the pair; the off-diagonal cell is the
    # coefficient between x_col and y_col.
    pair_matrix = df[[x_col, y_col]].corr()
    corr = pair_matrix.iat[0, 1]
    print(f"Correlation coefficient between {x_col} and {y_col} in {dataset_name}: {corr:.2f}\n")

# One scatter plot plus correlation per task: (frame, x column, y column, label).
relationship_tasks = (
    # Task 1: Salary vs Age (Employee Dataset)
    (employee_df, 'Age', 'Salary', 'Employee Dataset'),
    # Task 2: Price vs Quantity Sold (Product Sales Dataset)
    (product_df, 'Price', 'Quantity Sold', 'Product Sales Dataset'),
    # Task 3: Math Score vs Science Score (Student Grades Dataset)
    (student_df, 'Math Score', 'Science Score', 'Student Grades Dataset'),
)

for frame, x_name, y_name, label in relationship_tasks:
    analyze_relationship(frame, x_name, y_name, label)
