In [1]:
# Import the necessary library
import pandas as pd

In [3]:
# 1. Read the CSV file using pandas.
file_path = 'C:/Users/Admin/Downloads/sample_data.csv'
df = pd.read_csv(file_path)
print("Initial DataFrame:")
print(df.head())

Initial DataFrame:
    ID     Name   Age Department   Salary           City
0  101    Alice  25.0         HR  50000.0       New York
1  102      Bob  30.0         IT  60000.0    Los Angeles
2  103  Charlie  35.0    Finance  70000.0        Chicago
3  104    David  40.0  Marketing  80000.0  San Francisco
4  105      Eve   NaN         IT  75000.0        Houston


# 2. Data Cleaning

In [5]:
# Check for missing values in each column.
print("\nMissing values per column:")
print(df.isnull().sum())


Missing values per column:
ID            0
Name          0
Age           1
Department    0
Salary        1
City          0
dtype: int64


In [7]:
# Remove rows with any missing values.
df_cleaned = df.dropna()
print("\nDataFrame after dropping missing values:")
print(df_cleaned.head())


DataFrame after dropping missing values:
    ID     Name   Age Department   Salary           City
0  101    Alice  25.0         HR  50000.0       New York
1  102      Bob  30.0         IT  60000.0    Los Angeles
2  103  Charlie  35.0    Finance  70000.0        Chicago
3  104    David  40.0  Marketing  80000.0  San Francisco
5  106    Frank  45.0    Finance  65000.0        Seattle


In [9]:
# Ensure data is in appropriate data types.
# (Attempt to convert each column to numeric if possible, ignoring errors)
for col in df_cleaned.columns:
    df_cleaned[col] = pd.to_numeric(df_cleaned[col], errors='ignore')

print("\nData types after conversion:")
print(df_cleaned.dtypes)


Data types after conversion:
ID              int64
Name           object
Age           float64
Department     object
Salary        float64
City           object
dtype: object


  df_cleaned[col] = pd.to_numeric(df_cleaned[col], errors='ignore')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned[col] = pd.to_numeric(df_cleaned[col], errors='ignore')


# 3. Data Manipulation

In [13]:
# For demonstration, we will filter and sort using the first numeric column found.
numeric_cols = df_cleaned.select_dtypes(include=['number']).columns.tolist()
print("\nNumeric columns available for filtering/sorting:", numeric_cols)

if numeric_cols:
    # Select the first numeric column for our operations.
    filter_col = numeric_cols[0]
    
    # Example filtering: keep rows where the chosen numeric column's value is greater than its median.
    median_val = df_cleaned[filter_col].median()
    filtered_df = df_cleaned[df_cleaned[filter_col] > median_val]
    print(f"\nFiltered DataFrame (rows where '{filter_col}' > {median_val}):")
    print(filtered_df.head())
    
    # Sorting: sort the filtered data in descending order based on the same column.
    sorted_df = filtered_df.sort_values(by=filter_col, ascending=False)
    print(f"\nSorted DataFrame by '{filter_col}' in descending order:")
    print(sorted_df.head())
else:
    print("\nNo numeric columns available for filtering or sorting.")


Numeric columns available for filtering/sorting: ['ID', 'Age', 'Salary']

Filtered DataFrame (rows where 'ID' > 105.0):
    ID   Name   Age Department   Salary     City
5  106  Frank  45.0    Finance  65000.0  Seattle
6  107  Grace  50.0         HR  72000.0   Boston
7  108   Hank  28.0  Marketing  55000.0   Denver
8  109    Ivy  33.0         IT  62000.0    Miami

Sorted DataFrame by 'ID' in descending order:
    ID   Name   Age Department   Salary     City
8  109    Ivy  33.0         IT  62000.0    Miami
7  108   Hank  28.0  Marketing  55000.0   Denver
6  107  Grace  50.0         HR  72000.0   Boston
5  106  Frank  45.0    Finance  65000.0  Seattle
